diff options
Diffstat (limited to 'lib')
137 files changed, 7025 insertions, 2902 deletions
diff --git a/lib/Makefile b/lib/Makefile index f254dba8..ca7c02fd 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -57,6 +57,7 @@ DIRS-$(CONFIG_RTE_LIBRTE_PORT) += librte_port DIRS-$(CONFIG_RTE_LIBRTE_TABLE) += librte_table DIRS-$(CONFIG_RTE_LIBRTE_PIPELINE) += librte_pipeline DIRS-$(CONFIG_RTE_LIBRTE_REORDER) += librte_reorder +DIRS-$(CONFIG_RTE_LIBRTE_PDUMP) += librte_pdump ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP),y) DIRS-$(CONFIG_RTE_LIBRTE_KNI) += librte_kni diff --git a/lib/librte_acl/Makefile b/lib/librte_acl/Makefile index 2e394c97..9803e9dd 100644 --- a/lib/librte_acl/Makefile +++ b/lib/librte_acl/Makefile @@ -73,7 +73,7 @@ else $(shell $(CC) -march=core-avx2 -dM -E - </dev/null 2>&1 | \ grep -q AVX2 && echo 1) ifeq ($(CC_AVX2_SUPPORT), 1) - ifeq ($(CC), icc) + ifeq ($(CONFIG_RTE_TOOLCHAIN_ICC),y) CFLAGS_acl_run_avx2.o += -march=core-avx2 else CFLAGS_acl_run_avx2.o += -mavx2 diff --git a/lib/librte_cfgfile/rte_cfgfile.c b/lib/librte_cfgfile/rte_cfgfile.c index 75625a28..d72052a0 100644 --- a/lib/librte_cfgfile/rte_cfgfile.c +++ b/lib/librte_cfgfile/rte_cfgfile.c @@ -232,6 +232,7 @@ rte_cfgfile_load(const char *filename, int flags) return cfg; error1: + cfg->num_sections = curr_section + 1; rte_cfgfile_close(cfg); error2: fclose(f); diff --git a/lib/librte_cfgfile/rte_cfgfile.h b/lib/librte_cfgfile/rte_cfgfile.h index 834f8287..f649836c 100644 --- a/lib/librte_cfgfile/rte_cfgfile.h +++ b/lib/librte_cfgfile/rte_cfgfile.h @@ -72,7 +72,7 @@ struct rte_cfgfile_entry { * @param flags * Config file flags, Reserved for future use. Must be set to 0. 
* @return -* Handle to configuration file +* Handle to configuration file on success, NULL otherwise */ struct rte_cfgfile *rte_cfgfile_load(const char *filename, int flags); diff --git a/lib/librte_cmdline/cmdline.c b/lib/librte_cmdline/cmdline.c index c405878e..a9c47be3 100644 --- a/lib/librte_cmdline/cmdline.c +++ b/lib/librte_cmdline/cmdline.c @@ -130,6 +130,7 @@ struct cmdline * cmdline_new(cmdline_parse_ctx_t *ctx, const char *prompt, int s_in, int s_out) { struct cmdline *cl; + int ret; if (!ctx || !prompt) return NULL; @@ -142,8 +143,13 @@ cmdline_new(cmdline_parse_ctx_t *ctx, const char *prompt, int s_in, int s_out) cl->s_out = s_out; cl->ctx = ctx; - rdline_init(&cl->rdl, cmdline_write_char, - cmdline_valid_buffer, cmdline_complete_buffer); + ret = rdline_init(&cl->rdl, cmdline_write_char, cmdline_valid_buffer, + cmdline_complete_buffer); + if (ret != 0) { + free(cl); + return NULL; + } + cl->rdl.opaque = cl; cmdline_set_prompt(cl, prompt); rdline_newline(&cl->rdl, cl->prompt); diff --git a/lib/librte_cmdline/cmdline_parse.c b/lib/librte_cmdline/cmdline_parse.c index 24a6ed67..b496067a 100644 --- a/lib/librte_cmdline/cmdline_parse.c +++ b/lib/librte_cmdline/cmdline_parse.c @@ -118,6 +118,14 @@ cmdline_isendoftoken(char c) return 0; } +int +cmdline_isendofcommand(char c) +{ + if (!c || iscomment(c) || isendofline(c)) + return 1; + return 0; +} + static unsigned int nb_common_chars(const char * s1, const char * s2) { diff --git a/lib/librte_cmdline/cmdline_parse.h b/lib/librte_cmdline/cmdline_parse.h index 4b25c456..4ac05d6b 100644 --- a/lib/librte_cmdline/cmdline_parse.h +++ b/lib/librte_cmdline/cmdline_parse.h @@ -184,6 +184,9 @@ int cmdline_complete(struct cmdline *cl, const char *buf, int *state, * isendofline(c)) */ int cmdline_isendoftoken(char c); +/* return true if(!c || iscomment(c) || isendofline(c)) */ +int cmdline_isendofcommand(char c); + #ifdef __cplusplus } #endif diff --git a/lib/librte_cmdline/cmdline_parse_string.c 
b/lib/librte_cmdline/cmdline_parse_string.c index 45883b3e..35917a7b 100644 --- a/lib/librte_cmdline/cmdline_parse_string.c +++ b/lib/librte_cmdline/cmdline_parse_string.c @@ -76,9 +76,10 @@ struct cmdline_token_ops cmdline_token_string_ops = { .get_help = cmdline_get_help_string, }; -#define MULTISTRING_HELP "Mul-choice STRING" -#define ANYSTRING_HELP "Any STRING" -#define FIXEDSTRING_HELP "Fixed STRING" +#define CHOICESTRING_HELP "Mul-choice STRING" +#define ANYSTRING_HELP "Any STRING" +#define ANYSTRINGS_HELP "Any STRINGS" +#define FIXEDSTRING_HELP "Fixed STRING" static unsigned int get_token_len(const char *s) @@ -123,8 +124,8 @@ cmdline_parse_string(cmdline_parse_token_hdr_t *tk, const char *buf, void *res, sd = &tk2->string_data; - /* fixed string */ - if (sd->str) { + /* fixed string (known single token) */ + if ((sd->str != NULL) && (strcmp(sd->str, TOKEN_STRING_MULTI) != 0)) { str = sd->str; do { token_len = get_token_len(str); @@ -148,7 +149,21 @@ cmdline_parse_string(cmdline_parse_token_hdr_t *tk, const char *buf, void *res, if (!str) return -1; } - /* unspecified string */ + /* multi string */ + else if (sd->str != NULL) { + if (ressize < STR_MULTI_TOKEN_SIZE) + return -1; + + token_len = 0; + while (!cmdline_isendofcommand(buf[token_len]) && + token_len < (STR_MULTI_TOKEN_SIZE - 1)) + token_len++; + + /* return if token too long */ + if (token_len >= (STR_MULTI_TOKEN_SIZE - 1)) + return -1; + } + /* unspecified string (unknown single token) */ else { token_len = 0; while(!cmdline_isendoftoken(buf[token_len]) && @@ -162,12 +177,16 @@ cmdline_parse_string(cmdline_parse_token_hdr_t *tk, const char *buf, void *res, } if (res) { - /* we are sure that token_len is < STR_TOKEN_SIZE-1 */ - snprintf(res, STR_TOKEN_SIZE, "%s", buf); + if ((sd->str != NULL) && (strcmp(sd->str, TOKEN_STRING_MULTI) == 0)) + /* we are sure that token_len is < STR_MULTI_TOKEN_SIZE-1 */ + snprintf(res, STR_MULTI_TOKEN_SIZE, "%s", buf); + else + /* we are sure that token_len is < 
STR_TOKEN_SIZE-1 */ + snprintf(res, STR_TOKEN_SIZE, "%s", buf); + *((char *)res + token_len) = 0; } - return token_len; } @@ -242,8 +261,10 @@ int cmdline_get_help_string(cmdline_parse_token_hdr_t *tk, char *dstbuf, s = sd->str; if (s) { - if (get_next_token(s)) - snprintf(dstbuf, size, MULTISTRING_HELP); + if (strcmp(s, TOKEN_STRING_MULTI) == 0) + snprintf(dstbuf, size, ANYSTRINGS_HELP); + else if (get_next_token(s)) + snprintf(dstbuf, size, CHOICESTRING_HELP); else snprintf(dstbuf, size, FIXEDSTRING_HELP); } else diff --git a/lib/librte_cmdline/cmdline_parse_string.h b/lib/librte_cmdline/cmdline_parse_string.h index 94aa1f1b..a84291b0 100644 --- a/lib/librte_cmdline/cmdline_parse_string.h +++ b/lib/librte_cmdline/cmdline_parse_string.h @@ -70,8 +70,13 @@ extern "C" { /* size of a parsed string */ #define STR_TOKEN_SIZE 128 +/* size of a parsed multi string */ +#define STR_MULTI_TOKEN_SIZE 4096 + typedef char cmdline_fixed_string_t[STR_TOKEN_SIZE]; +typedef char cmdline_multi_string_t[STR_MULTI_TOKEN_SIZE]; + struct cmdline_token_string_data { const char *str; }; @@ -92,6 +97,16 @@ int cmdline_complete_get_elt_string(cmdline_parse_token_hdr_t *tk, int idx, int cmdline_get_help_string(cmdline_parse_token_hdr_t *tk, char *dstbuf, unsigned int size); +/** +* Token marked as TOKEN_STRING_MULTI takes entire parsing string +* until “#” sign appear. Everything after “#” sign is treated as a comment. +* +* Note: +* In this case second parameter of TOKEN_STRING_INITIALIZER must be a type of +* cmdline_multi_string_t. 
+*/ +#define TOKEN_STRING_MULTI "" + #define TOKEN_STRING_INITIALIZER(structure, field, string) \ { \ /* hdr */ \ diff --git a/lib/librte_cmdline/rte_cmdline_version.map b/lib/librte_cmdline/rte_cmdline_version.map index c9fc18ab..04bcb387 100644 --- a/lib/librte_cmdline/rte_cmdline_version.map +++ b/lib/librte_cmdline/rte_cmdline_version.map @@ -50,7 +50,6 @@ DPDK_2.0 { cmdline_token_num_ops; cmdline_token_portlist_ops; cmdline_token_string_ops; - cmdline_token_string_ops; cmdline_write_char; rdline_add_history; rdline_char_in; diff --git a/lib/librte_cryptodev/rte_crypto_sym.h b/lib/librte_cryptodev/rte_crypto_sym.h index 4ae9b9e8..d9bd8210 100644 --- a/lib/librte_cryptodev/rte_crypto_sym.h +++ b/lib/librte_cryptodev/rte_crypto_sym.h @@ -388,7 +388,8 @@ struct rte_crypto_sym_op { * this location. * * @note - * For Snow3G @ RTE_CRYPTO_CIPHER_SNOW3G_UEA2, + * For Snow3G @ RTE_CRYPTO_CIPHER_SNOW3G_UEA2 + * and KASUMI @ RTE_CRYPTO_CIPHER_KASUMI_F8, * this field should be in bits. */ @@ -413,6 +414,7 @@ struct rte_crypto_sym_op { * * @note * For Snow3G @ RTE_CRYPTO_AUTH_SNOW3G_UEA2 + * and KASUMI @ RTE_CRYPTO_CIPHER_KASUMI_F8, * this field should be in bits. */ } data; /**< Data offsets and length for ciphering */ @@ -485,6 +487,7 @@ struct rte_crypto_sym_op { * * @note * For Snow3G @ RTE_CRYPTO_AUTH_SNOW3G_UIA2 + * and KASUMI @ RTE_CRYPTO_AUTH_KASUMI_F9, * this field should be in bits. */ @@ -504,6 +507,7 @@ struct rte_crypto_sym_op { * * @note * For Snow3G @ RTE_CRYPTO_AUTH_SNOW3G_UIA2 + * and KASUMI @ RTE_CRYPTO_AUTH_KASUMI_F9, * this field should be in bits. 
*/ } data; /**< Data offsets and length for authentication */ diff --git a/lib/librte_cryptodev/rte_cryptodev.c b/lib/librte_cryptodev/rte_cryptodev.c index aa4ea425..20e5beb8 100644 --- a/lib/librte_cryptodev/rte_cryptodev.c +++ b/lib/librte_cryptodev/rte_cryptodev.c @@ -102,6 +102,97 @@ struct rte_cryptodev_callback { uint32_t active; /**< Callback is executing */ }; +#define RTE_CRYPTODEV_VDEV_MAX_NB_QP_ARG ("max_nb_queue_pairs") +#define RTE_CRYPTODEV_VDEV_MAX_NB_SESS_ARG ("max_nb_sessions") +#define RTE_CRYPTODEV_VDEV_SOCKET_ID ("socket_id") + +static const char *cryptodev_vdev_valid_params[] = { + RTE_CRYPTODEV_VDEV_MAX_NB_QP_ARG, + RTE_CRYPTODEV_VDEV_MAX_NB_SESS_ARG, + RTE_CRYPTODEV_VDEV_SOCKET_ID +}; + +static uint8_t +number_of_sockets(void) +{ + int sockets = 0; + int i; + const struct rte_memseg *ms = rte_eal_get_physmem_layout(); + + for (i = 0; ((i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL)); i++) { + if (sockets < ms[i].socket_id) + sockets = ms[i].socket_id; + } + + /* Number of sockets = maximum socket_id + 1 */ + return ++sockets; +} + +/** Parse integer from integer argument */ +static int +parse_integer_arg(const char *key __rte_unused, + const char *value, void *extra_args) +{ + int *i = (int *) extra_args; + + *i = atoi(value); + if (*i < 0) { + CDEV_LOG_ERR("Argument has to be positive."); + return -1; + } + + return 0; +} + +int +rte_cryptodev_parse_vdev_init_params(struct rte_crypto_vdev_init_params *params, + const char *input_args) +{ + struct rte_kvargs *kvlist; + int ret; + + if (params == NULL) + return -EINVAL; + + if (input_args) { + kvlist = rte_kvargs_parse(input_args, + cryptodev_vdev_valid_params); + if (kvlist == NULL) + return -1; + + ret = rte_kvargs_process(kvlist, + RTE_CRYPTODEV_VDEV_MAX_NB_QP_ARG, + &parse_integer_arg, + &params->max_nb_queue_pairs); + if (ret < 0) + goto free_kvlist; + + ret = rte_kvargs_process(kvlist, + RTE_CRYPTODEV_VDEV_MAX_NB_SESS_ARG, + &parse_integer_arg, + &params->max_nb_sessions); + if (ret < 0) + goto
free_kvlist; + + ret = rte_kvargs_process(kvlist, RTE_CRYPTODEV_VDEV_SOCKET_ID, + &parse_integer_arg, + &params->socket_id); + if (ret < 0) + goto free_kvlist; + + if (params->socket_id >= number_of_sockets()) { + CDEV_LOG_ERR("Invalid socket id specified to create " + "the virtual crypto device on"); + goto free_kvlist; + } + } + + return 0; + +free_kvlist: + rte_kvargs_free(kvlist); + return ret; +} const char * rte_cryptodev_get_feature_name(uint64_t flag) @@ -956,7 +1047,7 @@ rte_cryptodev_sym_session_init(struct rte_mempool *mp, sess->mp = mp; if (dev->dev_ops->session_initialize) - (*dev->dev_ops->session_initialize)(mp, sess->_private); + (*dev->dev_ops->session_initialize)(mp, sess); } static int diff --git a/lib/librte_cryptodev/rte_cryptodev.h b/lib/librte_cryptodev/rte_cryptodev.h index d47f1e87..7768f0ae 100644 --- a/lib/librte_cryptodev/rte_cryptodev.h +++ b/lib/librte_cryptodev/rte_cryptodev.h @@ -59,12 +59,15 @@ extern "C" { /**< Intel QAT Symmetric Crypto PMD device name */ #define CRYPTODEV_NAME_SNOW3G_PMD ("cryptodev_snow3g_pmd") /**< SNOW 3G PMD device name */ +#define CRYPTODEV_NAME_KASUMI_PMD ("cryptodev_kasumi_pmd") +/**< KASUMI PMD device name */ /** Crypto device type */ enum rte_cryptodev_type { RTE_CRYPTODEV_NULL_PMD = 1, /**< Null crypto PMD */ RTE_CRYPTODEV_AESNI_GCM_PMD, /**< AES-NI GCM PMD */ RTE_CRYPTODEV_AESNI_MB_PMD, /**< AES-NI multi buffer PMD */ + RTE_CRYPTODEV_KASUMI_PMD, /**< KASUMI PMD */ RTE_CRYPTODEV_QAT_SYM_PMD, /**< QAT PMD Symmetric Crypto */ RTE_CRYPTODEV_SNOW3G_PMD, /**< SNOW 3G PMD */ }; @@ -297,48 +300,6 @@ struct rte_crypto_vdev_init_params { uint8_t socket_id; }; -#define RTE_CRYPTODEV_VDEV_MAX_NB_QP_ARG ("max_nb_queue_pairs") -#define RTE_CRYPTODEV_VDEV_MAX_NB_SESS_ARG ("max_nb_sessions") -#define RTE_CRYPTODEV_VDEV_SOCKET_ID ("socket_id") - -static const char *cryptodev_vdev_valid_params[] = { - RTE_CRYPTODEV_VDEV_MAX_NB_QP_ARG, - RTE_CRYPTODEV_VDEV_MAX_NB_SESS_ARG, - RTE_CRYPTODEV_VDEV_SOCKET_ID -}; - -static inline
uint8_t -number_of_sockets(void) -{ - int sockets = 0; - int i; - const struct rte_memseg *ms = rte_eal_get_physmem_layout(); - - for (i = 0; ((i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL)); i++) { - if (sockets < ms[i].socket_id) - sockets = ms[i].socket_id; - } - - /* Number of sockets = maximum socket_id + 1 */ - return ++sockets; -} - -/** Parse integer from integer argument */ -static inline int -__rte_cryptodev_parse_integer_arg(const char *key __rte_unused, - const char *value, void *extra_args) -{ - int *i = (int *) extra_args; - - *i = atoi(value); - if (*i < 0) { - CDEV_LOG_ERR("Argument has to be positive."); - return -1; - } - - return 0; -} - /** * Parse virtual device initialisation parameters input arguments * @internal @@ -350,55 +311,10 @@ __rte_cryptodev_parse_integer_arg(const char *key __rte_unused, * 0 on successful parse * <0 on failure to parse */ -static inline int -rte_cryptodev_parse_vdev_init_params(struct rte_crypto_vdev_init_params *params, - const char *input_args) -{ - struct rte_kvargs *kvlist; - int ret; - - if (params == NULL) - return -EINVAL; - - if (input_args) { - kvlist = rte_kvargs_parse(input_args, - cryptodev_vdev_valid_params); - if (kvlist == NULL) - return -1; - - ret = rte_kvargs_process(kvlist, - RTE_CRYPTODEV_VDEV_MAX_NB_QP_ARG, - &__rte_cryptodev_parse_integer_arg, - &params->max_nb_queue_pairs); - if (ret < 0) - goto free_kvlist; - - ret = rte_kvargs_process(kvlist, - RTE_CRYPTODEV_VDEV_MAX_NB_SESS_ARG, - &__rte_cryptodev_parse_integer_arg, - &params->max_nb_sessions); - if (ret < 0) - goto free_kvlist; - - ret = rte_kvargs_process(kvlist, RTE_CRYPTODEV_VDEV_SOCKET_ID, - &__rte_cryptodev_parse_integer_arg, - &params->socket_id); - if (ret < 0) - goto free_kvlist; - - if (params->socket_id >= number_of_sockets()) { - CDEV_LOG_ERR("Invalid socket id specified to create " - "the virtual crypto device on"); - goto free_kvlist; - } - } - - return 0; - -free_kvlist: - rte_kvargs_free(kvlist); - return ret; -} +int
+rte_cryptodev_parse_vdev_init_params( + struct rte_crypto_vdev_init_params *params, + const char *input_args); /** * Create a virtual crypto device diff --git a/lib/librte_cryptodev/rte_cryptodev_version.map b/lib/librte_cryptodev/rte_cryptodev_version.map index 41004e1c..a08fd202 100644 --- a/lib/librte_cryptodev/rte_cryptodev_version.map +++ b/lib/librte_cryptodev/rte_cryptodev_version.map @@ -32,3 +32,10 @@ DPDK_16.04 { local: *; }; + +DPDK_16.07 { + global: + + rte_cryptodev_parse_vdev_init_params; + +} DPDK_16.04; diff --git a/lib/librte_eal/bsdapp/eal/Makefile b/lib/librte_eal/bsdapp/eal/Makefile index 9054ad61..698fa0a1 100644 --- a/lib/librte_eal/bsdapp/eal/Makefile +++ b/lib/librte_eal/bsdapp/eal/Makefile @@ -40,8 +40,6 @@ VPATH += $(RTE_SDK)/lib/librte_eal/common/arch/$(ARCH_DIR) CFLAGS += -I$(SRCDIR)/include CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common/include -CFLAGS += -I$(RTE_SDK)/lib/librte_ring -CFLAGS += -I$(RTE_SDK)/lib/librte_mempool CFLAGS += $(WERROR_FLAGS) -O3 LDLIBS += -lexecinfo diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c index 06bfd4e0..a0c8f8c8 100644 --- a/lib/librte_eal/bsdapp/eal/eal.c +++ b/lib/librte_eal/bsdapp/eal/eal.c @@ -605,7 +605,7 @@ rte_eal_init(int argc, char **argv) /* Set thread_name for aid in debugging. */ snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "lcore-slave-%d", i); - pthread_set_name_np(lcore_config[i].thread_id, thread_name); + rte_thread_setname(lcore_config[i].thread_id, thread_name); } /* diff --git a/lib/librte_eal/bsdapp/eal/eal_debug.c b/lib/librte_eal/bsdapp/eal/eal_debug.c index 907fbfa7..5fbc17c5 100644 --- a/lib/librte_eal/bsdapp/eal/eal_debug.c +++ b/lib/librte_eal/bsdapp/eal/eal_debug.c @@ -77,9 +77,6 @@ void __rte_panic(const char *funcname, const char *format, ...) 
{ va_list ap; - /* disable history */ - rte_log_set_history(0); - rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, "PANIC in %s():\n", funcname); va_start(ap, format); rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap); @@ -98,9 +95,6 @@ rte_exit(int exit_code, const char *format, ...) { va_list ap; - /* disable history */ - rte_log_set_history(0); - if (exit_code != 0) RTE_LOG(CRIT, EAL, "Error - exiting with code: %d\n" " Cause: ", exit_code); diff --git a/lib/librte_eal/bsdapp/eal/eal_pci.c b/lib/librte_eal/bsdapp/eal/eal_pci.c index 2d16d782..374b68f2 100644 --- a/lib/librte_eal/bsdapp/eal/eal_pci.c +++ b/lib/librte_eal/bsdapp/eal/eal_pci.c @@ -278,6 +278,11 @@ pci_scan_one(int dev_pci_fd, struct pci_conf *conf) /* get subsystem_device id */ dev->id.subsystem_device_id = conf->pc_subdevice; + /* get class id */ + dev->id.class_id = (conf->pc_class << 16) | + (conf->pc_subclass << 8) | + (conf->pc_progif); + /* TODO: get max_vfs */ dev->max_vfs = 0; @@ -422,7 +427,7 @@ int rte_eal_pci_read_config(const struct rte_pci_device *dev, goto error; } - fd = open("/dev/pci", O_RDONLY); + fd = open("/dev/pci", O_RDWR); if (fd < 0) { RTE_LOG(ERR, EAL, "%s(): error opening /dev/pci\n", __func__); goto error; @@ -466,7 +471,7 @@ int rte_eal_pci_write_config(const struct rte_pci_device *dev, memcpy(&pi.pi_data, buf, len); - fd = open("/dev/pci", O_RDONLY); + fd = open("/dev/pci", O_RDWR); if (fd < 0) { RTE_LOG(ERR, EAL, "%s(): error opening /dev/pci\n", __func__); goto error; diff --git a/lib/librte_eal/bsdapp/eal/eal_thread.c b/lib/librte_eal/bsdapp/eal/eal_thread.c index 9a034373..1b8cd8a6 100644 --- a/lib/librte_eal/bsdapp/eal/eal_thread.c +++ b/lib/librte_eal/bsdapp/eal/eal_thread.c @@ -199,3 +199,10 @@ int rte_sys_gettid(void) thr_self(&lwpid); return (int)lwpid; } + +int rte_thread_setname(pthread_t id, const char *name) +{ + /* this BSD function returns no error */ + pthread_set_name_np(id, name); + return 0; +} diff --git a/lib/librte_eal/bsdapp/eal/rte_eal_version.map 
b/lib/librte_eal/bsdapp/eal/rte_eal_version.map index 58c2951e..1852c4a4 100644 --- a/lib/librte_eal/bsdapp/eal/rte_eal_version.map +++ b/lib/librte_eal/bsdapp/eal/rte_eal_version.map @@ -151,3 +151,13 @@ DPDK_16.04 { rte_eal_primary_proc_alive; } DPDK_2.2; + +DPDK_16.07 { + global: + + pci_get_sysfs_path; + rte_keepalive_mark_sleep; + rte_keepalive_register_relay_callback; + rte_thread_setname; + +} DPDK_16.04; diff --git a/lib/librte_eal/common/eal_common_devargs.c b/lib/librte_eal/common/eal_common_devargs.c index 2bfe54a1..e403717b 100644 --- a/lib/librte_eal/common/eal_common_devargs.c +++ b/lib/librte_eal/common/eal_common_devargs.c @@ -58,7 +58,7 @@ rte_eal_parse_devargs_str(const char *devargs_str, return -1; *drvname = strdup(devargs_str); - if (drvname == NULL) + if (*drvname == NULL) return -1; /* set the first ',' to '\0' to split name and arguments */ diff --git a/lib/librte_eal/common/eal_common_lcore.c b/lib/librte_eal/common/eal_common_lcore.c index a4263ba5..2cd41320 100644 --- a/lib/librte_eal/common/eal_common_lcore.c +++ b/lib/librte_eal/common/eal_common_lcore.c @@ -104,7 +104,7 @@ rte_eal_cpu_init(void) RTE_LOG(DEBUG, EAL, "Support maximum %u logical core(s) by configuration.\n", RTE_MAX_LCORE); - RTE_LOG(DEBUG, EAL, "Detected %u lcore(s)\n", config->lcore_count); + RTE_LOG(INFO, EAL, "Detected %u lcore(s)\n", config->lcore_count); return 0; } diff --git a/lib/librte_eal/common/eal_common_log.c b/lib/librte_eal/common/eal_common_log.c index 1ae8de70..7916c781 100644 --- a/lib/librte_eal/common/eal_common_log.c +++ b/lib/librte_eal/common/eal_common_log.c @@ -31,54 +31,16 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include <string.h> #include <stdio.h> #include <stdint.h> #include <stdarg.h> -#include <sys/types.h> #include <stdlib.h> -#include <unistd.h> -#include <inttypes.h> -#include <errno.h> -#include <sys/queue.h> #include <rte_log.h> -#include <rte_memory.h> -#include <rte_memzone.h> -#include <rte_launch.h> -#include <rte_common.h> -#include <rte_cycles.h> -#include <rte_eal.h> #include <rte_per_lcore.h> -#include <rte_lcore.h> -#include <rte_atomic.h> -#include <rte_debug.h> -#include <rte_spinlock.h> -#include <rte_branch_prediction.h> -#include <rte_ring.h> -#include <rte_mempool.h> #include "eal_private.h" -#define LOG_ELT_SIZE 2048 - -#define LOG_HISTORY_MP_NAME "log_history" - -STAILQ_HEAD(log_history_list, log_history); - -/** - * The structure of a message log in the log history. - */ -struct log_history { - STAILQ_ENTRY(log_history) next; - unsigned size; - char buf[0]; -}; - -static struct rte_mempool *log_history_mp = NULL; -static unsigned log_history_size = 0; -static struct log_history_list log_history; - /* global log structure */ struct rte_logs rte_logs = { .type = ~0, @@ -86,10 +48,7 @@ struct rte_logs rte_logs = { .file = NULL, }; -static rte_spinlock_t log_dump_lock = RTE_SPINLOCK_INITIALIZER; -static rte_spinlock_t log_list_lock = RTE_SPINLOCK_INITIALIZER; static FILE *default_log_stream; -static int history_enabled = 1; /** * This global structure stores some informations about the message @@ -98,66 +57,24 @@ static int history_enabled = 1; struct log_cur_msg { uint32_t loglevel; /**< log level - see rte_log.h */ uint32_t logtype; /**< log type - see rte_log.h */ -} __rte_cache_aligned; -static struct log_cur_msg log_cur_msg[RTE_MAX_LCORE]; /**< per core log */ +}; + /* per core log */ +static RTE_DEFINE_PER_LCORE(struct log_cur_msg, log_cur_msg); /* default logs */ int -rte_log_add_in_history(const char *buf, size_t size) +rte_log_add_in_history(const char *buf __rte_unused, size_t size __rte_unused) { - struct log_history *hist_buf = 
NULL; - static const unsigned hist_buf_size = LOG_ELT_SIZE - sizeof(*hist_buf); - void *obj; - - if (history_enabled == 0) - return 0; - - rte_spinlock_lock(&log_list_lock); - - /* get a buffer for adding in history */ - if (log_history_size > RTE_LOG_HISTORY) { - hist_buf = STAILQ_FIRST(&log_history); - if (hist_buf) { - STAILQ_REMOVE_HEAD(&log_history, next); - log_history_size--; - } - } - else { - if (rte_mempool_mc_get(log_history_mp, &obj) < 0) - obj = NULL; - hist_buf = obj; - } - - /* no buffer */ - if (hist_buf == NULL) { - rte_spinlock_unlock(&log_list_lock); - return -ENOBUFS; - } - - /* not enough room for msg, buffer go back in mempool */ - if (size >= hist_buf_size) { - rte_mempool_mp_put(log_history_mp, hist_buf); - rte_spinlock_unlock(&log_list_lock); - return -ENOBUFS; - } - - /* add in history */ - memcpy(hist_buf->buf, buf, size); - hist_buf->buf[size] = hist_buf->buf[hist_buf_size-1] = '\0'; - hist_buf->size = size; - STAILQ_INSERT_TAIL(&log_history, hist_buf, next); - log_history_size++; - rte_spinlock_unlock(&log_list_lock); - return 0; } void rte_log_set_history(int enable) { - history_enabled = enable; + if (enable) + RTE_LOG(WARNING, EAL, "The log history is deprecated.\n"); } /* Change the stream that will be used by logging system */ @@ -205,63 +122,19 @@ rte_get_log_type(void) /* get the current loglevel for the message beeing processed */ int rte_log_cur_msg_loglevel(void) { - unsigned lcore_id; - lcore_id = rte_lcore_id(); - if (lcore_id >= RTE_MAX_LCORE) - return rte_get_log_level(); - return log_cur_msg[lcore_id].loglevel; + return RTE_PER_LCORE(log_cur_msg).loglevel; } /* get the current logtype for the message beeing processed */ int rte_log_cur_msg_logtype(void) { - unsigned lcore_id; - lcore_id = rte_lcore_id(); - if (lcore_id >= RTE_MAX_LCORE) - return rte_get_log_type(); - return log_cur_msg[lcore_id].logtype; + return RTE_PER_LCORE(log_cur_msg).logtype; } /* Dump log history to file */ void -rte_log_dump_history(FILE *out) 
+rte_log_dump_history(FILE *out __rte_unused) { - struct log_history_list tmp_log_history; - struct log_history *hist_buf; - unsigned i; - - /* only one dump at a time */ - rte_spinlock_lock(&log_dump_lock); - - /* save list, and re-init to allow logging during dump */ - rte_spinlock_lock(&log_list_lock); - tmp_log_history = log_history; - STAILQ_INIT(&log_history); - log_history_size = 0; - rte_spinlock_unlock(&log_list_lock); - - for (i=0; i<RTE_LOG_HISTORY; i++) { - - /* remove one message from history list */ - hist_buf = STAILQ_FIRST(&tmp_log_history); - - if (hist_buf == NULL) - break; - - STAILQ_REMOVE_HEAD(&tmp_log_history, next); - - /* write on stdout */ - if (fwrite(hist_buf->buf, hist_buf->size, 1, out) == 0) { - rte_mempool_mp_put(log_history_mp, hist_buf); - break; - } - - /* put back message structure in pool */ - rte_mempool_mp_put(log_history_mp, hist_buf); - } - fflush(out); - - rte_spinlock_unlock(&log_dump_lock); } /* @@ -273,17 +146,13 @@ rte_vlog(uint32_t level, uint32_t logtype, const char *format, va_list ap) { int ret; FILE *f = rte_logs.file; - unsigned lcore_id; if ((level > rte_logs.level) || !(logtype & rte_logs.type)) return 0; /* save loglevel and logtype in a global per-lcore variable */ - lcore_id = rte_lcore_id(); - if (lcore_id < RTE_MAX_LCORE) { - log_cur_msg[lcore_id].loglevel = level; - log_cur_msg[lcore_id].logtype = logtype; - } + RTE_PER_LCORE(log_cur_msg).loglevel = level; + RTE_PER_LCORE(log_cur_msg).logtype = logtype; ret = vfprintf(f, format, ap); fflush(f); @@ -308,30 +177,17 @@ rte_log(uint32_t level, uint32_t logtype, const char *format, ...) 
} /* - * called by environment-specific log init function to initialize log - * history + * called by environment-specific log init function */ int rte_eal_common_log_init(FILE *default_log) { - STAILQ_INIT(&log_history); - - /* reserve RTE_LOG_HISTORY*2 elements, so we can dump and - * keep logging during this time */ - log_history_mp = rte_mempool_create(LOG_HISTORY_MP_NAME, RTE_LOG_HISTORY*2, - LOG_ELT_SIZE, 0, 0, - NULL, NULL, - NULL, NULL, - SOCKET_ID_ANY, 0); - - if ((log_history_mp == NULL) && - ((log_history_mp = rte_mempool_lookup(LOG_HISTORY_MP_NAME)) == NULL)){ - RTE_LOG(ERR, EAL, "%s(): cannot create log_history mempool\n", - __func__); - return -1; - } - default_log_stream = default_log; rte_openlog_stream(default_log); + +#if RTE_LOG_LEVEL >= RTE_LOG_DEBUG + RTE_LOG(NOTICE, EAL, "Debug logs available - lower performance\n"); +#endif + return 0; } diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c index 711c8457..5d28341f 100644 --- a/lib/librte_eal/common/eal_common_memzone.c +++ b/lib/librte_eal/common/eal_common_memzone.c @@ -119,6 +119,9 @@ find_heap_max_free_elem(int *s, unsigned align) } } + if (len < MALLOC_ELEM_OVERHEAD + align) + return 0; + return len - MALLOC_ELEM_OVERHEAD - align; } @@ -126,6 +129,7 @@ static const struct rte_memzone * memzone_reserve_aligned_thread_unsafe(const char *name, size_t len, int socket_id, unsigned flags, unsigned align, unsigned bound) { + struct rte_memzone *mz; struct rte_mem_config *mcfg; size_t requested_len; int socket, i; @@ -148,6 +152,13 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len, return NULL; } + if (strlen(name) >= sizeof(mz->name) - 1) { + RTE_LOG(DEBUG, EAL, "%s(): memzone <%s>: name too long\n", + __func__, name); + rte_errno = EEXIST; + return NULL; + } + /* if alignment is not a power of two */ if (align && !rte_is_power_of_2(align)) { RTE_LOG(ERR, EAL, "%s(): Invalid alignment: %u\n", __func__, @@ -189,8 +200,13 @@ 
memzone_reserve_aligned_thread_unsafe(const char *name, size_t len, if (len == 0) { if (bound != 0) requested_len = bound; - else + else { requested_len = find_heap_max_free_elem(&socket_id, align); + if (requested_len == 0) { + rte_errno = ENOMEM; + return NULL; + } + } } if (socket_id == SOCKET_ID_ANY) @@ -223,7 +239,7 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len, const struct malloc_elem *elem = malloc_elem_from_data(mz_addr); /* fill the zone in config */ - struct rte_memzone *mz = get_next_free_memzone(); + mz = get_next_free_memzone(); if (mz == NULL) { RTE_LOG(ERR, EAL, "%s(): Cannot find free memzone but there is room " @@ -321,15 +337,19 @@ rte_memzone_free(const struct rte_memzone *mz) idx = ((uintptr_t)mz - (uintptr_t)mcfg->memzone); idx = idx / sizeof(struct rte_memzone); - addr = mcfg->memzone[idx].addr; #ifdef RTE_LIBRTE_IVSHMEM /* * If ioremap_addr is set, it's an IVSHMEM memzone and we cannot * free it. */ - if (mcfg->memzone[idx].ioremap_addr != 0) - ret = -EINVAL; + if (mcfg->memzone[idx].ioremap_addr != 0) { + rte_rwlock_write_unlock(&mcfg->mlock); + return -EINVAL; + } #endif + + addr = mcfg->memzone[idx].addr; + if (addr == NULL) ret = -EINVAL; else if (mcfg->memzone_cnt == 0) { diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c index 2b418d52..3efc90f0 100644 --- a/lib/librte_eal/common/eal_common_options.c +++ b/lib/librte_eal/common/eal_common_options.c @@ -139,7 +139,11 @@ eal_reset_internal_config(struct internal_config *internal_cfg) internal_cfg->syslog_facility = LOG_DAEMON; /* default value from build option */ +#if RTE_LOG_LEVEL >= RTE_LOG_DEBUG + internal_cfg->log_level = RTE_LOG_INFO; +#else internal_cfg->log_level = RTE_LOG_LEVEL; +#endif internal_cfg->xen_dom0_support = 0; diff --git a/lib/librte_eal/common/eal_common_pci.c b/lib/librte_eal/common/eal_common_pci.c index 40f49229..7248c38b 100644 --- a/lib/librte_eal/common/eal_common_pci.c +++ 
b/lib/librte_eal/common/eal_common_pci.c @@ -85,6 +85,19 @@ struct pci_driver_list pci_driver_list; struct pci_device_list pci_device_list; +#define SYSFS_PCI_DEVICES "/sys/bus/pci/devices" + +const char *pci_get_sysfs_path(void) +{ + const char *path = NULL; + + path = getenv("SYSFS_PCI_DEVICES"); + if (path == NULL) + return SYSFS_PCI_DEVICES; + + return path; +} + static struct rte_devargs *pci_devargs_lookup(struct rte_pci_device *dev) { struct rte_devargs *devargs; @@ -162,23 +175,26 @@ rte_eal_pci_probe_one_driver(struct rte_pci_driver *dr, struct rte_pci_device *d if (id_table->subsystem_device_id != dev->id.subsystem_device_id && id_table->subsystem_device_id != PCI_ANY_ID) continue; + if (id_table->class_id != dev->id.class_id && + id_table->class_id != RTE_CLASS_ANY_ID) + continue; struct rte_pci_addr *loc = &dev->addr; - RTE_LOG(DEBUG, EAL, "PCI device "PCI_PRI_FMT" on NUMA socket %i\n", + RTE_LOG(INFO, EAL, "PCI device "PCI_PRI_FMT" on NUMA socket %i\n", loc->domain, loc->bus, loc->devid, loc->function, dev->numa_node); - RTE_LOG(DEBUG, EAL, " probe driver: %x:%x %s\n", dev->id.vendor_id, - dev->id.device_id, dr->name); - /* no initialization when blacklisted, return without error */ if (dev->devargs != NULL && dev->devargs->type == RTE_DEVTYPE_BLACKLISTED_PCI) { - RTE_LOG(DEBUG, EAL, " Device is blacklisted, not initializing\n"); + RTE_LOG(INFO, EAL, " Device is blacklisted, not initializing\n"); return 1; } + RTE_LOG(INFO, EAL, " probe driver: %x:%x %s\n", dev->id.vendor_id, + dev->id.device_id, dr->name); + if (dr->drv_flags & RTE_PCI_DRV_NEED_MAPPING) { /* map resources for devices that use igb_uio */ ret = rte_eal_pci_map_device(dev); diff --git a/lib/librte_eal/common/eal_common_pci_uio.c b/lib/librte_eal/common/eal_common_pci_uio.c index f062e81d..367a6816 100644 --- a/lib/librte_eal/common/eal_common_pci_uio.c +++ b/lib/librte_eal/common/eal_common_pci_uio.c @@ -53,7 +53,7 @@ EAL_REGISTER_TAILQ(rte_uio_tailq) static int 
pci_uio_map_secondary(struct rte_pci_device *dev) { - int fd, i; + int fd, i, j; struct mapped_pci_resource *uio_res; struct mapped_pci_res_list *uio_res_list = RTE_TAILQ_CAST(rte_uio_tailq.head, mapped_pci_res_list); @@ -85,6 +85,16 @@ pci_uio_map_secondary(struct rte_pci_device *dev) "Cannot mmap device resource file %s to address: %p\n", uio_res->maps[i].path, uio_res->maps[i].addr); + if (mapaddr != MAP_FAILED) { + /* unmap addrs correctly mapped */ + for (j = 0; j < i; j++) + pci_unmap_resource( + uio_res->maps[j].addr, + (size_t)uio_res->maps[j].size); + /* unmap addr wrongly mapped */ + pci_unmap_resource(mapaddr, + (size_t)uio_res->maps[i].size); + } return -1; } } @@ -159,7 +169,8 @@ pci_uio_unmap(struct mapped_pci_resource *uio_res) for (i = 0; i != uio_res->nb_maps; i++) { pci_unmap_resource(uio_res->maps[i].addr, (size_t)uio_res->maps[i].size); - rte_free(uio_res->maps[i].path); + if (rte_eal_process_type() == RTE_PROC_PRIMARY) + rte_free(uio_res->maps[i].path); } } diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h index 2342fa16..857dc3ea 100644 --- a/lib/librte_eal/common/eal_private.h +++ b/lib/librte_eal/common/eal_private.h @@ -49,9 +49,6 @@ int rte_eal_memzone_init(void); /** * Common log initialization function (private to eal). * - * Called by environment-specific log initialization function to initialize - * log history. - * * @param default_log * The default log stream to be used. 
* @return diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h b/lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h index 988125b3..da6c233a 100644 --- a/lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h +++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h @@ -323,12 +323,6 @@ rte_memcpy(void *dst, const void *src, size_t n) return memcpy(dst, src, n); } -static inline void * -rte_memcpy_func(void *dst, const void *src, size_t n) -{ - return memcpy(dst, src, n); -} - #endif /* RTE_ARCH_ARM_NEON_MEMCPY */ #ifdef __cplusplus diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h index 917cdc1b..5db66b63 100644 --- a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h +++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h @@ -80,12 +80,6 @@ rte_mov256(uint8_t *dst, const uint8_t *src) #define rte_memcpy(d, s, n) memcpy((d), (s), (n)) -static inline void * -rte_memcpy_func(void *dst, const void *src, size_t n) -{ - return memcpy(dst, src, n); -} - #ifdef __cplusplus } #endif diff --git a/lib/librte_eal/common/include/arch/tile/rte_memcpy.h b/lib/librte_eal/common/include/arch/tile/rte_memcpy.h index 9b5b37ef..e606957c 100644 --- a/lib/librte_eal/common/include/arch/tile/rte_memcpy.h +++ b/lib/librte_eal/common/include/arch/tile/rte_memcpy.h @@ -80,12 +80,6 @@ rte_mov256(uint8_t *dst, const uint8_t *src) #define rte_memcpy(d, s, n) memcpy((d), (s), (n)) -static inline void * -rte_memcpy_func(void *dst, const void *src, size_t n) -{ - return memcpy(dst, src, n); -} - #ifdef __cplusplus } #endif diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h index f463ab30..413035e7 100644 --- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h +++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h @@ -363,71 +363,26 @@ rte_mov128(uint8_t *dst, const uint8_t *src) } /** - * Copy 256 bytes from one 
location to another, - * locations should not overlap. - */ -static inline void -rte_mov256(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); - rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); - rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); - rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32); - rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32); - rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32); - rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32); -} - -/** - * Copy 64-byte blocks from one location to another, - * locations should not overlap. - */ -static inline void -rte_mov64blocks(uint8_t *dst, const uint8_t *src, size_t n) -{ - __m256i ymm0, ymm1; - - while (n >= 64) { - ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32)); - n -= 64; - ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32)); - src = (const uint8_t *)src + 64; - _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0); - _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1); - dst = (uint8_t *)dst + 64; - } -} - -/** - * Copy 256-byte blocks from one location to another, + * Copy 128-byte blocks from one location to another, * locations should not overlap. 
*/ static inline void -rte_mov256blocks(uint8_t *dst, const uint8_t *src, size_t n) +rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n) { - __m256i ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7; + __m256i ymm0, ymm1, ymm2, ymm3; - while (n >= 256) { + while (n >= 128) { ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32)); - n -= 256; + n -= 128; ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32)); ymm2 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 2 * 32)); ymm3 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 3 * 32)); - ymm4 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 4 * 32)); - ymm5 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 5 * 32)); - ymm6 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 6 * 32)); - ymm7 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 7 * 32)); - src = (const uint8_t *)src + 256; + src = (const uint8_t *)src + 128; _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0); _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1); _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2); _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3); - _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 4 * 32), ymm4); - _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 5 * 32), ymm5); - _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 6 * 32), ymm6); - _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 7 * 32), ymm7); - dst = (uint8_t *)dst + 256; + dst = (uint8_t *)dst + 128; } } @@ -466,51 +421,56 @@ rte_memcpy(void *dst, const void *src, size_t n) } /** - * Fast way when copy size doesn't exceed 512 bytes + * Fast way when copy size doesn't exceed 256 bytes */ if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); + rte_mov16((uint8_t *)dst - 16 + n, + (const uint8_t *)src - 16 + n); + return ret; + 
} + if (n <= 48) { + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16); + rte_mov16((uint8_t *)dst - 16 + n, + (const uint8_t *)src - 16 + n); return ret; } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); - rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n); + rte_mov32((uint8_t *)dst - 32 + n, + (const uint8_t *)src - 32 + n); return ret; } - if (n <= 512) { - if (n >= 256) { - n -= 256; - rte_mov256((uint8_t *)dst, (const uint8_t *)src); - src = (const uint8_t *)src + 256; - dst = (uint8_t *)dst + 256; - } + if (n <= 256) { if (n >= 128) { n -= 128; rte_mov128((uint8_t *)dst, (const uint8_t *)src); src = (const uint8_t *)src + 128; dst = (uint8_t *)dst + 128; } +COPY_BLOCK_128_BACK31: if (n >= 64) { n -= 64; rte_mov64((uint8_t *)dst, (const uint8_t *)src); src = (const uint8_t *)src + 64; dst = (uint8_t *)dst + 64; } -COPY_BLOCK_64_BACK31: if (n > 32) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); - rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n); + rte_mov32((uint8_t *)dst - 32 + n, + (const uint8_t *)src - 32 + n); return ret; } if (n > 0) { - rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n); + rte_mov32((uint8_t *)dst - 32 + n, + (const uint8_t *)src - 32 + n); } return ret; } /** - * Make store aligned when copy size exceeds 512 bytes + * Make store aligned when copy size exceeds 256 bytes */ dstofss = (uintptr_t)dst & 0x1F; if (dstofss > 0) { @@ -522,35 +482,19 @@ COPY_BLOCK_64_BACK31: } /** - * Copy 256-byte blocks. - * Use copy block function for better instruction order control, - * which is important when load is unaligned. + * Copy 128-byte blocks */ - rte_mov256blocks((uint8_t *)dst, (const uint8_t *)src, n); + rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n); bits = n; - n = n & 255; + n = n & 127; bits -= n; src = (const uint8_t *)src + bits; dst = (uint8_t *)dst + bits; /** - * Copy 64-byte blocks. 
- * Use copy block function for better instruction order control, - * which is important when load is unaligned. - */ - if (n >= 64) { - rte_mov64blocks((uint8_t *)dst, (const uint8_t *)src, n); - bits = n; - n = n & 63; - bits -= n; - src = (const uint8_t *)src + bits; - dst = (uint8_t *)dst + bits; - } - - /** * Copy whatever left */ - goto COPY_BLOCK_64_BACK31; + goto COPY_BLOCK_128_BACK31; } #else /* RTE_MACHINE_CPUFLAG */ diff --git a/lib/librte_eal/common/include/arch/x86/rte_rtm.h b/lib/librte_eal/common/include/arch/x86/rte_rtm.h index d9356419..0649f794 100644 --- a/lib/librte_eal/common/include/arch/x86/rte_rtm.h +++ b/lib/librte_eal/common/include/arch/x86/rte_rtm.h @@ -50,11 +50,10 @@ void rte_xend(void) asm volatile(".byte 0x0f,0x01,0xd5" ::: "memory"); } -static __attribute__((__always_inline__)) inline -void rte_xabort(const unsigned int status) -{ - asm volatile(".byte 0xc6,0xf8,%P0" :: "i" (status) : "memory"); -} +/* not an inline function to workaround a clang bug with -O0 */ +#define rte_xabort(status) do { \ + asm volatile(".byte 0xc6,0xf8,%P0" :: "i" (status) : "memory"); \ +} while (0) static __attribute__((__always_inline__)) inline int rte_xtest(void) diff --git a/lib/librte_eal/common/include/generic/rte_memcpy.h b/lib/librte_eal/common/include/generic/rte_memcpy.h index 03e84773..afb0afe4 100644 --- a/lib/librte_eal/common/include/generic/rte_memcpy.h +++ b/lib/librte_eal/common/include/generic/rte_memcpy.h @@ -134,11 +134,4 @@ rte_memcpy(void *dst, const void *src, size_t n); #endif /* __DOXYGEN__ */ -/* - * memcpy() function used by rte_memcpy macro - */ -static inline void * -rte_memcpy_func(void *dst, const void *src, size_t n) __attribute__((always_inline)); - - #endif /* _RTE_MEMCPY_H_ */ diff --git a/lib/librte_eal/common/include/rte_debug.h b/lib/librte_eal/common/include/rte_debug.h index 94129fab..cab6fb4c 100644 --- a/lib/librte_eal/common/include/rte_debug.h +++ b/lib/librte_eal/common/include/rte_debug.h @@ -43,6 +43,9 @@ * 
the implementation is architecture-specific. */ +#include "rte_log.h" +#include "rte_branch_prediction.h" + #ifdef __cplusplus extern "C" { #endif @@ -76,8 +79,13 @@ void rte_dump_registers(void); #define rte_panic(...) rte_panic_(__func__, __VA_ARGS__, "dummy") #define rte_panic_(func, format, ...) __rte_panic(func, format "%.0s", __VA_ARGS__) +#if RTE_LOG_LEVEL >= RTE_LOG_DEBUG +#define RTE_ASSERT(exp) RTE_VERIFY(exp) +#else +#define RTE_ASSERT(exp) do {} while (0) +#endif #define RTE_VERIFY(exp) do { \ - if (!(exp)) \ + if (unlikely(!(exp))) \ rte_panic("line %d\tassert \"" #exp "\" failed\n", __LINE__); \ } while (0) diff --git a/lib/librte_eal/common/include/rte_keepalive.h b/lib/librte_eal/common/include/rte_keepalive.h index 10dac2e0..88ad8e48 100644 --- a/lib/librte_eal/common/include/rte_keepalive.h +++ b/lib/librte_eal/common/include/rte_keepalive.h @@ -1,7 +1,7 @@ /*- * BSD LICENSE * - * Copyright 2015 Intel Shannon Ltd. All rights reserved. + * Copyright 2015-2016 Intel Shannon Ltd. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -48,17 +48,47 @@ #define RTE_KEEPALIVE_MAXCORES RTE_MAX_LCORE #endif +enum rte_keepalive_state { + RTE_KA_STATE_UNUSED = 0, + RTE_KA_STATE_ALIVE = 1, + RTE_KA_STATE_MISSING = 4, + RTE_KA_STATE_DEAD = 2, + RTE_KA_STATE_GONE = 3, + RTE_KA_STATE_DOZING = 5, + RTE_KA_STATE_SLEEP = 6 +}; + /** * Keepalive failure callback. * * Receives a data pointer passed to rte_keepalive_create() and the id of the * failed core. + * @param data Data pointer passed to rte_keepalive_create() + * @param id_core ID of the core that has failed */ typedef void (*rte_keepalive_failure_callback_t)( void *data, const int id_core); /** + * Keepalive relay callback. 
+ * + * Receives a data pointer passed to rte_keepalive_register_relay_callback(), + * the id of the core for which state is to be forwarded, and details of the + * current core state. + * @param data Data pointer passed to rte_keepalive_register_relay_callback() + * @param id_core ID of the core for which state is being reported + * @param core_state The current state of the core + * @param last_seen Timestamp of when core was last seen alive + */ +typedef void (*rte_keepalive_relay_callback_t)( + void *data, + const int id_core, + enum rte_keepalive_state core_state, + uint64_t last_seen + ); + +/** * Keepalive state structure. * @internal */ @@ -105,4 +135,35 @@ void rte_keepalive_register_core(struct rte_keepalive *keepcfg, void rte_keepalive_mark_alive(struct rte_keepalive *keepcfg); +/** + * Per-core sleep-time indication. + * @param *keepcfg + * Keepalive structure pointer + * + * If CPU idling is enabled, this function needs to be called from within + * the main process loop of the LCore going to sleep, in order to avoid + * the LCore being mis-detected as dead. + */ +void +rte_keepalive_mark_sleep(struct rte_keepalive *keepcfg); + +/** + * Registers a 'live core' callback. + * + * The complement of the 'dead core' callback. This is called when a + * core is known to be alive, and is intended for cases when an app + * needs to know 'liveness' beyond just knowing when a core has died. + * + * @param *keepcfg + * Keepalive structure pointer + * @param callback + * Function called to relay the current state of each monitored core. + * @param data + * Data pointer to be passed to function callback. 
+ */ +void +rte_keepalive_register_relay_callback(struct rte_keepalive *keepcfg, + rte_keepalive_relay_callback_t callback, + void *data); + #endif /* _KEEPALIVE_H_ */ diff --git a/lib/librte_eal/common/include/rte_lcore.h b/lib/librte_eal/common/include/rte_lcore.h index ac151302..fe7b5865 100644 --- a/lib/librte_eal/common/include/rte_lcore.h +++ b/lib/librte_eal/common/include/rte_lcore.h @@ -250,23 +250,16 @@ void rte_thread_get_affinity(rte_cpuset_t *cpusetp); /** * Set thread names. * - * Macro to wrap `pthread_setname_np()` with a glibc version check. - * Only glibc >= 2.12 supports this feature. + * @note It fails with glibc < 2.12. * - * This macro only used for Linux, BSD does direct libc call. - * BSD libc version of function is `pthread_set_name_np()`. + * @param id + * Thread id. + * @param name + * Thread name to set. + * @return + * On success, return 0; otherwise return a negative value. */ -#if defined(__DOXYGEN__) -#define rte_thread_setname(...) pthread_setname_np(__VA_ARGS__) -#endif - -#if defined(__GLIBC__) && defined(__GLIBC_PREREQ) -#if __GLIBC_PREREQ(2, 12) -#define rte_thread_setname(...) pthread_setname_np(__VA_ARGS__) -#else -#define rte_thread_setname(...) 0 -#endif -#endif +int rte_thread_setname(pthread_t id, const char *name); #ifdef __cplusplus } diff --git a/lib/librte_eal/common/include/rte_log.h b/lib/librte_eal/common/include/rte_log.h index 2e47e7f6..b1add04c 100644 --- a/lib/librte_eal/common/include/rte_log.h +++ b/lib/librte_eal/common/include/rte_log.h @@ -42,6 +42,8 @@ * This file provides a log API to RTE applications. */ +#include "rte_common.h" /* for __rte_deprecated macro */ + #ifdef __cplusplus extern "C" { #endif @@ -179,22 +181,27 @@ int rte_log_cur_msg_loglevel(void); int rte_log_cur_msg_logtype(void); /** + * @deprecated * Enable or disable the history (enabled by default) * * @param enable * true to enable, or 0 to disable history. 
*/ +__rte_deprecated void rte_log_set_history(int enable); /** + * @deprecated * Dump the log history to a file * * @param f * A pointer to a file for output */ +__rte_deprecated void rte_log_dump_history(FILE *f); /** + * @deprecated * Add a log message to the history. * * This function can be called from a user-defined log stream. It adds @@ -209,6 +216,7 @@ void rte_log_dump_history(FILE *f); * - 0: Success. * - (-ENOBUFS) if there is no room to store the message. */ +__rte_deprecated int rte_log_add_in_history(const char *buf, size_t size); /** diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h index f8dbece0..06611093 100644 --- a/lib/librte_eal/common/include/rte_memory.h +++ b/lib/librte_eal/common/include/rte_memory.h @@ -200,21 +200,22 @@ unsigned rte_memory_get_nrank(void); int rte_xen_dom0_supported(void); /**< Internal use only - phys to virt mapping for xen */ -phys_addr_t rte_xen_mem_phy2mch(uint32_t, const phys_addr_t); +phys_addr_t rte_xen_mem_phy2mch(int32_t, const phys_addr_t); /** * Return the physical address of elt, which is an element of the pool mp. * * @param memseg_id - * The mempool is from which memory segment. + * Identifier of the memory segment owning the physical address. If + * set to -1, find it automatically. * @param phy_addr * physical address of elt. * * @return - * The physical address or error. + * The physical address or RTE_BAD_PHYS_ADDR on error. 
*/ static inline phys_addr_t -rte_mem_phy2mch(uint32_t memseg_id, const phys_addr_t phy_addr) +rte_mem_phy2mch(int32_t memseg_id, const phys_addr_t phy_addr) { if (rte_xen_dom0_supported()) return rte_xen_mem_phy2mch(memseg_id, phy_addr); @@ -250,7 +251,7 @@ static inline int rte_xen_dom0_supported(void) } static inline phys_addr_t -rte_mem_phy2mch(uint32_t memseg_id __rte_unused, const phys_addr_t phy_addr) +rte_mem_phy2mch(int32_t memseg_id __rte_unused, const phys_addr_t phy_addr) { return phy_addr; } diff --git a/lib/librte_eal/common/include/rte_pci.h b/lib/librte_eal/common/include/rte_pci.h index e692094e..fa749626 100644 --- a/lib/librte_eal/common/include/rte_pci.h +++ b/lib/librte_eal/common/include/rte_pci.h @@ -91,7 +91,7 @@ extern struct pci_driver_list pci_driver_list; /**< Global list of PCI drivers. extern struct pci_device_list pci_device_list; /**< Global list of PCI devices. */ /** Pathname of PCI devices directory. */ -#define SYSFS_PCI_DEVICES "/sys/bus/pci/devices" +const char *pci_get_sysfs_path(void); /** Formatting string for PCI device identifier: Ex: 0000:00:01.0 */ #define PCI_PRI_FMT "%.4" PRIx16 ":%.2" PRIx8 ":%.2" PRIx8 ".%" PRIx8 @@ -105,9 +105,6 @@ extern struct pci_device_list pci_device_list; /**< Global list of PCI devices. /** Nb. of values in PCI resource format. */ #define PCI_RESOURCE_FMT_NVAL 3 -/** IO resource type: memory address space */ -#define IORESOURCE_MEM 0x00000200 - /** * A structure describing a PCI resource. */ @@ -125,6 +122,7 @@ struct rte_pci_resource { * table of these IDs for each device that it supports. */ struct rte_pci_id { + uint32_t class_id; /**< Class ID (class, subclass, pi) or RTE_CLASS_ANY_ID. */ uint16_t vendor_id; /**< Vendor ID or PCI_ANY_ID. */ uint16_t device_id; /**< Device ID or PCI_ANY_ID. */ uint16_t subsystem_vendor_id; /**< Subsystem vendor ID or PCI_ANY_ID. */ @@ -170,10 +168,12 @@ struct rte_pci_device { /** Any PCI device identifier (vendor, device, ...) 
*/ #define PCI_ANY_ID (0xffff) +#define RTE_CLASS_ANY_ID (0xffffff) #ifdef __cplusplus /** C++ macro used to help building up tables of device IDs */ #define RTE_PCI_DEVICE(vend, dev) \ + RTE_CLASS_ANY_ID, \ (vend), \ (dev), \ PCI_ANY_ID, \ @@ -181,6 +181,7 @@ struct rte_pci_device { #else /** Macro used to help building up tables of device IDs */ #define RTE_PCI_DEVICE(vend, dev) \ + .class_id = RTE_CLASS_ANY_ID, \ .vendor_id = (vend), \ .device_id = (dev), \ .subsystem_vendor_id = PCI_ANY_ID, \ @@ -213,8 +214,6 @@ struct rte_pci_driver { /** Device needs PCI BAR mapping (done with either IGB_UIO or VFIO) */ #define RTE_PCI_DRV_NEED_MAPPING 0x0001 -/** Device driver must be registered several times until failure - deprecated */ -#pragma GCC poison RTE_PCI_DRV_MULTIPLE /** Device needs to be unbound even if no module is provided */ #define RTE_PCI_DRV_FORCE_UNBIND 0x0004 /** Device driver supports link state interrupt */ @@ -520,15 +519,17 @@ int rte_eal_pci_write_config(const struct rte_pci_device *device, struct rte_pci_ioport { struct rte_pci_device *dev; uint64_t base; + uint64_t len; /* only filled for memory mapped ports */ }; /** - * Initialises a rte_pci_ioport object for a pci device io resource. + * Initialize a rte_pci_ioport object for a pci device io resource. + * * This object is then used to gain access to those io resources (see below). * * @param dev - * A pointer to a rte_pci_device structure describing the device. - * to use + * A pointer to a rte_pci_device structure describing the device + * to use. * @param bar * Index of the io pci resource we want to access. * @param p @@ -544,6 +545,8 @@ int rte_eal_pci_ioport_map(struct rte_pci_device *dev, int bar, * * @param p * The rte_pci_ioport object to be uninitialized. + * @return + * 0 on success, negative on error. 
*/ int rte_eal_pci_ioport_unmap(struct rte_pci_ioport *p); @@ -577,20 +580,6 @@ void rte_eal_pci_ioport_read(struct rte_pci_ioport *p, void rte_eal_pci_ioport_write(struct rte_pci_ioport *p, const void *data, size_t len, off_t offset); -#ifdef RTE_PCI_CONFIG -#include <rte_common.h> -/** - * Set special config space registers for performance purpose. - * It is deprecated, as all configurations have been moved into - * each PMDs respectively. - * - * @param dev - * A pointer to a rte_pci_device structure describing the device - * to use - */ -void pci_config_space_set(struct rte_pci_device *dev) __rte_deprecated; -#endif /* RTE_PCI_CONFIG */ - #ifdef __cplusplus } #endif diff --git a/lib/librte_eal/common/include/rte_pci_dev_ids.h b/lib/librte_eal/common/include/rte_pci_dev_ids.h index cf7b5487..af39fbbd 100644 --- a/lib/librte_eal/common/include/rte_pci_dev_ids.h +++ b/lib/librte_eal/common/include/rte_pci_dev_ids.h @@ -63,11 +63,12 @@ * This file contains a list of the PCI device IDs recognised by DPDK, which * can be used to fill out an array of structures describing the devices. * - * Currently four families of devices are recognised: those supported by the - * IGB driver, by EM driver, those supported by the IXGBE driver, and by virtio - * driver which is a para virtualization driver running in guest virtual machine. - * The inclusion of these in an array built using this file depends on the - * definition of + * Currently five families of devices are recognised: those supported by the + * IGB driver, by EM driver, those supported by the IXGBE driver, those + * supported by the BNXT driver, and by virtio driver which is a para + * virtualization driver running in guest virtual machine. 
The inclusion of + * these in an array built using this file depends on the definition of + * RTE_PCI_DEV_ID_DECL_BNXT * RTE_PCI_DEV_ID_DECL_EM * RTE_PCI_DEV_ID_DECL_IGB * RTE_PCI_DEV_ID_DECL_IGBVF @@ -152,6 +153,10 @@ #define RTE_PCI_DEV_ID_DECL_BNX2XVF(vend, dev) #endif +#ifndef RTE_PCI_DEV_ID_DECL_BNXT +#define RTE_PCI_DEV_ID_DECL_BNXT(vend, dev) +#endif + #ifndef PCI_VENDOR_ID_INTEL /** Vendor ID used by Intel devices */ #define PCI_VENDOR_ID_INTEL 0x8086 @@ -446,12 +451,14 @@ RTE_PCI_DEV_ID_DECL_IGB(PCI_VENDOR_ID_INTEL, E1000_DEV_ID_DH89XXCC_SFP) #define IXGBE_DEV_ID_X550EM_A_KR 0x15C2 #define IXGBE_DEV_ID_X550EM_A_KR_L 0x15C3 #define IXGBE_DEV_ID_X550EM_A_SFP_N 0x15C4 -#define IXGBE_DEV_ID_X550EM_A_1G_T 0x15C6 -#define IXGBE_DEV_ID_X550EM_A_1G_T_L 0x15C7 +#define IXGBE_DEV_ID_X550EM_A_SGMII 0x15C6 +#define IXGBE_DEV_ID_X550EM_A_SGMII_L 0x15C7 #define IXGBE_DEV_ID_X550EM_A_10G_T 0x15C8 #define IXGBE_DEV_ID_X550EM_A_QSFP 0x15CA #define IXGBE_DEV_ID_X550EM_A_QSFP_N 0x15CC #define IXGBE_DEV_ID_X550EM_A_SFP 0x15CE +#define IXGBE_DEV_ID_X550EM_A_1G_T 0x15E4 +#define IXGBE_DEV_ID_X550EM_A_1G_T_L 0x15E5 #define IXGBE_DEV_ID_X550EM_X_KX4 0x15AA #define IXGBE_DEV_ID_X550EM_X_KR 0x15AB @@ -506,12 +513,14 @@ RTE_PCI_DEV_ID_DECL_IXGBE(PCI_VENDOR_ID_INTEL, IXGBE_DEV_ID_X550T1) RTE_PCI_DEV_ID_DECL_IXGBE(PCI_VENDOR_ID_INTEL, IXGBE_DEV_ID_X550EM_A_KR) RTE_PCI_DEV_ID_DECL_IXGBE(PCI_VENDOR_ID_INTEL, IXGBE_DEV_ID_X550EM_A_KR_L) RTE_PCI_DEV_ID_DECL_IXGBE(PCI_VENDOR_ID_INTEL, IXGBE_DEV_ID_X550EM_A_SFP_N) -RTE_PCI_DEV_ID_DECL_IXGBE(PCI_VENDOR_ID_INTEL, IXGBE_DEV_ID_X550EM_A_1G_T) -RTE_PCI_DEV_ID_DECL_IXGBE(PCI_VENDOR_ID_INTEL, IXGBE_DEV_ID_X550EM_A_1G_T_L) +RTE_PCI_DEV_ID_DECL_IXGBE(PCI_VENDOR_ID_INTEL, IXGBE_DEV_ID_X550EM_A_SGMII) +RTE_PCI_DEV_ID_DECL_IXGBE(PCI_VENDOR_ID_INTEL, IXGBE_DEV_ID_X550EM_A_SGMII_L) RTE_PCI_DEV_ID_DECL_IXGBE(PCI_VENDOR_ID_INTEL, IXGBE_DEV_ID_X550EM_A_10G_T) RTE_PCI_DEV_ID_DECL_IXGBE(PCI_VENDOR_ID_INTEL, IXGBE_DEV_ID_X550EM_A_QSFP) 
RTE_PCI_DEV_ID_DECL_IXGBE(PCI_VENDOR_ID_INTEL, IXGBE_DEV_ID_X550EM_A_QSFP_N) RTE_PCI_DEV_ID_DECL_IXGBE(PCI_VENDOR_ID_INTEL, IXGBE_DEV_ID_X550EM_A_SFP) +RTE_PCI_DEV_ID_DECL_IXGBE(PCI_VENDOR_ID_INTEL, IXGBE_DEV_ID_X550EM_A_1G_T) +RTE_PCI_DEV_ID_DECL_IXGBE(PCI_VENDOR_ID_INTEL, IXGBE_DEV_ID_X550EM_A_1G_T_L) RTE_PCI_DEV_ID_DECL_IXGBE(PCI_VENDOR_ID_INTEL, IXGBE_DEV_ID_X550EM_X_KX4) RTE_PCI_DEV_ID_DECL_IXGBE(PCI_VENDOR_ID_INTEL, IXGBE_DEV_ID_X550EM_X_KR) @@ -532,12 +541,16 @@ RTE_PCI_DEV_ID_DECL_IXGBE(PCI_VENDOR_ID_INTEL, IXGBE_DEV_ID_82599_BYPASS) #define I40E_DEV_ID_20G_KR2 0x1587 #define I40E_DEV_ID_20G_KR2_A 0x1588 #define I40E_DEV_ID_10G_BASE_T4 0x1589 +#define I40E_DEV_ID_25G_B 0x158A +#define I40E_DEV_ID_25G_SFP28 0x158B #define I40E_DEV_ID_X722_A0 0x374C #define I40E_DEV_ID_KX_X722 0x37CE #define I40E_DEV_ID_QSFP_X722 0x37CF #define I40E_DEV_ID_SFP_X722 0x37D0 #define I40E_DEV_ID_1G_BASE_T_X722 0x37D1 #define I40E_DEV_ID_10G_BASE_T_X722 0x37D2 +#define I40E_DEV_ID_SFP_I_X722 0x37D3 +#define I40E_DEV_ID_QSFP_I_X722 0x37D4 RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_SFP_XL710) RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_QEMU) @@ -550,12 +563,16 @@ RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_10G_BASE_T) RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_20G_KR2) RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_20G_KR2_A) RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_10G_BASE_T4) +RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_25G_B) +RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_25G_SFP28) RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_X722_A0) RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_KX_X722) RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_QSFP_X722) RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_SFP_X722) RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_1G_BASE_T_X722) RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, 
I40E_DEV_ID_10G_BASE_T_X722) +RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_SFP_I_X722) +RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_QSFP_I_X722) /*************** Physical FM10K devices from fm10k_type.h ***************/ @@ -686,6 +703,30 @@ RTE_PCI_DEV_ID_DECL_BNX2X(PCI_VENDOR_ID_BROADCOM, BNX2X_DEV_ID_57811_MF) RTE_PCI_DEV_ID_DECL_BNX2X(PCI_VENDOR_ID_BROADCOM, BNX2X_DEV_ID_57840_MF) #endif +/****************** Broadcom bnxt devices ******************/ + +#define BROADCOM_DEV_ID_57301 0x16c8 +#define BROADCOM_DEV_ID_57302 0x16c9 +#define BROADCOM_DEV_ID_57304_PF 0x16ca +#define BROADCOM_DEV_ID_57304_VF 0x16cb +#define BROADCOM_DEV_ID_57402 0x16d0 +#define BROADCOM_DEV_ID_57404 0x16d1 +#define BROADCOM_DEV_ID_57406_PF 0x16d2 +#define BROADCOM_DEV_ID_57406_VF 0x16d3 +#define BROADCOM_DEV_ID_57406_MF 0x16d4 +#define BROADCOM_DEV_ID_57314 0x16df + +RTE_PCI_DEV_ID_DECL_BNXT(PCI_VENDOR_ID_BROADCOM, BROADCOM_DEV_ID_57301) +RTE_PCI_DEV_ID_DECL_BNXT(PCI_VENDOR_ID_BROADCOM, BROADCOM_DEV_ID_57302) +RTE_PCI_DEV_ID_DECL_BNXT(PCI_VENDOR_ID_BROADCOM, BROADCOM_DEV_ID_57304_PF) +RTE_PCI_DEV_ID_DECL_BNXT(PCI_VENDOR_ID_BROADCOM, BROADCOM_DEV_ID_57304_VF) +RTE_PCI_DEV_ID_DECL_BNXT(PCI_VENDOR_ID_BROADCOM, BROADCOM_DEV_ID_57402) +RTE_PCI_DEV_ID_DECL_BNXT(PCI_VENDOR_ID_BROADCOM, BROADCOM_DEV_ID_57404) +RTE_PCI_DEV_ID_DECL_BNXT(PCI_VENDOR_ID_BROADCOM, BROADCOM_DEV_ID_57406_PF) +RTE_PCI_DEV_ID_DECL_BNXT(PCI_VENDOR_ID_BROADCOM, BROADCOM_DEV_ID_57406_VF) +RTE_PCI_DEV_ID_DECL_BNXT(PCI_VENDOR_ID_BROADCOM, BROADCOM_DEV_ID_57406_MF) +RTE_PCI_DEV_ID_DECL_BNXT(PCI_VENDOR_ID_BROADCOM, BROADCOM_DEV_ID_57314) + /* * Undef all RTE_PCI_DEV_ID_DECL_* here. 
*/ @@ -702,3 +743,4 @@ RTE_PCI_DEV_ID_DECL_BNX2X(PCI_VENDOR_ID_BROADCOM, BNX2X_DEV_ID_57840_MF) #undef RTE_PCI_DEV_ID_DECL_VMXNET3 #undef RTE_PCI_DEV_ID_DECL_FM10K #undef RTE_PCI_DEV_ID_DECL_FM10KVF +#undef RTE_PCI_DEV_ID_DECL_BNXT diff --git a/lib/librte_eal/common/include/rte_version.h b/lib/librte_eal/common/include/rte_version.h index f8ea6d73..dbe09975 100644 --- a/lib/librte_eal/common/include/rte_version.h +++ b/lib/librte_eal/common/include/rte_version.h @@ -60,7 +60,7 @@ extern "C" { /** * Minor version/month number i.e. the mm in yy.mm.z */ -#define RTE_VER_MONTH 4 +#define RTE_VER_MONTH 7 /** * Patch level number i.e. the z in yy.mm.z @@ -70,14 +70,14 @@ extern "C" { /** * Extra string to be appended to version number */ -#define RTE_VER_SUFFIX "" +#define RTE_VER_SUFFIX "-rc" /** * Patch release number * 0-15 = release candidates * 16 = release */ -#define RTE_VER_RELEASE 16 +#define RTE_VER_RELEASE 1 /** * Macro to compute a version number usable for comparisons diff --git a/lib/librte_eal/common/rte_keepalive.c b/lib/librte_eal/common/rte_keepalive.c index 23363ec1..9765d1bd 100644 --- a/lib/librte_eal/common/rte_keepalive.c +++ b/lib/librte_eal/common/rte_keepalive.c @@ -42,12 +42,8 @@ struct rte_keepalive { /** Core Liveness. */ - enum rte_keepalive_state { - ALIVE = 1, - MISSING = 0, - DEAD = 2, - GONE = 3 - } __rte_cache_aligned state_flags[RTE_KEEPALIVE_MAXCORES]; + enum rte_keepalive_state __rte_cache_aligned state_flags[ + RTE_KEEPALIVE_MAXCORES]; /** Last-seen-alive timestamps */ uint64_t last_alive[RTE_KEEPALIVE_MAXCORES]; @@ -68,6 +64,15 @@ struct rte_keepalive { void *callback_data; uint64_t tsc_initial; uint64_t tsc_mhz; + + /** Core state relay handler. */ + rte_keepalive_relay_callback_t relay_callback; + + /** + * Core state relay handler app data. + * Pointer is passed to live core handler. 
+ */ + void *relay_callback_data; }; static void @@ -92,16 +97,18 @@ rte_keepalive_dispatch_pings(__rte_unused void *ptr_timer, continue; switch (keepcfg->state_flags[idx_core]) { - case ALIVE: /* Alive */ - keepcfg->state_flags[idx_core] = MISSING; + case RTE_KA_STATE_UNUSED: + break; + case RTE_KA_STATE_ALIVE: /* Alive */ + keepcfg->state_flags[idx_core] = RTE_KA_STATE_MISSING; keepcfg->last_alive[idx_core] = rte_rdtsc(); break; - case MISSING: /* MIA */ + case RTE_KA_STATE_MISSING: /* MIA */ print_trace("Core MIA. ", keepcfg, idx_core); - keepcfg->state_flags[idx_core] = DEAD; + keepcfg->state_flags[idx_core] = RTE_KA_STATE_DEAD; break; - case DEAD: /* Dead */ - keepcfg->state_flags[idx_core] = GONE; + case RTE_KA_STATE_DEAD: /* Dead */ + keepcfg->state_flags[idx_core] = RTE_KA_STATE_GONE; print_trace("Core died. ", keepcfg, idx_core); if (keepcfg->callback) keepcfg->callback( @@ -109,9 +116,22 @@ rte_keepalive_dispatch_pings(__rte_unused void *ptr_timer, idx_core ); break; - case GONE: /* Buried */ + case RTE_KA_STATE_GONE: /* Buried */ + break; + case RTE_KA_STATE_DOZING: /* Core going idle */ + keepcfg->state_flags[idx_core] = RTE_KA_STATE_SLEEP; + keepcfg->last_alive[idx_core] = rte_rdtsc(); + break; + case RTE_KA_STATE_SLEEP: /* Idled core */ break; } + if (keepcfg->relay_callback) + keepcfg->relay_callback( + keepcfg->relay_callback_data, + idx_core, + keepcfg->state_flags[idx_core], + keepcfg->last_alive[idx_core] + ); } } @@ -133,11 +153,19 @@ rte_keepalive_create(rte_keepalive_failure_callback_t callback, return keepcfg; } +void rte_keepalive_register_relay_callback(struct rte_keepalive *keepcfg, + rte_keepalive_relay_callback_t callback, + void *data) +{ + keepcfg->relay_callback = callback; + keepcfg->relay_callback_data = data; +} + void rte_keepalive_register_core(struct rte_keepalive *keepcfg, const int id_core) { if (id_core < RTE_KEEPALIVE_MAXCORES) { - keepcfg->active_cores[id_core] = 1; + keepcfg->active_cores[id_core] = RTE_KA_STATE_ALIVE; 
keepcfg->last_alive[id_core] = rte_rdtsc(); } } @@ -145,5 +173,11 @@ rte_keepalive_register_core(struct rte_keepalive *keepcfg, const int id_core) void rte_keepalive_mark_alive(struct rte_keepalive *keepcfg) { - keepcfg->state_flags[rte_lcore_id()] = ALIVE; + keepcfg->state_flags[rte_lcore_id()] = RTE_KA_STATE_ALIVE; +} + +void +rte_keepalive_mark_sleep(struct rte_keepalive *keepcfg) +{ + keepcfg->state_flags[rte_lcore_id()] = RTE_KA_STATE_DOZING; } diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile index e1093619..30b30f33 100644 --- a/lib/librte_eal/linuxapp/eal/Makefile +++ b/lib/librte_eal/linuxapp/eal/Makefile @@ -44,9 +44,12 @@ VPATH += $(RTE_SDK)/lib/librte_eal/common CFLAGS += -I$(SRCDIR)/include CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common/include +ifeq ($(CONFIG_RTE_LIBRTE_IVSHMEM),y) +# workaround for circular dependency eal -> ivshmem -> ring/mempool -> eal CFLAGS += -I$(RTE_SDK)/lib/librte_ring CFLAGS += -I$(RTE_SDK)/lib/librte_mempool CFLAGS += -I$(RTE_SDK)/lib/librte_ivshmem +endif CFLAGS += $(WERROR_FLAGS) -O3 LDLIBS += -ldl diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c index 8aafd519..543ef869 100644 --- a/lib/librte_eal/linuxapp/eal/eal.c +++ b/lib/librte_eal/linuxapp/eal/eal.c @@ -465,24 +465,6 @@ eal_parse_vfio_intr(const char *mode) return -1; } -static inline size_t -eal_get_hugepage_mem_size(void) -{ - uint64_t size = 0; - unsigned i, j; - - for (i = 0; i < internal_config.num_hugepage_sizes; i++) { - struct hugepage_info *hpi = &internal_config.hugepage_info[i]; - if (hpi->hugedir != NULL) { - for (j = 0; j < RTE_MAX_NUMA_NODES; j++) { - size += hpi->hugepage_sz * hpi->num_pages[j]; - } - } - } - - return (size < SIZE_MAX) ? 
(size_t)(size) : SIZE_MAX; -} - /* Parse the arguments for --log-level only */ static void eal_log_level_parse(int argc, char **argv) @@ -715,12 +697,8 @@ rte_eal_iopl_init(void) #if defined(RTE_ARCH_X86) if (iopl(3) != 0) return -1; - return 0; -#elif defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64) - return 0; /* iopl syscall not supported for ARM/ARM64 */ -#else - return -1; #endif + return 0; } /* Launch threads, called at application init(). */ @@ -766,8 +744,6 @@ rte_eal_init(int argc, char **argv) if (internal_config.memory == 0 && internal_config.force_sockets == 0) { if (internal_config.no_hugetlbfs) internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE; - else - internal_config.memory = eal_get_hugepage_mem_size(); } if (internal_config.vmware_tsc_map == 1) { @@ -863,7 +839,7 @@ rte_eal_init(int argc, char **argv) ret = rte_thread_setname(lcore_config[i].thread_id, thread_name); if (ret != 0) - RTE_LOG(ERR, EAL, + RTE_LOG(DEBUG, EAL, "Cannot set name for lcore thread\n"); } diff --git a/lib/librte_eal/linuxapp/eal/eal_debug.c b/lib/librte_eal/linuxapp/eal/eal_debug.c index 907fbfa7..5fbc17c5 100644 --- a/lib/librte_eal/linuxapp/eal/eal_debug.c +++ b/lib/librte_eal/linuxapp/eal/eal_debug.c @@ -77,9 +77,6 @@ void __rte_panic(const char *funcname, const char *format, ...) { va_list ap; - /* disable history */ - rte_log_set_history(0); - rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, "PANIC in %s():\n", funcname); va_start(ap, format); rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap); @@ -98,9 +95,6 @@ rte_exit(int exit_code, const char *format, ...) 
{ va_list ap; - /* disable history */ - rte_log_set_history(0); - if (exit_code != 0) RTE_LOG(CRIT, EAL, "Error - exiting with code: %d\n" " Cause: ", exit_code); diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c index 06b26a9e..47a3b20a 100644 --- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c +++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c @@ -57,10 +57,8 @@ #include <rte_lcore.h> #include <rte_atomic.h> #include <rte_branch_prediction.h> -#include <rte_ring.h> #include <rte_debug.h> #include <rte_log.h> -#include <rte_mempool.h> #include <rte_pci.h> #include <rte_malloc.h> #include <rte_errno.h> @@ -889,7 +887,7 @@ rte_eal_intr_init(void) "eal-intr-thread"); ret_1 = rte_thread_setname(intr_thread, thread_name); if (ret_1 != 0) - RTE_LOG(ERR, EAL, + RTE_LOG(DEBUG, EAL, "Failed to set thread name for interrupt handling\n"); } diff --git a/lib/librte_eal/linuxapp/eal/eal_ivshmem.c b/lib/librte_eal/linuxapp/eal/eal_ivshmem.c index 07aec694..67b3caf2 100644 --- a/lib/librte_eal/linuxapp/eal/eal_ivshmem.c +++ b/lib/librte_eal/linuxapp/eal/eal_ivshmem.c @@ -49,7 +49,6 @@ #include <rte_string_fns.h> #include <rte_errno.h> #include <rte_ring.h> -#include <rte_mempool.h> #include <rte_malloc.h> #include <rte_common.h> #include <rte_ivshmem.h> @@ -184,21 +183,21 @@ overlap(const struct rte_memzone * mz1, const struct rte_memzone * mz2) i_end2 = mz2->ioremap_addr + mz2->len; /* check for overlap in virtual addresses */ - if (start1 > start2 && start1 < end2) + if (start1 >= start2 && start1 < end2) result |= VIRT; if (start2 >= start1 && start2 < end1) result |= VIRT; /* check for overlap in physical addresses */ - if (p_start1 > p_start2 && p_start1 < p_end2) + if (p_start1 >= p_start2 && p_start1 < p_end2) result |= PHYS; - if (p_start2 > p_start1 && p_start2 < p_end1) + if (p_start2 >= p_start1 && p_start2 < p_end1) result |= PHYS; /* check for overlap in ioremap addresses */ - if (i_start1 > i_start2 && i_start1 
< i_end2) + if (i_start1 >= i_start2 && i_start1 < i_end2) result |= IOREMAP; - if (i_start2 > i_start1 && i_start2 < i_end1) + if (i_start2 >= i_start1 && i_start2 < i_end1) result |= IOREMAP; return result; diff --git a/lib/librte_eal/linuxapp/eal/eal_log.c b/lib/librte_eal/linuxapp/eal/eal_log.c index 0b133c3e..d3911004 100644 --- a/lib/librte_eal/linuxapp/eal/eal_log.c +++ b/lib/librte_eal/linuxapp/eal/eal_log.c @@ -50,8 +50,7 @@ #include "eal_private.h" /* - * default log function, used once mempool (hence log history) is - * available + * default log function */ static ssize_t console_log_write(__attribute__((unused)) void *c, const char *buf, size_t size) @@ -60,9 +59,6 @@ console_log_write(__attribute__((unused)) void *c, const char *buf, size_t size) ssize_t ret; uint32_t loglevel; - /* add this log in history */ - rte_log_add_in_history(buf, size); - /* write on stdout */ ret = fwrite(buf, 1, size, stdout); fflush(stdout); @@ -110,8 +106,7 @@ rte_eal_log_init(const char *id, int facility) /* early logs */ /* - * early log function, used during boot when mempool (hence log - * history) is not available + * early log function, used before rte_eal_log_init */ static ssize_t early_log_write(__attribute__((unused)) void *c, const char *buf, size_t size) diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c index 5b9132c6..5578c254 100644 --- a/lib/librte_eal/linuxapp/eal/eal_memory.c +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c @@ -80,6 +80,8 @@ #include <errno.h> #include <sys/ioctl.h> #include <sys/time.h> +#include <signal.h> +#include <setjmp.h> #include <rte_log.h> #include <rte_memory.h> @@ -309,6 +311,22 @@ get_virtual_area(size_t *size, size_t hugepage_sz) return addr; } +static sigjmp_buf huge_jmpenv; + +static void huge_sigbus_handler(int signo __rte_unused) +{ + siglongjmp(huge_jmpenv, 1); +} + +/* Put setjmp into a wrap method to avoid compiling error. 
Any non-volatile, + * non-static local variable in the stack frame calling sigsetjmp might be + * clobbered by a call to longjmp. + */ +static int huge_wrap_sigsetjmp(void) +{ + return sigsetjmp(huge_jmpenv, 1); +} + /* * Mmap all hugepages of hugepage table: it first open a file in * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the @@ -316,7 +334,7 @@ get_virtual_area(size_t *size, size_t hugepage_sz) * in hugepg_tbl[i].final_va. The second mapping (when orig is 0) tries to * map continguous physical blocks in contiguous virtual blocks. */ -static int +static unsigned map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi, int orig) { @@ -394,9 +412,9 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, /* try to create hugepage file */ fd = open(hugepg_tbl[i].filepath, O_CREAT | O_RDWR, 0755); if (fd < 0) { - RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__, + RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__, strerror(errno)); - return -1; + return i; } /* map the segment, and populate page tables, @@ -404,10 +422,10 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, virtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, 0); if (virtaddr == MAP_FAILED) { - RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__, + RTE_LOG(DEBUG, EAL, "%s(): mmap failed: %s\n", __func__, strerror(errno)); close(fd); - return -1; + return i; } if (orig) { @@ -417,12 +435,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, hugepg_tbl[i].final_va = virtaddr; } + if (orig) { + /* In linux, hugetlb limitations, like cgroup, are + * enforced at fault time instead of mmap(), even + * with the option of MAP_POPULATE. Kernel will send + * a SIGBUS signal. To avoid to be killed, save stack + * environment here, if SIGBUS happens, we can jump + * back here. 
+ */ + if (huge_wrap_sigsetjmp()) { + RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more " + "hugepages of size %u MB\n", + (unsigned)(hugepage_sz / 0x100000)); + munmap(virtaddr, hugepage_sz); + close(fd); + unlink(hugepg_tbl[i].filepath); + return i; + } + *(int *)virtaddr = 0; + } + + /* set shared flock on the file. */ if (flock(fd, LOCK_SH | LOCK_NB) == -1) { - RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n", + RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n", __func__, strerror(errno)); close(fd); - return -1; + return i; } close(fd); @@ -430,7 +469,8 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, vma_addr = (char *)vma_addr + hugepage_sz; vma_len -= hugepage_sz; } - return 0; + + return i; } #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS @@ -1036,6 +1076,51 @@ calc_num_pages_per_socket(uint64_t * memory, return total_num_pages; } +static inline size_t +eal_get_hugepage_mem_size(void) +{ + uint64_t size = 0; + unsigned i, j; + + for (i = 0; i < internal_config.num_hugepage_sizes; i++) { + struct hugepage_info *hpi = &internal_config.hugepage_info[i]; + if (hpi->hugedir != NULL) { + for (j = 0; j < RTE_MAX_NUMA_NODES; j++) { + size += hpi->hugepage_sz * hpi->num_pages[j]; + } + } + } + + return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX; +} + +static struct sigaction huge_action_old; +static int huge_need_recover; + +static void +huge_register_sigbus(void) +{ + sigset_t mask; + struct sigaction action; + + sigemptyset(&mask); + sigaddset(&mask, SIGBUS); + action.sa_flags = 0; + action.sa_mask = mask; + action.sa_handler = huge_sigbus_handler; + + huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old); +} + +static void +huge_recover_sigbus(void) +{ + if (huge_need_recover) { + sigaction(SIGBUS, &huge_action_old, NULL); + huge_need_recover = 0; + } +} + /* * Prepare physical memory mapping: fill configuration structure with * these infos, return 0 on success. 
@@ -1122,8 +1207,11 @@ rte_eal_hugepage_init(void) hp_offset = 0; /* where we start the current page size entries */ + huge_register_sigbus(); + /* map all hugepages and sort them */ for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){ + unsigned pages_old, pages_new; struct hugepage_info *hpi; /* @@ -1137,10 +1225,28 @@ rte_eal_hugepage_init(void) continue; /* map all hugepages available */ - if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){ - RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n", - (unsigned)(hpi->hugepage_sz / 0x100000)); + pages_old = hpi->num_pages[0]; + pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1); + if (pages_new < pages_old) { +#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS + RTE_LOG(ERR, EAL, + "%d not %d hugepages of size %u MB allocated\n", + pages_new, pages_old, + (unsigned)(hpi->hugepage_sz / 0x100000)); goto fail; +#else + RTE_LOG(DEBUG, EAL, + "%d not %d hugepages of size %u MB allocated\n", + pages_new, pages_old, + (unsigned)(hpi->hugepage_sz / 0x100000)); + + int pages = pages_old - pages_new; + + nr_hugepages -= pages; + hpi->num_pages[0] = pages_new; + if (pages_new == 0) + continue; +#endif } /* find physical addresses and sockets for each hugepage */ @@ -1172,8 +1278,9 @@ rte_eal_hugepage_init(void) hp_offset += new_pages_count[i]; #else /* remap all hugepages */ - if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) < 0){ - RTE_LOG(DEBUG, EAL, "Failed to remap %u MB pages\n", + if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) != + hpi->num_pages[0]) { + RTE_LOG(ERR, EAL, "Failed to remap %u MB pages\n", (unsigned)(hpi->hugepage_sz / 0x100000)); goto fail; } @@ -1187,6 +1294,11 @@ rte_eal_hugepage_init(void) #endif } + huge_recover_sigbus(); + + if (internal_config.memory == 0 && internal_config.force_sockets == 0) + internal_config.memory = eal_get_hugepage_mem_size(); + #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS nr_hugefiles = 0; for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) { @@ -1373,6 +1485,7 
@@ rte_eal_hugepage_init(void) return 0; fail: + huge_recover_sigbus(); free(tmp_hp); return -1; } @@ -1399,7 +1512,7 @@ int rte_eal_hugepage_attach(void) { const struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - const struct hugepage_file *hp = NULL; + struct hugepage_file *hp = NULL; unsigned num_hp = 0; unsigned i, s = 0; /* s used to track the segment number */ off_t size; @@ -1417,7 +1530,7 @@ rte_eal_hugepage_attach(void) if (internal_config.xen_dom0_support) { #ifdef RTE_LIBRTE_XEN_DOM0 if (rte_xen_dom0_memory_attach() < 0) { - RTE_LOG(ERR, EAL,"Failed to attach memory setments of primay " + RTE_LOG(ERR, EAL, "Failed to attach memory segments of primary " "process\n"); return -1; } @@ -1481,7 +1594,7 @@ rte_eal_hugepage_attach(void) size = getFileSize(fd_hugepage); hp = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd_hugepage, 0); - if (hp == NULL) { + if (hp == MAP_FAILED) { RTE_LOG(ERR, EAL, "Could not mmap %s\n", eal_hugepage_info_path()); goto error; } @@ -1545,12 +1658,19 @@ rte_eal_hugepage_attach(void) s++; } /* unmap the hugepage config file, since we are done using it */ - munmap((void *)(uintptr_t)hp, size); + munmap(hp, size); close(fd_zero); close(fd_hugepage); return 0; error: + s = 0; + while (s < RTE_MAX_MEMSEG && mcfg->memseg[s].len > 0) { + munmap(mcfg->memseg[s].addr, mcfg->memseg[s].len); + s++; + } + if (hp != NULL && hp != MAP_FAILED) + munmap(hp, size); if (fd_zero >= 0) close(fd_zero); if (fd_hugepage >= 0) diff --git a/lib/librte_eal/linuxapp/eal/eal_pci.c b/lib/librte_eal/linuxapp/eal/eal_pci.c index dbf12a84..f9c3efd2 100644 --- a/lib/librte_eal/linuxapp/eal/eal_pci.c +++ b/lib/librte_eal/linuxapp/eal/eal_pci.c @@ -66,8 +66,8 @@ pci_unbind_kernel_driver(struct rte_pci_device *dev) /* open /sys/bus/pci/devices/AAAA:BB:CC.D/driver */ snprintf(filename, sizeof(filename), - SYSFS_PCI_DEVICES "/" PCI_PRI_FMT "/driver/unbind", - loc->domain, loc->bus, loc->devid, loc->function); + "%s/" PCI_PRI_FMT "/driver/unbind", 
pci_get_sysfs_path(), + loc->domain, loc->bus, loc->devid, loc->function); f = fopen(filename, "w"); if (f == NULL) /* device was not bound */ @@ -190,12 +190,13 @@ pci_find_max_end_va(void) return RTE_PTR_ADD(last->addr, last->len); } -/* parse the "resource" sysfs file */ -static int -pci_parse_sysfs_resource(const char *filename, struct rte_pci_device *dev) +/* parse one line of the "resource" sysfs file (note that the 'line' + * string is modified) + */ +int +pci_parse_one_sysfs_resource(char *line, size_t len, uint64_t *phys_addr, + uint64_t *end_addr, uint64_t *flags) { - FILE *f; - char buf[BUFSIZ]; union pci_resource_info { struct { char *phys_addr; @@ -204,6 +205,31 @@ pci_parse_sysfs_resource(const char *filename, struct rte_pci_device *dev) }; char *ptrs[PCI_RESOURCE_FMT_NVAL]; } res_info; + + if (rte_strsplit(line, len, res_info.ptrs, 3, ' ') != 3) { + RTE_LOG(ERR, EAL, + "%s(): bad resource format\n", __func__); + return -1; + } + errno = 0; + *phys_addr = strtoull(res_info.phys_addr, NULL, 16); + *end_addr = strtoull(res_info.end_addr, NULL, 16); + *flags = strtoull(res_info.flags, NULL, 16); + if (errno != 0) { + RTE_LOG(ERR, EAL, + "%s(): bad resource format\n", __func__); + return -1; + } + + return 0; +} + +/* parse the "resource" sysfs file */ +static int +pci_parse_sysfs_resource(const char *filename, struct rte_pci_device *dev) +{ + FILE *f; + char buf[BUFSIZ]; int i; uint64_t phys_addr, end_addr, flags; @@ -220,21 +246,9 @@ pci_parse_sysfs_resource(const char *filename, struct rte_pci_device *dev) "%s(): cannot read resource\n", __func__); goto error; } - - if (rte_strsplit(buf, sizeof(buf), res_info.ptrs, 3, ' ') != 3) { - RTE_LOG(ERR, EAL, - "%s(): bad resource format\n", __func__); + if (pci_parse_one_sysfs_resource(buf, sizeof(buf), &phys_addr, + &end_addr, &flags) < 0) goto error; - } - errno = 0; - phys_addr = strtoull(res_info.phys_addr, NULL, 16); - end_addr = strtoull(res_info.end_addr, NULL, 16); - flags = strtoull(res_info.flags, 
NULL, 16); - if (errno != 0) { - RTE_LOG(ERR, EAL, - "%s(): bad resource format\n", __func__); - goto error; - } if (flags & IORESOURCE_MEM) { dev->mem_resource[i].phys_addr = phys_addr; @@ -306,6 +320,16 @@ pci_scan_one(const char *dirname, uint16_t domain, uint8_t bus, } dev->id.subsystem_device_id = (uint16_t)tmp; + /* get class_id */ + snprintf(filename, sizeof(filename), "%s/class", + dirname); + if (eal_parse_sysfs_value(filename, &tmp) < 0) { + free(dev); + return -1; + } + /* the least 24 bits are valid: class, subclass, program interface */ + dev->id.class_id = (uint32_t)tmp & RTE_CLASS_ANY_ID; + /* get max_vfs */ dev->max_vfs = 0; snprintf(filename, sizeof(filename), "%s/max_vfs", dirname); @@ -453,7 +477,7 @@ rte_eal_pci_scan(void) uint16_t domain; uint8_t bus, devid, function; - dir = opendir(SYSFS_PCI_DEVICES); + dir = opendir(pci_get_sysfs_path()); if (dir == NULL) { RTE_LOG(ERR, EAL, "%s(): opendir failed: %s\n", __func__, strerror(errno)); @@ -468,8 +492,8 @@ rte_eal_pci_scan(void) &bus, &devid, &function) != 0) continue; - snprintf(dirname, sizeof(dirname), "%s/%s", SYSFS_PCI_DEVICES, - e->d_name); + snprintf(dirname, sizeof(dirname), "%s/%s", + pci_get_sysfs_path(), e->d_name); if (pci_scan_one(dirname, domain, bus, devid, function) < 0) goto error; } @@ -481,18 +505,6 @@ error: return -1; } -#ifdef RTE_PCI_CONFIG -/* - * It is deprecated, all its configurations have been moved into - * each PMD respectively. - */ -void -pci_config_space_set(__rte_unused struct rte_pci_device *dev) -{ - RTE_LOG(DEBUG, EAL, "Nothing here, as it is deprecated\n"); -} -#endif - /* Read PCI config space. 
*/ int rte_eal_pci_read_config(const struct rte_pci_device *device, void *buf, size_t len, off_t offset) diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_init.h b/lib/librte_eal/linuxapp/eal/eal_pci_init.h index 7011753d..f72a2548 100644 --- a/lib/librte_eal/linuxapp/eal/eal_pci_init.h +++ b/lib/librte_eal/linuxapp/eal/eal_pci_init.h @@ -36,12 +36,22 @@ #include "eal_vfio.h" +/** IO resource type: */ +#define IORESOURCE_IO 0x00000100 +#define IORESOURCE_MEM 0x00000200 + /* * Helper function to map PCI resources right after hugepages in virtual memory */ extern void *pci_map_addr; void *pci_find_max_end_va(void); +/* parse one line of the "resource" sysfs file (note that the 'line' + * string is modified) + */ +int pci_parse_one_sysfs_resource(char *line, size_t len, uint64_t *phys_addr, + uint64_t *end_addr, uint64_t *flags); + int pci_uio_alloc_resource(struct rte_pci_device *dev, struct mapped_pci_resource **uio_res); void pci_uio_free_resource(struct rte_pci_device *dev, diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c index 068694dc..1786b754 100644 --- a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c +++ b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c @@ -35,6 +35,7 @@ #include <unistd.h> #include <fcntl.h> #include <dirent.h> +#include <inttypes.h> #include <sys/stat.h> #include <sys/mman.h> #include <linux/pci_regs.h> @@ -161,14 +162,14 @@ pci_get_uio_dev(struct rte_pci_device *dev, char *dstbuf, * or uio:uioX */ snprintf(dirname, sizeof(dirname), - SYSFS_PCI_DEVICES "/" PCI_PRI_FMT "/uio", + "%s/" PCI_PRI_FMT "/uio", pci_get_sysfs_path(), loc->domain, loc->bus, loc->devid, loc->function); dir = opendir(dirname); if (dir == NULL) { /* retry with the parent directory */ snprintf(dirname, sizeof(dirname), - SYSFS_PCI_DEVICES "/" PCI_PRI_FMT, + "%s/" PCI_PRI_FMT, pci_get_sysfs_path(), loc->domain, loc->bus, loc->devid, loc->function); dir = opendir(dirname); @@ -309,7 +310,7 @@ pci_uio_map_resource_by_index(struct 
rte_pci_device *dev, int res_idx, struct mapped_pci_resource *uio_res, int map_idx) { int fd; - char devname[PATH_MAX]; /* contains the /dev/uioX */ + char devname[PATH_MAX]; void *mapaddr; struct rte_pci_addr *loc; struct pci_map *maps; @@ -319,7 +320,8 @@ pci_uio_map_resource_by_index(struct rte_pci_device *dev, int res_idx, /* update devname for mmap */ snprintf(devname, sizeof(devname), - SYSFS_PCI_DEVICES "/" PCI_PRI_FMT "/resource%d", + "%s/" PCI_PRI_FMT "/resource%d", + pci_get_sysfs_path(), loc->domain, loc->bus, loc->devid, loc->function, res_idx); @@ -368,11 +370,11 @@ error: return -1; } +#if defined(RTE_ARCH_X86) int pci_uio_ioport_map(struct rte_pci_device *dev, int bar, struct rte_pci_ioport *p) { -#if defined(RTE_ARCH_X86) char dirname[PATH_MAX]; char filename[PATH_MAX]; int uio_num; @@ -411,81 +413,154 @@ pci_uio_ioport_map(struct rte_pci_device *dev, int bar, RTE_LOG(DEBUG, EAL, "PCI Port IO found start=0x%lx\n", start); p->base = start; + p->len = 0; return 0; +} #else - RTE_SET_USED(dev); - RTE_SET_USED(bar); - RTE_SET_USED(p); +int +pci_uio_ioport_map(struct rte_pci_device *dev, int bar, + struct rte_pci_ioport *p) +{ + FILE *f; + char buf[BUFSIZ]; + char filename[PATH_MAX]; + uint64_t phys_addr, end_addr, flags; + int fd, i; + void *addr; + + /* open and read addresses of the corresponding resource in sysfs */ + snprintf(filename, sizeof(filename), "%s/" PCI_PRI_FMT "/resource", + pci_get_sysfs_path(), dev->addr.domain, dev->addr.bus, + dev->addr.devid, dev->addr.function); + f = fopen(filename, "r"); + if (f == NULL) { + RTE_LOG(ERR, EAL, "Cannot open sysfs resource: %s\n", + strerror(errno)); + return -1; + } + for (i = 0; i < bar + 1; i++) { + if (fgets(buf, sizeof(buf), f) == NULL) { + RTE_LOG(ERR, EAL, "Cannot read sysfs resource\n"); + goto error; + } + } + if (pci_parse_one_sysfs_resource(buf, sizeof(buf), &phys_addr, + &end_addr, &flags) < 0) + goto error; + if ((flags & IORESOURCE_IO) == 0) { + RTE_LOG(ERR, EAL, "BAR %d is not an IO 
resource\n", bar); + goto error; + } + snprintf(filename, sizeof(filename), "%s/" PCI_PRI_FMT "/resource%d", + pci_get_sysfs_path(), dev->addr.domain, dev->addr.bus, + dev->addr.devid, dev->addr.function, bar); + + /* mmap the pci resource */ + fd = open(filename, O_RDWR); + if (fd < 0) { + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename, + strerror(errno)); + goto error; + } + addr = mmap(NULL, end_addr + 1, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + close(fd); + if (addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "Cannot mmap IO port resource: %s\n", + strerror(errno)); + goto error; + } + + /* strangely, the base address is mmap addr + phys_addr */ + p->base = (uintptr_t)addr + phys_addr; + p->len = end_addr + 1; + RTE_LOG(DEBUG, EAL, "PCI Port IO found start=0x%"PRIx64"\n", p->base); + fclose(f); + + return 0; + +error: + fclose(f); return -1; -#endif } +#endif void pci_uio_ioport_read(struct rte_pci_ioport *p, void *data, size_t len, off_t offset) { -#if defined(RTE_ARCH_X86) uint8_t *d; int size; - unsigned short reg = p->base + offset; + uintptr_t reg = p->base + offset; for (d = data; len > 0; d += size, reg += size, len -= size) { if (len >= 4) { size = 4; +#if defined(RTE_ARCH_X86) *(uint32_t *)d = inl(reg); +#else + *(uint32_t *)d = *(volatile uint32_t *)reg; +#endif } else if (len >= 2) { size = 2; +#if defined(RTE_ARCH_X86) *(uint16_t *)d = inw(reg); +#else + *(uint16_t *)d = *(volatile uint16_t *)reg; +#endif } else { size = 1; +#if defined(RTE_ARCH_X86) *d = inb(reg); - } - } #else - RTE_SET_USED(p); - RTE_SET_USED(data); - RTE_SET_USED(len); - RTE_SET_USED(offset); + *d = *(volatile uint8_t *)reg; #endif + } + } } void pci_uio_ioport_write(struct rte_pci_ioport *p, const void *data, size_t len, off_t offset) { -#if defined(RTE_ARCH_X86) const uint8_t *s; int size; - unsigned short reg = p->base + offset; + uintptr_t reg = p->base + offset; for (s = data; len > 0; s += size, reg += size, len -= size) { if (len >= 4) { size = 4; +#if 
defined(RTE_ARCH_X86) outl_p(*(const uint32_t *)s, reg); +#else + *(volatile uint32_t *)reg = *(const uint32_t *)s; +#endif } else if (len >= 2) { size = 2; +#if defined(RTE_ARCH_X86) outw_p(*(const uint16_t *)s, reg); +#else + *(volatile uint16_t *)reg = *(const uint16_t *)s; +#endif } else { size = 1; +#if defined(RTE_ARCH_X86) outb_p(*s, reg); - } - } #else - RTE_SET_USED(p); - RTE_SET_USED(data); - RTE_SET_USED(len); - RTE_SET_USED(offset); + *(volatile uint8_t *)reg = *s; #endif + } + } } int pci_uio_ioport_unmap(struct rte_pci_ioport *p) { - RTE_SET_USED(p); #if defined(RTE_ARCH_X86) + RTE_SET_USED(p); /* FIXME close intr fd ? */ return 0; #else - return -1; + return munmap((void *)(uintptr_t)p->base, p->len); #endif } diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c index 10266f8f..f91b9242 100644 --- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c +++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c @@ -602,7 +602,7 @@ pci_vfio_get_group_no(const char *pci_addr, int *iommu_group_no) /* try to find out IOMMU group for this device */ snprintf(linkname, sizeof(linkname), - SYSFS_PCI_DEVICES "/%s/iommu_group", pci_addr); + "%s/%s/iommu_group", pci_get_sysfs_path(), pci_addr); ret = readlink(linkname, filename, sizeof(filename)); diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio_mp_sync.c index d9188fde..d54ded88 100644 --- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio_mp_sync.c +++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio_mp_sync.c @@ -287,7 +287,10 @@ pci_vfio_mp_sync_thread(void __rte_unused * arg) struct linger l; l.l_onoff = 1; l.l_linger = 60; - setsockopt(conn_sock, SOL_SOCKET, SO_LINGER, &l, sizeof(l)); + + if (setsockopt(conn_sock, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) < 0) + RTE_LOG(WARNING, EAL, "Cannot set SO_LINGER option " + "on listen socket (%s)\n", strerror(errno)); ret = vfio_mp_sync_receive_request(conn_sock); @@ -396,7 +399,7 @@ 
pci_vfio_mp_sync_setup(void) snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "pci-vfio-sync"); ret = rte_thread_setname(socket_thread, thread_name); if (ret) - RTE_LOG(ERR, EAL, + RTE_LOG(DEBUG, EAL, "Failed to set thread name for secondary processes!\n"); return 0; diff --git a/lib/librte_eal/linuxapp/eal/eal_thread.c b/lib/librte_eal/linuxapp/eal/eal_thread.c index 18bd8e04..9f88530e 100644 --- a/lib/librte_eal/linuxapp/eal/eal_thread.c +++ b/lib/librte_eal/linuxapp/eal/eal_thread.c @@ -197,3 +197,16 @@ int rte_sys_gettid(void) { return (int)syscall(SYS_gettid); } + +int rte_thread_setname(pthread_t id, const char *name) +{ + int ret = -1; +#if defined(__GLIBC__) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 12) + ret = pthread_setname_np(id, name); +#endif +#endif + RTE_SET_USED(id); + RTE_SET_USED(name); + return ret; +} diff --git a/lib/librte_eal/linuxapp/eal/eal_timer.c b/lib/librte_eal/linuxapp/eal/eal_timer.c index f2abb7b6..afa32f5c 100644 --- a/lib/librte_eal/linuxapp/eal/eal_timer.c +++ b/lib/librte_eal/linuxapp/eal/eal_timer.c @@ -222,8 +222,8 @@ rte_eal_hpet_init(int make_default) snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "hpet-msb-inc"); ret = rte_thread_setname(msb_inc_thread_id, thread_name); if (ret != 0) - RTE_LOG(ERR, EAL, - "ERROR: Cannot set HPET timer thread name!\n"); + RTE_LOG(DEBUG, EAL, + "Cannot set HPET timer thread name!\n"); if (make_default) eal_timer_source = EAL_TIMER_HPET; diff --git a/lib/librte_eal/linuxapp/eal/eal_xen_memory.c b/lib/librte_eal/linuxapp/eal/eal_xen_memory.c index 495eef9e..0b612bb1 100644 --- a/lib/librte_eal/linuxapp/eal/eal_xen_memory.c +++ b/lib/librte_eal/linuxapp/eal/eal_xen_memory.c @@ -156,13 +156,27 @@ get_xen_memory_size(void) * Based on physical address to caculate MFN in Xen Dom0. 
*/ phys_addr_t -rte_xen_mem_phy2mch(uint32_t memseg_id, const phys_addr_t phy_addr) +rte_xen_mem_phy2mch(int32_t memseg_id, const phys_addr_t phy_addr) { - int mfn_id; + int mfn_id, i; uint64_t mfn, mfn_offset; struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; struct rte_memseg *memseg = mcfg->memseg; + /* find the memory segment owning the physical address */ + if (memseg_id == -1) { + for (i = 0; i < RTE_MAX_MEMSEG; i++) { + if ((phy_addr >= memseg[i].phys_addr) && + (phys_addr < memseg[i].phys_addr + + memseg[i].size)) { + memseg_id = i; + break; + } + } + if (memseg_id == -1) + return RTE_BAD_PHYS_ADDR; + } + mfn_id = (phy_addr - memseg[memseg_id].phys_addr) / RTE_PGSIZE_2M; /*the MFN is contiguous in 2M */ diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h index 7e5e5984..2acdfd9b 100644 --- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h @@ -113,7 +113,9 @@ struct rte_kni_mbuf { void *buf_addr __attribute__((__aligned__(RTE_CACHE_LINE_SIZE))); char pad0[10]; uint16_t data_off; /**< Start address of data in segment buffer. */ - char pad1[4]; + char pad1[2]; + uint8_t nb_segs; /**< Number of segments. */ + char pad4[1]; uint64_t ol_flags; /**< Offload features. */ char pad2[4]; uint32_t pkt_len; /**< Total pkt len: sum of all segment data_len. 
*/ diff --git a/lib/librte_eal/linuxapp/eal/rte_eal_version.map b/lib/librte_eal/linuxapp/eal/rte_eal_version.map index 12503efa..05134673 100644 --- a/lib/librte_eal/linuxapp/eal/rte_eal_version.map +++ b/lib/librte_eal/linuxapp/eal/rte_eal_version.map @@ -154,3 +154,13 @@ DPDK_16.04 { rte_eal_primary_proc_alive; } DPDK_2.2; + +DPDK_16.07 { + global: + + pci_get_sysfs_path; + rte_keepalive_mark_sleep; + rte_keepalive_register_relay_callback; + rte_thread_setname; + +} DPDK_16.04; diff --git a/lib/librte_eal/linuxapp/igb_uio/compat.h b/lib/librte_eal/linuxapp/igb_uio/compat.h index c1d45a66..0d781e48 100644 --- a/lib/librte_eal/linuxapp/igb_uio/compat.h +++ b/lib/librte_eal/linuxapp/igb_uio/compat.h @@ -24,6 +24,15 @@ #define PCI_MSIX_ENTRY_CTRL_MASKBIT 1 #endif +/* + * for kernels < 2.6.38 and backported patch that moves MSI-X entry definition + * to pci_regs.h Those kernels has PCI_MSIX_ENTRY_SIZE defined but not + * PCI_MSIX_ENTRY_CTRL_MASKBIT + */ +#ifndef PCI_MSIX_ENTRY_CTRL_MASKBIT +#define PCI_MSIX_ENTRY_CTRL_MASKBIT 1 +#endif + #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 34) && \ (!(defined(RHEL_RELEASE_CODE) && \ RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(5, 9))) diff --git a/lib/librte_eal/linuxapp/igb_uio/igb_uio.c b/lib/librte_eal/linuxapp/igb_uio/igb_uio.c index 72b26923..45a5720e 100644 --- a/lib/librte_eal/linuxapp/igb_uio/igb_uio.c +++ b/lib/librte_eal/linuxapp/igb_uio/igb_uio.c @@ -81,62 +81,10 @@ store_max_vfs(struct device *dev, struct device_attribute *attr, return err ? 
err : count; } -#ifdef RTE_PCI_CONFIG -static ssize_t -show_extended_tag(struct device *dev, struct device_attribute *attr, char *buf) -{ - dev_info(dev, "Deprecated\n"); - - return 0; -} - -static ssize_t -store_extended_tag(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t count) -{ - dev_info(dev, "Deprecated\n"); - - return 0; -} - -static ssize_t -show_max_read_request_size(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - dev_info(dev, "Deprecated\n"); - - return 0; -} - -static ssize_t -store_max_read_request_size(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t count) -{ - dev_info(dev, "Deprecated\n"); - - return 0; -} -#endif - static DEVICE_ATTR(max_vfs, S_IRUGO | S_IWUSR, show_max_vfs, store_max_vfs); -#ifdef RTE_PCI_CONFIG -static DEVICE_ATTR(extended_tag, S_IRUGO | S_IWUSR, show_extended_tag, - store_extended_tag); -static DEVICE_ATTR(max_read_request_size, S_IRUGO | S_IWUSR, - show_max_read_request_size, store_max_read_request_size); -#endif static struct attribute *dev_attrs[] = { &dev_attr_max_vfs.attr, -#ifdef RTE_PCI_CONFIG - &dev_attr_extended_tag.attr, - &dev_attr_max_read_request_size.attr, -#endif NULL, }; diff --git a/lib/librte_eal/linuxapp/kni/Makefile b/lib/librte_eal/linuxapp/kni/Makefile index ac99d3f1..8cc6b61c 100644 --- a/lib/librte_eal/linuxapp/kni/Makefile +++ b/lib/librte_eal/linuxapp/kni/Makefile @@ -47,7 +47,7 @@ MODULE_CFLAGS += -Wall -Werror ifeq ($(shell lsb_release -si 2>/dev/null),Ubuntu) MODULE_CFLAGS += -DUBUNTU_RELEASE_CODE=$(shell lsb_release -sr | tr -d .) 
UBUNTU_KERNEL_CODE := $(shell echo `grep UTS_RELEASE $(RTE_KERNELDIR)/include/generated/utsrelease.h \ - | cut -d '"' -f2 | cut -d- -f1,2 | tr .- $(comma)`,1) + | cut -d '"' -f2 | cut -d- -f1,2 | tr .- ,`,1) MODULE_CFLAGS += -D"UBUNTU_KERNEL_CODE=UBUNTU_KERNEL_VERSION($(UBUNTU_KERNEL_CODE))" endif diff --git a/lib/librte_eal/linuxapp/kni/compat.h b/lib/librte_eal/linuxapp/kni/compat.h index cf100b67..647ba3ce 100644 --- a/lib/librte_eal/linuxapp/kni/compat.h +++ b/lib/librte_eal/linuxapp/kni/compat.h @@ -14,16 +14,27 @@ #endif /* < 2.6.39 */ -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,35) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 33) +#define HAVE_SIMPLIFIED_PERNET_OPERATIONS +#endif +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 35) #define sk_sleep(s) (s)->sk_sleep +#endif -#endif /* < 2.6.35 */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 9, 0) +#define HAVE_CHANGE_CARRIER_CB +#endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,19,0) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) #define HAVE_IOV_ITER_MSGHDR #endif -#if ( LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0) ) +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 1, 0) #define HAVE_KIOCB_MSG_PARAM -#endif /* < 4.1.0 */ +#define HAVE_REBUILD_HEADER +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0) +#define HAVE_TRANS_START_HELPER +#endif diff --git a/lib/librte_eal/linuxapp/kni/ethtool/igb/e1000_phy.c b/lib/librte_eal/linuxapp/kni/ethtool/igb/e1000_phy.c index df224702..140a2a47 100644 --- a/lib/librte_eal/linuxapp/kni/ethtool/igb/e1000_phy.c +++ b/lib/librte_eal/linuxapp/kni/ethtool/igb/e1000_phy.c @@ -3300,12 +3300,13 @@ s32 e1000_read_phy_reg_mphy(struct e1000_hw *hw, u32 address, u32 *data) *data = E1000_READ_REG(hw, E1000_MPHY_DATA); /* Disable access to mPHY if it was originally disabled */ - if (locked) + if (locked) { ready = e1000_is_mphy_ready(hw); if (!ready) return -E1000_ERR_PHY; E1000_WRITE_REG(hw, E1000_MPHY_ADDR_CTRL, E1000_MPHY_DIS_ACCESS); + } return E1000_SUCCESS; } @@ 
-3365,12 +3366,13 @@ s32 e1000_write_phy_reg_mphy(struct e1000_hw *hw, u32 address, u32 data, E1000_WRITE_REG(hw, E1000_MPHY_DATA, data); /* Disable access to mPHY if it was originally disabled */ - if (locked) + if (locked) { ready = e1000_is_mphy_ready(hw); if (!ready) return -E1000_ERR_PHY; E1000_WRITE_REG(hw, E1000_MPHY_ADDR_CTRL, E1000_MPHY_DIS_ACCESS); + } return E1000_SUCCESS; } diff --git a/lib/librte_eal/linuxapp/kni/ethtool/ixgbe/ixgbe_82599.c b/lib/librte_eal/linuxapp/kni/ethtool/ixgbe/ixgbe_82599.c index 017dfe16..c6f4130d 100644 --- a/lib/librte_eal/linuxapp/kni/ethtool/ixgbe/ixgbe_82599.c +++ b/lib/librte_eal/linuxapp/kni/ethtool/ixgbe/ixgbe_82599.c @@ -867,12 +867,13 @@ s32 ixgbe_setup_mac_link_82599(struct ixgbe_hw *hw, link_mode == IXGBE_AUTOC_LMS_KX4_KX_KR_SGMII) { /* Set KX4/KX/KR support according to speed requested */ autoc &= ~(IXGBE_AUTOC_KX4_KX_SUPP_MASK | IXGBE_AUTOC_KR_SUPP); - if (speed & IXGBE_LINK_SPEED_10GB_FULL) + if (speed & IXGBE_LINK_SPEED_10GB_FULL) { if (orig_autoc & IXGBE_AUTOC_KX4_SUPP) autoc |= IXGBE_AUTOC_KX4_SUPP; if ((orig_autoc & IXGBE_AUTOC_KR_SUPP) && (hw->phy.smart_speed_active == false)) autoc |= IXGBE_AUTOC_KR_SUPP; + } if (speed & IXGBE_LINK_SPEED_1GB_FULL) autoc |= IXGBE_AUTOC_KX_SUPP; } else if ((pma_pmd_1g == IXGBE_AUTOC_1G_SFI) && diff --git a/lib/librte_eal/linuxapp/kni/ethtool/ixgbe/ixgbe_main.c b/lib/librte_eal/linuxapp/kni/ethtool/ixgbe/ixgbe_main.c index 8c1d2fe3..92fc9fc7 100644 --- a/lib/librte_eal/linuxapp/kni/ethtool/ixgbe/ixgbe_main.c +++ b/lib/librte_eal/linuxapp/kni/ethtool/ixgbe/ixgbe_main.c @@ -59,8 +59,6 @@ #undef CONFIG_DCA_MODULE char ixgbe_driver_name[] = "ixgbe"; -static const char ixgbe_driver_string[] = - "Intel(R) 10 Gigabit PCI Express Network Driver"; #define DRV_HW_PERF #ifndef CONFIG_IXGBE_NAPI @@ -79,8 +77,6 @@ static const char ixgbe_driver_string[] = #define DRV_VERSION __stringify(MAJ) "." __stringify(MIN) "." 
\ __stringify(BUILD) DRIVERNAPI DRV_HW_PERF FPGA VMDQ_TAG const char ixgbe_driver_version[] = DRV_VERSION; -static const char ixgbe_copyright[] = - "Copyright (c) 1999-2012 Intel Corporation."; /* ixgbe_pci_tbl - PCI Device ID Table * diff --git a/lib/librte_eal/linuxapp/kni/kni_misc.c b/lib/librte_eal/linuxapp/kni/kni_misc.c index ae8133f3..59d15ca6 100644 --- a/lib/librte_eal/linuxapp/kni/kni_misc.c +++ b/lib/librte_eal/linuxapp/kni/kni_misc.c @@ -26,6 +26,7 @@ #include <linux/module.h> #include <linux/miscdevice.h> #include <linux/netdevice.h> +#include <linux/etherdevice.h> #include <linux/pci.h> #include <linux/kthread.h> #include <linux/rwsem.h> @@ -34,6 +35,8 @@ #include <net/netns/generic.h> #include <exec-env/rte_kni_common.h> + +#include "compat.h" #include "kni_dev.h" MODULE_LICENSE("Dual BSD/GPL"); @@ -104,7 +107,7 @@ struct kni_net { static int __net_init kni_init_net(struct net *net) { -#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 32) +#ifdef HAVE_SIMPLIFIED_PERNET_OPERATIONS struct kni_net *knet = net_generic(net, kni_net_id); #else struct kni_net *knet; @@ -115,7 +118,7 @@ static int __net_init kni_init_net(struct net *net) ret = -ENOMEM; return ret; } -#endif /* LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 32) */ +#endif /* Clear the bit of device in use */ clear_bit(KNI_DEV_IN_USE_BIT_NUM, &knet->device_in_use); @@ -123,7 +126,7 @@ static int __net_init kni_init_net(struct net *net) init_rwsem(&knet->kni_list_lock); INIT_LIST_HEAD(&knet->kni_list_head); -#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 32) +#ifdef HAVE_SIMPLIFIED_PERNET_OPERATIONS return 0; #else ret = net_assign_generic(net, kni_net_id, knet); @@ -131,25 +134,25 @@ static int __net_init kni_init_net(struct net *net) kfree(knet); return ret; -#endif /* LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 32) */ +#endif } static void __net_exit kni_exit_net(struct net *net) { -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 32) +#ifndef HAVE_SIMPLIFIED_PERNET_OPERATIONS struct kni_net *knet = 
net_generic(net, kni_net_id); kfree(knet); -#endif /* LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 32) */ +#endif } static struct pernet_operations kni_net_ops = { .init = kni_init_net, .exit = kni_exit_net, -#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 32) +#ifdef HAVE_SIMPLIFIED_PERNET_OPERATIONS .id = &kni_net_id, .size = sizeof(struct kni_net), -#endif /* LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 32) */ +#endif }; static int __init @@ -164,11 +167,11 @@ kni_init(void) return -EINVAL; } -#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 32) +#ifdef HAVE_SIMPLIFIED_PERNET_OPERATIONS rc = register_pernet_subsys(&kni_net_ops); #else rc = register_pernet_gen_subsys(&kni_net_id, &kni_net_ops); -#endif /* LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 32) */ +#endif if (rc) return -EPERM; @@ -186,11 +189,11 @@ kni_init(void) return 0; out: -#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 32) +#ifdef HAVE_SIMPLIFIED_PERNET_OPERATIONS unregister_pernet_subsys(&kni_net_ops); #else register_pernet_gen_subsys(&kni_net_id, &kni_net_ops); -#endif /* LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 32) */ +#endif return rc; } @@ -198,11 +201,11 @@ static void __exit kni_exit(void) { misc_deregister(&kni_misc); -#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 32) +#ifdef HAVE_SIMPLIFIED_PERNET_OPERATIONS unregister_pernet_subsys(&kni_net_ops); #else register_pernet_gen_subsys(&kni_net_id, &kni_net_ops); -#endif /* LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 32) */ +#endif KNI_PRINT("####### DPDK kni module unloaded #######\n"); } @@ -542,6 +545,15 @@ kni_ioctl_create(struct net *net, if (pci) pci_dev_put(pci); + if (kni->lad_dev) + memcpy(net_dev->dev_addr, kni->lad_dev->dev_addr, ETH_ALEN); + else + /* + * Generate random mac address. eth_random_addr() is the newer + * version of generating mac address in linux kernel. 
+ */ + random_ether_addr(net_dev->dev_addr); + ret = register_netdev(net_dev); if (ret) { KNI_ERR("error %i registering device \"%s\"\n", diff --git a/lib/librte_eal/linuxapp/kni/kni_net.c b/lib/librte_eal/linuxapp/kni/kni_net.c index cfa83398..fc82193a 100644 --- a/lib/librte_eal/linuxapp/kni/kni_net.c +++ b/lib/librte_eal/linuxapp/kni/kni_net.c @@ -38,6 +38,8 @@ #include <exec-env/rte_kni_common.h> #include <kni_fifo.h> + +#include "compat.h" #include "kni_dev.h" #define WD_TIMEOUT 5 /*jiffies */ @@ -69,15 +71,6 @@ kni_net_open(struct net_device *dev) struct rte_kni_request req; struct kni_dev *kni = netdev_priv(dev); - if (kni->lad_dev) - memcpy(dev->dev_addr, kni->lad_dev->dev_addr, ETH_ALEN); - else - /* - * Generate random mac address. eth_random_addr() is the newer - * version of generating mac address in linux kernel. - */ - random_ether_addr(dev->dev_addr); - netif_start_queue(dev); memset(&req, 0, sizeof(req)); @@ -156,7 +149,8 @@ kni_net_rx_normal(struct kni_dev *kni) /* Transfer received packets to netif */ for (i = 0; i < num_rx; i++) { kva = (void *)va[i] - kni->mbuf_va + kni->mbuf_kva; - len = kva->data_len; + len = kva->pkt_len; + data_kva = kva->buf_addr + kva->data_off - kni->mbuf_va + kni->mbuf_kva; @@ -165,22 +159,41 @@ kni_net_rx_normal(struct kni_dev *kni) KNI_ERR("Out of mem, dropping pkts\n"); /* Update statistics */ kni->stats.rx_dropped++; + continue; } - else { - /* Align IP on 16B boundary */ - skb_reserve(skb, 2); + + /* Align IP on 16B boundary */ + skb_reserve(skb, 2); + + if (kva->nb_segs == 1) { memcpy(skb_put(skb, len), data_kva, len); - skb->dev = dev; - skb->protocol = eth_type_trans(skb, dev); - skb->ip_summed = CHECKSUM_UNNECESSARY; + } else { + int nb_segs; + int kva_nb_segs = kva->nb_segs; - /* Call netif interface */ - netif_rx_ni(skb); + for (nb_segs = 0; nb_segs < kva_nb_segs; nb_segs++) { + memcpy(skb_put(skb, kva->data_len), + data_kva, kva->data_len); - /* Update statistics */ - kni->stats.rx_bytes += len; - 
kni->stats.rx_packets++; + if (!kva->next) + break; + + kva = kva->next - kni->mbuf_va + kni->mbuf_kva; + data_kva = kva->buf_addr + kva->data_off + - kni->mbuf_va + kni->mbuf_kva; + } } + + skb->dev = dev; + skb->protocol = eth_type_trans(skb, dev); + skb->ip_summed = CHECKSUM_UNNECESSARY; + + /* Call netif interface */ + netif_rx_ni(skb); + + /* Update statistics */ + kni->stats.rx_bytes += len; + kni->stats.rx_packets++; } /* Burst enqueue mbufs into free_q */ @@ -317,7 +330,7 @@ kni_net_rx_lo_fifo_skb(struct kni_dev *kni) /* Copy mbufs to sk buffer and then call tx interface */ for (i = 0; i < num; i++) { kva = (void *)va[i] - kni->mbuf_va + kni->mbuf_kva; - len = kva->data_len; + len = kva->pkt_len; data_kva = kva->buf_addr + kva->data_off - kni->mbuf_va + kni->mbuf_kva; @@ -338,20 +351,39 @@ kni_net_rx_lo_fifo_skb(struct kni_dev *kni) if (skb == NULL) { KNI_ERR("Out of mem, dropping pkts\n"); kni->stats.rx_dropped++; + continue; } - else { - /* Align IP on 16B boundary */ - skb_reserve(skb, 2); + + /* Align IP on 16B boundary */ + skb_reserve(skb, 2); + + if (kva->nb_segs == 1) { memcpy(skb_put(skb, len), data_kva, len); - skb->dev = dev; - skb->ip_summed = CHECKSUM_UNNECESSARY; + } else { + int nb_segs; + int kva_nb_segs = kva->nb_segs; - kni->stats.rx_bytes += len; - kni->stats.rx_packets++; + for (nb_segs = 0; nb_segs < kva_nb_segs; nb_segs++) { + memcpy(skb_put(skb, kva->data_len), + data_kva, kva->data_len); - /* call tx interface */ - kni_net_tx(skb, dev); + if (!kva->next) + break; + + kva = kva->next - kni->mbuf_va + kni->mbuf_kva; + data_kva = kva->buf_addr + kva->data_off + - kni->mbuf_va + kni->mbuf_kva; + } } + + skb->dev = dev; + skb->ip_summed = CHECKSUM_UNNECESSARY; + + kni->stats.rx_bytes += len; + kni->stats.rx_packets++; + + /* call tx interface */ + kni_net_tx(skb, dev); } /* enqueue all the mbufs from rx_q into free_q */ @@ -396,7 +428,12 @@ kni_net_tx(struct sk_buff *skb, struct net_device *dev) struct rte_kni_mbuf *pkt_kva = NULL; struct 
rte_kni_mbuf *pkt_va = NULL; - dev->trans_start = jiffies; /* save the timestamp */ + /* save the timestamp */ +#ifdef HAVE_TRANS_START_HELPER + netif_trans_update(dev); +#else + dev->trans_start = jiffies; +#endif /* Check if the length of skb is less than mbuf size */ if (skb->len > kni->mbuf_size) @@ -604,7 +641,7 @@ kni_net_header(struct sk_buff *skb, struct net_device *dev, /* * Re-fill the eth header */ -#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 1, 0)) +#ifdef HAVE_REBUILD_HEADER static int kni_net_rebuild_header(struct sk_buff *skb) { @@ -634,7 +671,7 @@ static int kni_net_set_mac(struct net_device *netdev, void *p) return 0; } -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 9, 0)) +#ifdef HAVE_CHANGE_CARRIER_CB static int kni_net_change_carrier(struct net_device *dev, bool new_carrier) { if (new_carrier) @@ -647,7 +684,7 @@ static int kni_net_change_carrier(struct net_device *dev, bool new_carrier) static const struct header_ops kni_net_header_ops = { .create = kni_net_header, -#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 1, 0)) +#ifdef HAVE_REBUILD_HEADER .rebuild = kni_net_rebuild_header, #endif /* < 4.1.0 */ .cache = NULL, /* disable caching */ @@ -664,7 +701,7 @@ static const struct net_device_ops kni_net_netdev_ops = { .ndo_get_stats = kni_net_stats, .ndo_tx_timeout = kni_net_tx_timeout, .ndo_set_mac_address = kni_net_set_mac, -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 9, 0)) +#ifdef HAVE_CHANGE_CARRIER_CB .ndo_change_carrier = kni_net_change_carrier, #endif }; diff --git a/lib/librte_ether/Makefile b/lib/librte_ether/Makefile index e8102846..0bb5dc90 100644 --- a/lib/librte_ether/Makefile +++ b/lib/librte_ether/Makefile @@ -41,7 +41,7 @@ CFLAGS += $(WERROR_FLAGS) EXPORT_MAP := rte_ether_version.map -LIBABIVER := 3 +LIBABIVER := 4 SRCS-y += rte_ethdev.c diff --git a/lib/librte_ether/rte_eth_ctrl.h b/lib/librte_ether/rte_eth_ctrl.h index b8c7be90..c3a2c9e4 100644 --- a/lib/librte_ether/rte_eth_ctrl.h +++ b/lib/librte_ether/rte_eth_ctrl.h @@ -74,7 
+74,12 @@ extern "C" { #define RTE_ETH_FLOW_IPV6_EX 15 #define RTE_ETH_FLOW_IPV6_TCP_EX 16 #define RTE_ETH_FLOW_IPV6_UDP_EX 17 -#define RTE_ETH_FLOW_MAX 18 +#define RTE_ETH_FLOW_PORT 18 + /**< Consider device port number as a flow differentiator */ +#define RTE_ETH_FLOW_VXLAN 19 /**< VXLAN protocol based flow */ +#define RTE_ETH_FLOW_GENEVE 20 /**< GENEVE protocol based flow */ +#define RTE_ETH_FLOW_NVGRE 21 /**< NVGRE protocol based flow */ +#define RTE_ETH_FLOW_MAX 22 /** * Feature filter types diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c index a31018e8..eac260f1 100644 --- a/lib/librte_ether/rte_ethdev.c +++ b/lib/librte_ether/rte_ethdev.c @@ -77,6 +77,12 @@ static uint8_t nb_ports; /* spinlock for eth device callbacks */ static rte_spinlock_t rte_eth_dev_cb_lock = RTE_SPINLOCK_INITIALIZER; +/* spinlock for add/remove rx callbacks */ +static rte_spinlock_t rte_eth_rx_cb_lock = RTE_SPINLOCK_INITIALIZER; + +/* spinlock for add/remove tx callbacks */ +static rte_spinlock_t rte_eth_tx_cb_lock = RTE_SPINLOCK_INITIALIZER; + /* store statistics names and its offset in stats structure */ struct rte_eth_xstats_name_off { char name[RTE_ETH_XSTATS_NAME_SIZE]; @@ -369,8 +375,7 @@ rte_eth_dev_is_valid_port(uint8_t port_id) int rte_eth_dev_socket_id(uint8_t port_id) { - if (!rte_eth_dev_is_valid_port(port_id)) - return -1; + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -1); return rte_eth_devices[port_id].data->numa_node; } @@ -383,8 +388,7 @@ rte_eth_dev_count(void) static enum rte_eth_dev_type rte_eth_dev_get_device_type(uint8_t port_id) { - if (!rte_eth_dev_is_valid_port(port_id)) - return RTE_ETH_DEV_UNKNOWN; + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, RTE_ETH_DEV_UNKNOWN); return rte_eth_devices[port_id].dev_type; } @@ -402,7 +406,7 @@ rte_eth_dev_get_addr_by_port(uint8_t port_id, struct rte_pci_addr *addr) return 0; } -static int +int rte_eth_dev_get_name_by_port(uint8_t port_id, char *name) { char *tmp; @@ -421,7 +425,7 @@ 
rte_eth_dev_get_name_by_port(uint8_t port_id, char *name) return 0; } -static int +int rte_eth_dev_get_port_by_name(const char *name, uint8_t *port_id) { int i; @@ -479,10 +483,7 @@ rte_eth_dev_is_detachable(uint8_t port_id) { uint32_t dev_flags; - if (!rte_eth_dev_is_valid_port(port_id)) { - RTE_PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id); - return -EINVAL; - } + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL); switch (rte_eth_devices[port_id].data->kdrv) { case RTE_KDRV_IGB_UIO: @@ -1507,9 +1508,85 @@ rte_eth_stats_reset(uint8_t port_id) dev->data->rx_mbuf_alloc_failed = 0; } +static int +get_xstats_count(uint8_t port_id) +{ + struct rte_eth_dev *dev; + int count; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL); + dev = &rte_eth_devices[port_id]; + if (dev->dev_ops->xstats_get_names != NULL) { + count = (*dev->dev_ops->xstats_get_names)(dev, NULL, 0); + if (count < 0) + return count; + } else + count = 0; + count += RTE_NB_STATS; + count += dev->data->nb_rx_queues * RTE_NB_RXQ_STATS; + count += dev->data->nb_tx_queues * RTE_NB_TXQ_STATS; + return count; +} + +int +rte_eth_xstats_get_names(uint8_t port_id, + struct rte_eth_xstat_name *xstats_names, + unsigned size) +{ + struct rte_eth_dev *dev; + int cnt_used_entries; + int cnt_expected_entries; + uint32_t idx, id_queue; + + cnt_expected_entries = get_xstats_count(port_id); + if (xstats_names == NULL || cnt_expected_entries < 0 || + (int)size < cnt_expected_entries) + return cnt_expected_entries; + + /* port_id checked in get_xstats_count() */ + dev = &rte_eth_devices[port_id]; + if (dev->dev_ops->xstats_get_names != NULL) { + cnt_used_entries = (*dev->dev_ops->xstats_get_names)( + dev, xstats_names, size); + if (cnt_used_entries < 0) + return cnt_used_entries; + } else + /* Driver itself does not support extended stats, but + * still have basic stats. 
+ */ + cnt_used_entries = 0; + + for (idx = 0; idx < RTE_NB_STATS; idx++) { + snprintf(xstats_names[cnt_used_entries].name, + sizeof(xstats_names[0].name), + "%s", rte_stats_strings[idx].name); + cnt_used_entries++; + } + for (id_queue = 0; id_queue < dev->data->nb_rx_queues; id_queue++) { + for (idx = 0; idx < RTE_NB_RXQ_STATS; idx++) { + snprintf(xstats_names[cnt_used_entries].name, + sizeof(xstats_names[0].name), + "rx_q%u%s", + id_queue, rte_rxq_stats_strings[idx].name); + cnt_used_entries++; + } + + } + for (id_queue = 0; id_queue < dev->data->nb_tx_queues; id_queue++) { + for (idx = 0; idx < RTE_NB_TXQ_STATS; idx++) { + snprintf(xstats_names[cnt_used_entries].name, + sizeof(xstats_names[0].name), + "tx_q%u%s", + id_queue, rte_txq_stats_strings[idx].name); + cnt_used_entries++; + } + } + return cnt_used_entries; +} + /* retrieve ethdev extended statistics */ int -rte_eth_xstats_get(uint8_t port_id, struct rte_eth_xstats *xstats, +rte_eth_xstats_get(uint8_t port_id, struct rte_eth_xstat *xstats, unsigned n) { struct rte_eth_stats eth_stats; @@ -1551,8 +1628,7 @@ rte_eth_xstats_get(uint8_t port_id, struct rte_eth_xstats *xstats, stats_ptr = RTE_PTR_ADD(ð_stats, rte_stats_strings[i].offset); val = *stats_ptr; - snprintf(xstats[count].name, sizeof(xstats[count].name), - "%s", rte_stats_strings[i].name); + xstats[count].id = count + xcount; xstats[count++].value = val; } @@ -1563,9 +1639,7 @@ rte_eth_xstats_get(uint8_t port_id, struct rte_eth_xstats *xstats, rte_rxq_stats_strings[i].offset + q * sizeof(uint64_t)); val = *stats_ptr; - snprintf(xstats[count].name, sizeof(xstats[count].name), - "rx_q%u_%s", q, - rte_rxq_stats_strings[i].name); + xstats[count].id = count + xcount; xstats[count++].value = val; } } @@ -1577,9 +1651,7 @@ rte_eth_xstats_get(uint8_t port_id, struct rte_eth_xstats *xstats, rte_txq_stats_strings[i].offset + q * sizeof(uint64_t)); val = *stats_ptr; - snprintf(xstats[count].name, sizeof(xstats[count].name), - "tx_q%u_%s", q, - 
rte_txq_stats_strings[i].name); + xstats[count].id = count + xcount; xstats[count++].value = val; } } @@ -1639,7 +1711,6 @@ rte_eth_dev_set_rx_queue_stats_mapping(uint8_t port_id, uint16_t rx_queue_id, STAT_QMAP_RX); } - void rte_eth_dev_info_get(uint8_t port_id, struct rte_eth_dev_info *dev_info) { @@ -1661,6 +1732,8 @@ rte_eth_dev_info_get(uint8_t port_id, struct rte_eth_dev_info *dev_info) (*dev->dev_ops->dev_infos_get)(dev, dev_info); dev_info->pci_dev = dev->pci_dev; dev_info->driver_name = dev->data->drv_name; + dev_info->nb_rx_queues = dev->data->nb_rx_queues; + dev_info->nb_tx_queues = dev->data->nb_tx_queues; } int @@ -1994,10 +2067,7 @@ rte_eth_dev_rss_reta_query(uint8_t port_id, struct rte_eth_dev *dev; int ret; - if (port_id >= nb_ports) { - RTE_PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id); - return -ENODEV; - } + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); /* Check mask bits */ ret = rte_eth_check_reta_mask(reta_conf, reta_size); @@ -2641,10 +2711,7 @@ rte_eth_dev_rx_intr_ctl(uint8_t port_id, int epfd, int op, void *data) uint16_t qid; int rc; - if (!rte_eth_dev_is_valid_port(port_id)) { - RTE_PMD_DEBUG_TRACE("Invalid port_id=%u\n", port_id); - return -ENODEV; - } + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); dev = &rte_eth_devices[port_id]; intr_handle = &dev->pci_dev->intr_handle; @@ -2699,10 +2766,7 @@ rte_eth_dev_rx_intr_ctl_q(uint8_t port_id, uint16_t queue_id, struct rte_intr_handle *intr_handle; int rc; - if (!rte_eth_dev_is_valid_port(port_id)) { - RTE_PMD_DEBUG_TRACE("Invalid port_id=%u\n", port_id); - return -ENODEV; - } + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); dev = &rte_eth_devices[port_id]; if (queue_id >= dev->data->nb_rx_queues) { @@ -2734,10 +2798,7 @@ rte_eth_dev_rx_intr_enable(uint8_t port_id, { struct rte_eth_dev *dev; - if (!rte_eth_dev_is_valid_port(port_id)) { - RTE_PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id); - return -ENODEV; - } + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); dev = 
&rte_eth_devices[port_id]; @@ -2751,10 +2812,7 @@ rte_eth_dev_rx_intr_disable(uint8_t port_id, { struct rte_eth_dev *dev; - if (!rte_eth_dev_is_valid_port(port_id)) { - RTE_PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id); - return -ENODEV; - } + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); dev = &rte_eth_devices[port_id]; @@ -2925,7 +2983,6 @@ rte_eth_add_rx_callback(uint8_t port_id, uint16_t queue_id, rte_errno = EINVAL; return NULL; } - struct rte_eth_rxtx_callback *cb = rte_zmalloc(NULL, sizeof(*cb), 0); if (cb == NULL) { @@ -2936,6 +2993,7 @@ rte_eth_add_rx_callback(uint8_t port_id, uint16_t queue_id, cb->fn.rx = fn; cb->param = user_param; + rte_spinlock_lock(&rte_eth_rx_cb_lock); /* Add the callbacks in fifo order. */ struct rte_eth_rxtx_callback *tail = rte_eth_devices[port_id].post_rx_burst_cbs[queue_id]; @@ -2948,6 +3006,42 @@ rte_eth_add_rx_callback(uint8_t port_id, uint16_t queue_id, tail = tail->next; tail->next = cb; } + rte_spinlock_unlock(&rte_eth_rx_cb_lock); + + return cb; +} + +void * +rte_eth_add_first_rx_callback(uint8_t port_id, uint16_t queue_id, + rte_rx_callback_fn fn, void *user_param) +{ +#ifndef RTE_ETHDEV_RXTX_CALLBACKS + rte_errno = ENOTSUP; + return NULL; +#endif + /* check input parameters */ + if (!rte_eth_dev_is_valid_port(port_id) || fn == NULL || + queue_id >= rte_eth_devices[port_id].data->nb_rx_queues) { + rte_errno = EINVAL; + return NULL; + } + + struct rte_eth_rxtx_callback *cb = rte_zmalloc(NULL, sizeof(*cb), 0); + + if (cb == NULL) { + rte_errno = ENOMEM; + return NULL; + } + + cb->fn.rx = fn; + cb->param = user_param; + + rte_spinlock_lock(&rte_eth_rx_cb_lock); + /* Add the callbacks at fisrt position*/ + cb->next = rte_eth_devices[port_id].post_rx_burst_cbs[queue_id]; + rte_smp_wmb(); + rte_eth_devices[port_id].post_rx_burst_cbs[queue_id] = cb; + rte_spinlock_unlock(&rte_eth_rx_cb_lock); return cb; } @@ -2977,6 +3071,7 @@ rte_eth_add_tx_callback(uint8_t port_id, uint16_t queue_id, cb->fn.tx = fn; cb->param = 
user_param; + rte_spinlock_lock(&rte_eth_tx_cb_lock); /* Add the callbacks in fifo order. */ struct rte_eth_rxtx_callback *tail = rte_eth_devices[port_id].pre_tx_burst_cbs[queue_id]; @@ -2989,6 +3084,7 @@ rte_eth_add_tx_callback(uint8_t port_id, uint16_t queue_id, tail = tail->next; tail->next = cb; } + rte_spinlock_unlock(&rte_eth_tx_cb_lock); return cb; } @@ -3001,35 +3097,30 @@ rte_eth_remove_rx_callback(uint8_t port_id, uint16_t queue_id, return -ENOTSUP; #endif /* Check input parameters. */ - if (!rte_eth_dev_is_valid_port(port_id) || user_cb == NULL || - queue_id >= rte_eth_devices[port_id].data->nb_rx_queues) { + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL); + if (user_cb == NULL || + queue_id >= rte_eth_devices[port_id].data->nb_rx_queues) return -EINVAL; - } struct rte_eth_dev *dev = &rte_eth_devices[port_id]; - struct rte_eth_rxtx_callback *cb = dev->post_rx_burst_cbs[queue_id]; - struct rte_eth_rxtx_callback *prev_cb; - - /* Reset head pointer and remove user cb if first in the list. */ - if (cb == user_cb) { - dev->post_rx_burst_cbs[queue_id] = user_cb->next; - return 0; - } - - /* Remove the user cb from the callback list. */ - do { - prev_cb = cb; - cb = cb->next; - + struct rte_eth_rxtx_callback *cb; + struct rte_eth_rxtx_callback **prev_cb; + int ret = -EINVAL; + + rte_spinlock_lock(&rte_eth_rx_cb_lock); + prev_cb = &dev->post_rx_burst_cbs[queue_id]; + for (; *prev_cb != NULL; prev_cb = &cb->next) { + cb = *prev_cb; if (cb == user_cb) { - prev_cb->next = user_cb->next; - return 0; + /* Remove the user cb from the callback list. */ + *prev_cb = cb->next; + ret = 0; + break; } + } + rte_spinlock_unlock(&rte_eth_rx_cb_lock); - } while (cb != NULL); - - /* Callback wasn't found. */ - return -EINVAL; + return ret; } int @@ -3040,35 +3131,30 @@ rte_eth_remove_tx_callback(uint8_t port_id, uint16_t queue_id, return -ENOTSUP; #endif /* Check input parameters. 
*/ - if (!rte_eth_dev_is_valid_port(port_id) || user_cb == NULL || - queue_id >= rte_eth_devices[port_id].data->nb_tx_queues) { + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL); + if (user_cb == NULL || + queue_id >= rte_eth_devices[port_id].data->nb_tx_queues) return -EINVAL; - } struct rte_eth_dev *dev = &rte_eth_devices[port_id]; - struct rte_eth_rxtx_callback *cb = dev->pre_tx_burst_cbs[queue_id]; - struct rte_eth_rxtx_callback *prev_cb; - - /* Reset head pointer and remove user cb if first in the list. */ - if (cb == user_cb) { - dev->pre_tx_burst_cbs[queue_id] = user_cb->next; - return 0; - } - - /* Remove the user cb from the callback list. */ - do { - prev_cb = cb; - cb = cb->next; - + int ret = -EINVAL; + struct rte_eth_rxtx_callback *cb; + struct rte_eth_rxtx_callback **prev_cb; + + rte_spinlock_lock(&rte_eth_tx_cb_lock); + prev_cb = &dev->pre_tx_burst_cbs[queue_id]; + for (; *prev_cb != NULL; prev_cb = &cb->next) { + cb = *prev_cb; if (cb == user_cb) { - prev_cb->next = user_cb->next; - return 0; + /* Remove the user cb from the callback list. */ + *prev_cb = cb->next; + ret = 0; + break; } + } + rte_spinlock_unlock(&rte_eth_tx_cb_lock); - } while (cb != NULL); - - /* Callback wasn't found. 
*/ - return -EINVAL; + return ret; } int @@ -3284,10 +3370,7 @@ rte_eth_dev_get_dcb_info(uint8_t port_id, { struct rte_eth_dev *dev; - if (!rte_eth_dev_is_valid_port(port_id)) { - RTE_PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id); - return -ENODEV; - } + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); dev = &rte_eth_devices[port_id]; memset(dcb_info, 0, sizeof(struct rte_eth_dcb_info)); diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h index 022733ec..0f173231 100644 --- a/lib/librte_ether/rte_ethdev.h +++ b/lib/librte_ether/rte_ethdev.h @@ -102,7 +102,7 @@ * rte_eth_dev_configure(), rte_eth_tx_queue_setup(), or * rte_eth_rx_queue_setup()), it must call rte_eth_dev_stop() first to stop the * device and then do the reconfiguration before calling rte_eth_dev_start() - * again. The tramsit and receive functions should not be invoked when the + * again. The transmit and receive functions should not be invoked when the * device is stopped. * * Please note that some configuration is not stored between calls to @@ -200,27 +200,9 @@ struct rte_eth_stats { /**< Total of RX packets dropped by the HW, * because there are no available mbufs (i.e. RX queues are full). */ - uint64_t ibadcrc __rte_deprecated; - /**< Deprecated; Total of RX packets with CRC error. */ - uint64_t ibadlen __rte_deprecated; - /**< Deprecated; Total of RX packets with bad length. */ uint64_t ierrors; /**< Total number of erroneous received packets. */ uint64_t oerrors; /**< Total number of failed transmitted packets. */ - uint64_t imcasts; - /**< Deprecated; Total number of multicast received packets. */ uint64_t rx_nombuf; /**< Total number of RX mbuf allocation failures. */ - uint64_t fdirmatch __rte_deprecated; - /**< Deprecated; Total number of RX packets matching a filter. */ - uint64_t fdirmiss __rte_deprecated; - /**< Deprecated; Total number of RX packets not matching any filter. */ - uint64_t tx_pause_xon __rte_deprecated; - /**< Deprecated; Total nb. 
of XON pause frame sent. */ - uint64_t rx_pause_xon __rte_deprecated; - /**< Deprecated; Total nb. of XON pause frame received. */ - uint64_t tx_pause_xoff __rte_deprecated; - /**< Deprecated; Total nb. of XOFF pause frame sent. */ - uint64_t rx_pause_xoff __rte_deprecated; - /**< Deprecated; Total nb. of XOFF pause frame received. */ uint64_t q_ipackets[RTE_ETHDEV_QUEUE_STAT_CNTRS]; /**< Total number of queue RX packets. */ uint64_t q_opackets[RTE_ETHDEV_QUEUE_STAT_CNTRS]; @@ -231,14 +213,6 @@ struct rte_eth_stats { /**< Total number of successfully transmitted queue bytes. */ uint64_t q_errors[RTE_ETHDEV_QUEUE_STAT_CNTRS]; /**< Total number of queue packets received that are dropped. */ - uint64_t ilbpackets; - /**< Total number of good packets received from loopback,VF Only */ - uint64_t olbpackets; - /**< Total number of good packets transmitted to loopback,VF Only */ - uint64_t ilbbytes; - /**< Total number of good bytes received from loopback,VF Only */ - uint64_t olbbytes; - /**< Total number of good bytes transmitted to loopback,VF Only */ }; /** @@ -389,8 +363,8 @@ struct rte_eth_rxmode { */ enum rte_vlan_type { ETH_VLAN_TYPE_UNKNOWN = 0, - ETH_VLAN_TYPE_INNER, /**< Single VLAN, or inner VLAN. */ - ETH_VLAN_TYPE_OUTER, /**< Outer VLAN. */ + ETH_VLAN_TYPE_INNER, /**< Inner VLAN. */ + ETH_VLAN_TYPE_OUTER, /**< Single VLAN, or outer VLAN. 
*/ ETH_VLAN_TYPE_MAX, }; @@ -439,6 +413,10 @@ struct rte_eth_rss_conf { #define ETH_RSS_IPV6_EX (1ULL << RTE_ETH_FLOW_IPV6_EX) #define ETH_RSS_IPV6_TCP_EX (1ULL << RTE_ETH_FLOW_IPV6_TCP_EX) #define ETH_RSS_IPV6_UDP_EX (1ULL << RTE_ETH_FLOW_IPV6_UDP_EX) +#define ETH_RSS_PORT (1ULL << RTE_ETH_FLOW_PORT) +#define ETH_RSS_VXLAN (1ULL << RTE_ETH_FLOW_VXLAN) +#define ETH_RSS_GENEVE (1ULL << RTE_ETH_FLOW_GENEVE) +#define ETH_RSS_NVGRE (1ULL << RTE_ETH_FLOW_NVGRE) #define ETH_RSS_IP ( \ ETH_RSS_IPV4 | \ @@ -463,6 +441,12 @@ struct rte_eth_rss_conf { ETH_RSS_NONFRAG_IPV4_SCTP | \ ETH_RSS_NONFRAG_IPV6_SCTP) +#define ETH_RSS_TUNNEL ( \ + ETH_RSS_VXLAN | \ + ETH_RSS_GENEVE | \ + ETH_RSS_NVGRE) + + /**< Mask of valid RSS hash protocols */ #define ETH_RSS_PROTO_MASK ( \ ETH_RSS_IPV4 | \ @@ -480,7 +464,11 @@ struct rte_eth_rss_conf { ETH_RSS_L2_PAYLOAD | \ ETH_RSS_IPV6_EX | \ ETH_RSS_IPV6_TCP_EX | \ - ETH_RSS_IPV6_UDP_EX) + ETH_RSS_IPV6_UDP_EX | \ + ETH_RSS_PORT | \ + ETH_RSS_VXLAN | \ + ETH_RSS_GENEVE | \ + ETH_RSS_NVGRE) /* * Definitions used for redirection table entry size. @@ -489,6 +477,7 @@ struct rte_eth_rss_conf { */ #define ETH_RSS_RETA_SIZE_64 64 #define ETH_RSS_RETA_SIZE_128 128 +#define ETH_RSS_RETA_SIZE_256 256 #define ETH_RSS_RETA_SIZE_512 512 #define RTE_RETA_GROUP_SIZE 64 @@ -908,6 +897,9 @@ struct rte_eth_dev_info { struct rte_eth_desc_lim rx_desc_lim; /**< RX descriptors limits */ struct rte_eth_desc_lim tx_desc_lim; /**< TX descriptors limits */ uint32_t speed_capa; /**< Supported speeds bitmap (ETH_LINK_SPEED_). */ + /** Configured number of rx/tx queues */ + uint16_t nb_rx_queues; /**< Number of RX queues. */ + uint16_t nb_tx_queues; /**< Number of TX queues. */ }; /** @@ -940,11 +932,21 @@ struct rte_eth_txq_info { * statistics that are not provided in the generic rte_eth_stats * structure. 
*/ -struct rte_eth_xstats { - char name[RTE_ETH_XSTATS_NAME_SIZE]; +struct rte_eth_xstat { + uint64_t id; uint64_t value; }; +/** + * A name-key lookup element for extended statistics. + * + * This structure is used to map between names and ID numbers + * for extended ethernet statistics. + */ +struct rte_eth_xstat_name { + char name[RTE_ETH_XSTATS_NAME_SIZE]; +}; + #define ETH_DCB_NUM_TCS 8 #define ETH_MAX_VMDQ_POOL 64 @@ -1074,12 +1076,16 @@ typedef void (*eth_stats_reset_t)(struct rte_eth_dev *dev); /**< @internal Reset global I/O statistics of an Ethernet device to 0. */ typedef int (*eth_xstats_get_t)(struct rte_eth_dev *dev, - struct rte_eth_xstats *stats, unsigned n); + struct rte_eth_xstat *stats, unsigned n); /**< @internal Get extended stats of an Ethernet device. */ typedef void (*eth_xstats_reset_t)(struct rte_eth_dev *dev); /**< @internal Reset extended stats of an Ethernet device. */ +typedef int (*eth_xstats_get_names_t)(struct rte_eth_dev *dev, + struct rte_eth_xstat_name *xstats_names, unsigned size); +/**< @internal Get names of extended stats of an Ethernet device. */ + typedef int (*eth_queue_stats_mapping_set_t)(struct rte_eth_dev *dev, uint16_t queue_id, uint8_t stat_idx, @@ -1150,7 +1156,7 @@ typedef int (*vlan_filter_set_t)(struct rte_eth_dev *dev, typedef int (*vlan_tpid_set_t)(struct rte_eth_dev *dev, enum rte_vlan_type type, uint16_t tpid); -/**< @internal set the outer VLAN-TPID by an Ethernet device. */ +/**< @internal set the outer/inner VLAN-TPID by an Ethernet device. */ typedef void (*vlan_offload_set_t)(struct rte_eth_dev *dev, int mask); /**< @internal set VLAN offload function by an Ethernet device. */ @@ -1427,6 +1433,8 @@ struct eth_dev_ops { eth_stats_reset_t stats_reset; /**< Reset generic device statistics. */ eth_xstats_get_t xstats_get; /**< Get extended device statistics. */ eth_xstats_reset_t xstats_reset; /**< Reset extended device statistics. 
*/ + eth_xstats_get_names_t xstats_get_names; + /**< Get names of extended statistics. */ eth_queue_stats_mapping_set_t queue_stats_mapping_set; /**< Configure per queue stat counter mapping. */ eth_dev_infos_get_t dev_infos_get; /**< Get device info. */ @@ -1434,7 +1442,7 @@ struct eth_dev_ops { /**< Get packet types supported and identified by device*/ mtu_set_t mtu_set; /**< Set MTU. */ vlan_filter_set_t vlan_filter_set; /**< Filter VLAN Setup. */ - vlan_tpid_set_t vlan_tpid_set; /**< Outer VLAN TPID Setup. */ + vlan_tpid_set_t vlan_tpid_set; /**< Outer/Inner VLAN TPID Setup. */ vlan_strip_queue_set_t vlan_strip_queue_set; /**< VLAN Stripping on queue. */ vlan_offload_set_t vlan_offload_set; /**< Set VLAN Offload. */ vlan_pvid_set_t vlan_pvid_set; /**< Set port based TX VLAN insertion */ @@ -1641,7 +1649,7 @@ struct rte_eth_dev { struct rte_eth_rxtx_callback *pre_tx_burst_cbs[RTE_MAX_QUEUES_PER_PORT]; uint8_t attached; /**< Flag indicating the port is attached */ enum rte_eth_dev_type dev_type; /**< Flag indicating the device type */ -}; +} __rte_cache_aligned; struct rte_eth_dev_sriov { uint8_t active; /**< SRIOV is active with 16, 32 or 64 pools */ @@ -2015,7 +2023,7 @@ int rte_eth_tx_queue_setup(uint8_t port_id, uint16_t tx_queue_id, uint16_t nb_tx_desc, unsigned int socket_id, const struct rte_eth_txconf *tx_conf); -/* +/** * Return the NUMA socket to which an Ethernet device is connected * * @param port_id @@ -2027,7 +2035,7 @@ int rte_eth_tx_queue_setup(uint8_t port_id, uint16_t tx_queue_id, */ int rte_eth_dev_socket_id(uint8_t port_id); -/* +/** * Check if port_id of device is attached * * @param port_id @@ -2038,7 +2046,7 @@ int rte_eth_dev_socket_id(uint8_t port_id); */ int rte_eth_dev_is_valid_port(uint8_t port_id); -/* +/** * Allocate mbuf from mempool, setup the DMA physical address * and then start RX for specified queue of a port. It is used * when rx_deferred_start flag of the specified queue is true. 
@@ -2056,7 +2064,7 @@ int rte_eth_dev_is_valid_port(uint8_t port_id); */ int rte_eth_dev_rx_queue_start(uint8_t port_id, uint16_t rx_queue_id); -/* +/** * Stop specified RX queue of a port * * @param port_id @@ -2072,7 +2080,7 @@ int rte_eth_dev_rx_queue_start(uint8_t port_id, uint16_t rx_queue_id); */ int rte_eth_dev_rx_queue_stop(uint8_t port_id, uint16_t rx_queue_id); -/* +/** * Start TX for specified queue of a port. It is used when tx_deferred_start * flag of the specified queue is true. * @@ -2089,7 +2097,7 @@ int rte_eth_dev_rx_queue_stop(uint8_t port_id, uint16_t rx_queue_id); */ int rte_eth_dev_tx_queue_start(uint8_t port_id, uint16_t tx_queue_id); -/* +/** * Stop specified TX queue of a port * * @param port_id @@ -2279,13 +2287,36 @@ int rte_eth_stats_get(uint8_t port_id, struct rte_eth_stats *stats); void rte_eth_stats_reset(uint8_t port_id); /** + * Retrieve names of extended statistics of an Ethernet device. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param xstats_names + * Block of memory to insert names into. Must be at least size in capacity. + * If set to NULL, function returns required capacity. + * @param size + * Capacity of xstats_names (number of names). + * @return + * - positive value lower or equal to size: success. The return value + * is the number of entries filled in the stats table. + * - positive value higher than size: error, the given statistics table + * is too small. The return value corresponds to the size that should + * be given to succeed. The entries in the table are not valid and + * shall not be used by the caller. + * - negative value on error (invalid port id) + */ +int rte_eth_xstats_get_names(uint8_t port_id, + struct rte_eth_xstat_name *xstats_names, + unsigned size); + +/** * Retrieve extended statistics of an Ethernet device. * * @param port_id * The port identifier of the Ethernet device. 
* @param xstats - * A pointer to a table of structure of type *rte_eth_xstats* - * to be filled with device statistics names and values. + * A pointer to a table of structure of type *rte_eth_xstat* + * to be filled with device statistics ids and values. * This parameter can be set to NULL if n is 0. * @param n * The size of the stats table, which should be large enough to store @@ -2299,7 +2330,7 @@ void rte_eth_stats_reset(uint8_t port_id); * shall not be used by the caller. * - negative value on error (invalid port id) */ -int rte_eth_xstats_get(uint8_t port_id, struct rte_eth_xstats *xstats, +int rte_eth_xstats_get(uint8_t port_id, struct rte_eth_xstat *xstats, unsigned n); /** @@ -2376,6 +2407,21 @@ void rte_eth_dev_info_get(uint8_t port_id, struct rte_eth_dev_info *dev_info); /** * Retrieve the supported packet types of an Ethernet device. * + * When a packet type is announced as supported, it *must* be recognized by + * the PMD. For instance, if RTE_PTYPE_L2_ETHER, RTE_PTYPE_L2_ETHER_VLAN + * and RTE_PTYPE_L3_IPV4 are announced, the PMD must return the following + * packet types for these packets: + * - Ether/IPv4 -> RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4 + * - Ether/Vlan/IPv4 -> RTE_PTYPE_L2_ETHER_VLAN | RTE_PTYPE_L3_IPV4 + * - Ether/[anything else] -> RTE_PTYPE_L2_ETHER + * - Ether/Vlan/[anything else] -> RTE_PTYPE_L2_ETHER_VLAN + * + * When a packet is received by a PMD, the most precise type must be + * returned among the ones supported. However a PMD is allowed to set + * packet type that is not in the supported list, at the condition that it + * is more precise. Therefore, a PMD announcing no supported packet types + * can still set a matching packet type in a received packet. + * * @note * Better to invoke this API after the device is already started or rx burst * function is decided, to obtain correct supported ptypes. @@ -2424,6 +2470,7 @@ int rte_eth_dev_get_mtu(uint8_t port_id, uint16_t *mtu); * - (-ENOTSUP) if operation is not supported. 
* - (-ENODEV) if *port_id* invalid. * - (-EINVAL) if *mtu* invalid. + * - (-EBUSY) if operation is not allowed when the port is running */ int rte_eth_dev_set_mtu(uint8_t port_id, uint16_t mtu); @@ -2709,7 +2756,8 @@ rte_eth_rx_descriptor_done(uint8_t port_id, uint16_t queue_id, uint16_t offset) * on the output queue *queue_id* of the Ethernet device designated by its * *port_id*. * The *nb_pkts* parameter is the number of packets to send which are - * supplied in the *tx_pkts* array of *rte_mbuf* structures. + * supplied in the *tx_pkts* array of *rte_mbuf* structures, each of them + * allocated from a pool created with rte_pktmbuf_pool_create(). * The rte_eth_tx_burst() function loops, sending *nb_pkts* packets, * up to the number of transmit descriptors available in the TX ring of the * transmit queue. @@ -3851,6 +3899,34 @@ int rte_eth_dev_get_dcb_info(uint8_t port_id, void *rte_eth_add_rx_callback(uint8_t port_id, uint16_t queue_id, rte_rx_callback_fn fn, void *user_param); +/** +* Add a callback that must be called first on packet RX on a given port +* and queue. +* +* This API configures a first function to be called for each burst of +* packets received on a given NIC port queue. The return value is a pointer +* that can be used to later remove the callback using +* rte_eth_remove_rx_callback(). +* +* Multiple functions are called in the order that they are added. +* +* @param port_id +* The port identifier of the Ethernet device. +* @param queue_id +* The queue on the Ethernet device on which the callback is to be added. +* @param fn +* The callback function +* @param user_param +* A generic pointer parameter which will be passed to each invocation of the +* callback function on this port and queue. +* +* @return +* NULL on error. +* On success, a pointer value which can later be used to remove the callback.
+*/ +void *rte_eth_add_first_rx_callback(uint8_t port_id, uint16_t queue_id, + rte_rx_callback_fn fn, void *user_param); + /** * Add a callback to be called on packet TX on a given port and queue. * @@ -3984,7 +4060,7 @@ int rte_eth_rx_queue_info_get(uint8_t port_id, uint16_t queue_id, int rte_eth_tx_queue_info_get(uint8_t port_id, uint16_t queue_id, struct rte_eth_txq_info *qinfo); -/* +/** * Retrieve number of available registers for access * * @param port_id @@ -4279,6 +4355,35 @@ rte_eth_dev_l2_tunnel_offload_set(uint8_t port_id, uint32_t mask, uint8_t en); +/** +* Get the port id from pci address or device name +* Ex: 0000:2:00.0 or vdev name eth_pcap0 +* +* @param name +* pci address or name of the device +* @param port_id +* pointer to port identifier of the device +* @return +* - (0) if successful. +* - (-ENODEV or -EINVAL) on failure. +*/ +int +rte_eth_dev_get_port_by_name(const char *name, uint8_t *port_id); + +/** +* Get the device name from port id +* +* @param port_id +* port identifier of the device +* @param name +* buffer to receive the pci address or name of the device +* @return +* - (0) if successful. +* - (-EINVAL) on failure.
+*/ +int +rte_eth_dev_get_name_by_port(uint8_t port_id, char *name); + #ifdef __cplusplus } #endif diff --git a/lib/librte_ether/rte_ether_version.map b/lib/librte_ether/rte_ether_version.map index 214ecc73..e1ccebe0 100644 --- a/lib/librte_ether/rte_ether_version.map +++ b/lib/librte_ether/rte_ether_version.map @@ -66,7 +66,6 @@ DPDK_2.2 { rte_eth_dev_set_vf_rxmode; rte_eth_dev_set_vf_tx; rte_eth_dev_set_vf_vlan_filter; - rte_eth_dev_set_vlan_ether_type; rte_eth_dev_set_vlan_offload; rte_eth_dev_set_vlan_pvid; rte_eth_dev_set_vlan_strip_on_queue; @@ -132,3 +131,12 @@ DPDK_16.04 { rte_eth_tx_buffer_set_err_callback; } DPDK_2.2; + +DPDK_16.07 { + global: + + rte_eth_add_first_rx_callback; + rte_eth_dev_get_name_by_port; + rte_eth_dev_get_port_by_name; + rte_eth_xstats_get_names; +} DPDK_16.04; diff --git a/lib/librte_hash/rte_cuckoo_hash.c b/lib/librte_hash/rte_cuckoo_hash.c index 7b7d1f85..e3cc3a7c 100644 --- a/lib/librte_hash/rte_cuckoo_hash.c +++ b/lib/librte_hash/rte_cuckoo_hash.c @@ -1,7 +1,7 @@ /*- * BSD LICENSE * - * Copyright(c) 2010-2015 Intel Corporation. All rights reserved. + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -59,12 +59,10 @@ #include <rte_compat.h> #include "rte_hash.h" -#if defined(RTE_ARCH_X86) -#include "rte_cmp_x86.h" -#endif +#include "rte_cuckoo_hash.h" -#if defined(RTE_ARCH_ARM64) -#include "rte_cmp_arm64.h" +#if defined(RTE_ARCH_X86) +#include "rte_cuckoo_hash_x86.h" #endif TAILQ_HEAD(rte_hash_list, rte_tailq_entry); @@ -74,153 +72,6 @@ static struct rte_tailq_elem rte_hash_tailq = { }; EAL_REGISTER_TAILQ(rte_hash_tailq) -/* Macro to enable/disable run-time checking of function parameters */ -#if defined(RTE_LIBRTE_HASH_DEBUG) -#define RETURN_IF_TRUE(cond, retval) do { \ - if (cond) \ - return retval; \ -} while (0) -#else -#define RETURN_IF_TRUE(cond, retval) -#endif - -/* Hash function used if none is specified */ -#if defined(RTE_MACHINE_CPUFLAG_SSE4_2) || defined(RTE_MACHINE_CPUFLAG_CRC32) -#include <rte_hash_crc.h> -#define DEFAULT_HASH_FUNC rte_hash_crc -#else -#include <rte_jhash.h> -#define DEFAULT_HASH_FUNC rte_jhash -#endif - -/** Number of items per bucket. */ -#define RTE_HASH_BUCKET_ENTRIES 4 - -#define NULL_SIGNATURE 0 - -#define KEY_ALIGNMENT 16 - -#define LCORE_CACHE_SIZE 8 - -#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64) -/* - * All different options to select a key compare function, - * based on the key size and custom function. 
- */ -enum cmp_jump_table_case { - KEY_CUSTOM = 0, - KEY_16_BYTES, - KEY_32_BYTES, - KEY_48_BYTES, - KEY_64_BYTES, - KEY_80_BYTES, - KEY_96_BYTES, - KEY_112_BYTES, - KEY_128_BYTES, - KEY_OTHER_BYTES, - NUM_KEY_CMP_CASES, -}; - -/* - * Table storing all different key compare functions - * (multi-process supported) - */ -const rte_hash_cmp_eq_t cmp_jump_table[NUM_KEY_CMP_CASES] = { - NULL, - rte_hash_k16_cmp_eq, - rte_hash_k32_cmp_eq, - rte_hash_k48_cmp_eq, - rte_hash_k64_cmp_eq, - rte_hash_k80_cmp_eq, - rte_hash_k96_cmp_eq, - rte_hash_k112_cmp_eq, - rte_hash_k128_cmp_eq, - memcmp -}; -#else -/* - * All different options to select a key compare function, - * based on the key size and custom function. - */ -enum cmp_jump_table_case { - KEY_CUSTOM = 0, - KEY_OTHER_BYTES, - NUM_KEY_CMP_CASES, -}; - -/* - * Table storing all different key compare functions - * (multi-process supported) - */ -const rte_hash_cmp_eq_t cmp_jump_table[NUM_KEY_CMP_CASES] = { - NULL, - memcmp -}; - -#endif - -struct lcore_cache { - unsigned len; /**< Cache len */ - void *objs[LCORE_CACHE_SIZE]; /**< Cache objects */ -} __rte_cache_aligned; - -/** A hash table structure. */ -struct rte_hash { - char name[RTE_HASH_NAMESIZE]; /**< Name of the hash. */ - uint32_t entries; /**< Total table entries. */ - uint32_t num_buckets; /**< Number of buckets in table. */ - uint32_t key_len; /**< Length of hash key. */ - rte_hash_function hash_func; /**< Function used to calculate hash. */ - uint32_t hash_func_init_val; /**< Init value used by hash_func. */ - rte_hash_cmp_eq_t rte_hash_custom_cmp_eq; - /**< Custom function used to compare keys. */ - enum cmp_jump_table_case cmp_jump_table_idx; - /**< Indicates which compare function to use. */ - uint32_t bucket_bitmask; /**< Bitmask for getting bucket index - from hash signature. */ - uint32_t key_entry_size; /**< Size of each key entry. 
*/ - - struct rte_ring *free_slots; /**< Ring that stores all indexes - of the free slots in the key table */ - void *key_store; /**< Table storing all keys and data */ - struct rte_hash_bucket *buckets; /**< Table with buckets storing all the - hash values and key indexes - to the key table*/ - uint8_t hw_trans_mem_support; /**< Hardware transactional - memory support */ - struct lcore_cache *local_free_slots; - /**< Local cache per lcore, storing some indexes of the free slots */ -} __rte_cache_aligned; - -/* Structure storing both primary and secondary hashes */ -struct rte_hash_signatures { - union { - struct { - hash_sig_t current; - hash_sig_t alt; - }; - uint64_t sig; - }; -}; - -/* Structure that stores key-value pair */ -struct rte_hash_key { - union { - uintptr_t idata; - void *pdata; - }; - /* Variable key size */ - char key[0]; -} __attribute__((aligned(KEY_ALIGNMENT))); - -/** Bucket structure */ -struct rte_hash_bucket { - struct rte_hash_signatures signatures[RTE_HASH_BUCKET_ENTRIES]; - /* Includes dummy key index that always contains index 0 */ - uint32_t key_idx[RTE_HASH_BUCKET_ENTRIES + 1]; - uint8_t flag[RTE_HASH_BUCKET_ENTRIES]; -} __rte_cache_aligned; - struct rte_hash * rte_hash_find_existing(const char *name) { @@ -372,7 +223,7 @@ rte_hash_create(const struct rte_hash_parameters *params) /* * If x86 architecture is used, select appropriate compare function, - * which may use x86 instrinsics, otherwise use memcmp + * which may use x86 intrinsics, otherwise use memcmp */ #if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64) /* Select function to compare keys */ @@ -431,7 +282,23 @@ rte_hash_create(const struct rte_hash_parameters *params) h->free_slots = r; h->hw_trans_mem_support = hw_trans_mem_support; - /* populate the free slots ring. Entry zero is reserved for key misses */ + /* Turn on multi-writer only with explicit flag from user and TM + * support.
+ */ + if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD) { + if (h->hw_trans_mem_support) { + h->add_key = ADD_KEY_MULTIWRITER_TM; + } else { + h->add_key = ADD_KEY_MULTIWRITER; + h->multiwriter_lock = rte_malloc(NULL, + sizeof(rte_spinlock_t), + LCORE_CACHE_SIZE); + rte_spinlock_init(h->multiwriter_lock); + } + } else + h->add_key = ADD_KEY_SINGLEWRITER; + + /* Populate free slots ring. Entry zero is reserved for key misses. */ for (i = 1; i < params->entries + 1; i++) rte_ring_sp_enqueue(r, (void *)((uintptr_t) i)); @@ -482,6 +349,8 @@ rte_hash_free(struct rte_hash *h) if (h->hw_trans_mem_support) rte_free(h->local_free_slots); + if (h->add_key == ADD_KEY_MULTIWRITER) + rte_free(h->multiwriter_lock); rte_ring_free(h->free_slots); rte_free(h->key_store); rte_free(h->buckets); @@ -632,6 +501,9 @@ __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key, unsigned lcore_id; struct lcore_cache *cached_free_slots = NULL; + if (h->add_key == ADD_KEY_MULTIWRITER) + rte_spinlock_lock(h->multiwriter_lock); + prim_bucket_idx = sig & h->bucket_bitmask; prim_bkt = &h->buckets[prim_bucket_idx]; rte_prefetch0(prim_bkt); @@ -712,35 +584,67 @@ __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key, rte_memcpy(new_k->key, key, h->key_len); new_k->pdata = data; - /* Insert new entry is there is room in the primary bucket */ - for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { - /* Check if slot is available */ - if (likely(prim_bkt->signatures[i].sig == NULL_SIGNATURE)) { - prim_bkt->signatures[i].current = sig; - prim_bkt->signatures[i].alt = alt_hash; - prim_bkt->key_idx[i] = new_idx; +#if defined(RTE_ARCH_X86) /* currently only x86 support HTM */ + if (h->add_key == ADD_KEY_MULTIWRITER_TM) { + ret = rte_hash_cuckoo_insert_mw_tm(prim_bkt, + sig, alt_hash, new_idx); + if (ret >= 0) + return new_idx - 1; + + /* Primary bucket full, need to make space for new entry */ + ret = rte_hash_cuckoo_make_space_mw_tm(h, prim_bkt, sig, + alt_hash, 
new_idx); + + if (ret >= 0) + return new_idx - 1; + + /* Also search secondary bucket to get better occupancy */ + ret = rte_hash_cuckoo_make_space_mw_tm(h, sec_bkt, sig, + alt_hash, new_idx); + + if (ret >= 0) + return new_idx - 1; + } else { +#endif + for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { + /* Check if slot is available */ + if (likely(prim_bkt->signatures[i].sig == NULL_SIGNATURE)) { + prim_bkt->signatures[i].current = sig; + prim_bkt->signatures[i].alt = alt_hash; + prim_bkt->key_idx[i] = new_idx; + break; + } + } + + if (i != RTE_HASH_BUCKET_ENTRIES) { + if (h->add_key == ADD_KEY_MULTIWRITER) + rte_spinlock_unlock(h->multiwriter_lock); return new_idx - 1; } - } - /* Primary bucket is full, so we need to make space for new entry */ - ret = make_space_bucket(h, prim_bkt); - /* - * After recursive function. - * Insert the new entry in the position of the pushed entry - * if successful or return error and - * store the new slot back in the ring - */ - if (ret >= 0) { - prim_bkt->signatures[ret].current = sig; - prim_bkt->signatures[ret].alt = alt_hash; - prim_bkt->key_idx[ret] = new_idx; - return new_idx - 1; + /* Primary bucket full, need to make space for new entry + * After recursive function. 
+ * Insert the new entry in the position of the pushed entry + * if successful or return error and + * store the new slot back in the ring + */ + ret = make_space_bucket(h, prim_bkt); + if (ret >= 0) { + prim_bkt->signatures[ret].current = sig; + prim_bkt->signatures[ret].alt = alt_hash; + prim_bkt->key_idx[ret] = new_idx; + if (h->add_key == ADD_KEY_MULTIWRITER) + rte_spinlock_unlock(h->multiwriter_lock); + return new_idx - 1; + } +#if defined(RTE_ARCH_X86) } - +#endif /* Error in addition, store new slot back in the ring and return error */ enqueue_slot_back(h, cached_free_slots, (void *)((uintptr_t) new_idx)); + if (h->add_key == ADD_KEY_MULTIWRITER) + rte_spinlock_unlock(h->multiwriter_lock); return ret; } diff --git a/lib/librte_hash/rte_cuckoo_hash.h b/lib/librte_hash/rte_cuckoo_hash.h new file mode 100644 index 00000000..6c76700f --- /dev/null +++ b/lib/librte_hash/rte_cuckoo_hash.h @@ -0,0 +1,219 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* rte_cuckoo_hash.h + * This file holds Cuckoo Hash private data structures to allow inclusion from + * platform specific files like rte_cuckoo_hash_x86.h + */ + +#ifndef _RTE_CUCKOO_HASH_H_ +#define _RTE_CUCKOO_HASH_H_ + +#if defined(RTE_ARCH_X86) +#include "rte_cmp_x86.h" +#endif + +#if defined(RTE_ARCH_ARM64) +#include "rte_cmp_arm64.h" +#endif + +/* Macro to enable/disable run-time checking of function parameters */ +#if defined(RTE_LIBRTE_HASH_DEBUG) +#define RETURN_IF_TRUE(cond, retval) do { \ + if (cond) \ + return retval; \ +} while (0) +#else +#define RETURN_IF_TRUE(cond, retval) +#endif + +/* Hash function used if none is specified */ +#if defined(RTE_MACHINE_CPUFLAG_SSE4_2) || defined(RTE_MACHINE_CPUFLAG_CRC32) +#include <rte_hash_crc.h> +#define DEFAULT_HASH_FUNC rte_hash_crc +#else +#include <rte_jhash.h> +#define DEFAULT_HASH_FUNC rte_jhash +#endif + +#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64) +/* + * All different options to select a key compare function, + * based on the key size and custom function.
+ */ +enum cmp_jump_table_case { + KEY_CUSTOM = 0, + KEY_16_BYTES, + KEY_32_BYTES, + KEY_48_BYTES, + KEY_64_BYTES, + KEY_80_BYTES, + KEY_96_BYTES, + KEY_112_BYTES, + KEY_128_BYTES, + KEY_OTHER_BYTES, + NUM_KEY_CMP_CASES, +}; + +/* + * Table storing all different key compare functions + * (multi-process supported) + */ +const rte_hash_cmp_eq_t cmp_jump_table[NUM_KEY_CMP_CASES] = { + NULL, + rte_hash_k16_cmp_eq, + rte_hash_k32_cmp_eq, + rte_hash_k48_cmp_eq, + rte_hash_k64_cmp_eq, + rte_hash_k80_cmp_eq, + rte_hash_k96_cmp_eq, + rte_hash_k112_cmp_eq, + rte_hash_k128_cmp_eq, + memcmp +}; +#else +/* + * All different options to select a key compare function, + * based on the key size and custom function. + */ +enum cmp_jump_table_case { + KEY_CUSTOM = 0, + KEY_OTHER_BYTES, + NUM_KEY_CMP_CASES, +}; + +/* + * Table storing all different key compare functions + * (multi-process supported) + */ +const rte_hash_cmp_eq_t cmp_jump_table[NUM_KEY_CMP_CASES] = { + NULL, + memcmp +}; + +#endif + +enum add_key_case { + ADD_KEY_SINGLEWRITER = 0, + ADD_KEY_MULTIWRITER, + ADD_KEY_MULTIWRITER_TM, +}; + +/** Number of items per bucket. 
*/ +#define RTE_HASH_BUCKET_ENTRIES 4 + +#define NULL_SIGNATURE 0 + +#define KEY_ALIGNMENT 16 + +#define LCORE_CACHE_SIZE 64 + +#define RTE_HASH_BFS_QUEUE_MAX_LEN 1000 + +#define RTE_XABORT_CUCKOO_PATH_INVALIDED 0x4 + +#define RTE_HASH_TSX_MAX_RETRY 10 + +struct lcore_cache { + unsigned len; /**< Cache len */ + void *objs[LCORE_CACHE_SIZE]; /**< Cache objects */ +} __rte_cache_aligned; + +/* Structure storing both primary and secondary hashes */ +struct rte_hash_signatures { + union { + struct { + hash_sig_t current; + hash_sig_t alt; + }; + uint64_t sig; + }; +}; + +/* Structure that stores key-value pair */ +struct rte_hash_key { + union { + uintptr_t idata; + void *pdata; + }; + /* Variable key size */ + char key[0]; +} __attribute__((aligned(KEY_ALIGNMENT))); + +/** Bucket structure */ +struct rte_hash_bucket { + struct rte_hash_signatures signatures[RTE_HASH_BUCKET_ENTRIES]; + /* Includes dummy key index that always contains index 0 */ + uint32_t key_idx[RTE_HASH_BUCKET_ENTRIES + 1]; + uint8_t flag[RTE_HASH_BUCKET_ENTRIES]; +} __rte_cache_aligned; + +/** A hash table structure. */ +struct rte_hash { + char name[RTE_HASH_NAMESIZE]; /**< Name of the hash. */ + uint32_t entries; /**< Total table entries. */ + uint32_t num_buckets; /**< Number of buckets in table. */ + uint32_t key_len; /**< Length of hash key. */ + rte_hash_function hash_func; /**< Function used to calculate hash. */ + uint32_t hash_func_init_val; /**< Init value used by hash_func. */ + rte_hash_cmp_eq_t rte_hash_custom_cmp_eq; + /**< Custom function used to compare keys. */ + enum cmp_jump_table_case cmp_jump_table_idx; + /**< Indicates which compare function to use. */ + uint32_t bucket_bitmask; /**< Bitmask for getting bucket index + from hash signature. */ + uint32_t key_entry_size; /**< Size of each key entry. 
*/ + + struct rte_ring *free_slots; /**< Ring that stores all indexes + of the free slots in the key table */ + void *key_store; /**< Table storing all keys and data */ + struct rte_hash_bucket *buckets; /**< Table with buckets storing all the + hash values and key indexes + to the key table*/ + uint8_t hw_trans_mem_support; /**< Hardware transactional + memory support */ + struct lcore_cache *local_free_slots; + /**< Local cache per lcore, storing some indexes of the free slots */ + enum add_key_case add_key; /**< Multi-writer hash add behavior */ + + rte_spinlock_t *multiwriter_lock; /**< Multi-writer spinlock for w/o TM */ +} __rte_cache_aligned; + +struct queue_node { + struct rte_hash_bucket *bkt; /* Current bucket on the bfs search */ + + struct queue_node *prev; /* Parent(bucket) in search path */ + int prev_slot; /* Parent(slot) in search path */ +}; + +#endif diff --git a/lib/librte_hash/rte_cuckoo_hash_x86.h b/lib/librte_hash/rte_cuckoo_hash_x86.h new file mode 100644 index 00000000..fa5630b7 --- /dev/null +++ b/lib/librte_hash/rte_cuckoo_hash_x86.h @@ -0,0 +1,193 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* rte_cuckoo_hash_x86.h + * This file holds all x86 specific Cuckoo Hash functions + */ + +/* Only tries to insert at one bucket (@prim_bkt) without trying to push + * buckets around + */ +static inline unsigned +rte_hash_cuckoo_insert_mw_tm(struct rte_hash_bucket *prim_bkt, + hash_sig_t sig, hash_sig_t alt_hash, uint32_t new_idx) +{ + unsigned i, status; + unsigned try = 0; + + while (try < RTE_HASH_TSX_MAX_RETRY) { + status = rte_xbegin(); + if (likely(status == RTE_XBEGIN_STARTED)) { + /* Insert new entry if there is room in the primary + * bucket. + */ + for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { + /* Check if slot is available */ + if (likely(prim_bkt->signatures[i].sig == + NULL_SIGNATURE)) { + prim_bkt->signatures[i].current = sig; + prim_bkt->signatures[i].alt = alt_hash; + prim_bkt->key_idx[i] = new_idx; + break; + } + } + rte_xend(); + + if (i != RTE_HASH_BUCKET_ENTRIES) + return 0; + + break; /* break off try loop if transaction commits */ + } else { + /* If we abort we give up this cuckoo path. 
*/ + try++; + rte_pause(); + } + } + + return -1; +} + +/* Shift buckets along provided cuckoo_path (@leaf and @leaf_slot) and fill + * the path head with new entry (sig, alt_hash, new_idx) + */ +static inline int +rte_hash_cuckoo_move_insert_mw_tm(const struct rte_hash *h, + struct queue_node *leaf, uint32_t leaf_slot, + hash_sig_t sig, hash_sig_t alt_hash, uint32_t new_idx) +{ + unsigned try = 0; + unsigned status; + uint32_t prev_alt_bkt_idx; + + struct queue_node *prev_node, *curr_node = leaf; + struct rte_hash_bucket *prev_bkt, *curr_bkt = leaf->bkt; + uint32_t prev_slot, curr_slot = leaf_slot; + + while (try < RTE_HASH_TSX_MAX_RETRY) { + status = rte_xbegin(); + if (likely(status == RTE_XBEGIN_STARTED)) { + while (likely(curr_node->prev != NULL)) { + prev_node = curr_node->prev; + prev_bkt = prev_node->bkt; + prev_slot = curr_node->prev_slot; + + prev_alt_bkt_idx + = prev_bkt->signatures[prev_slot].alt + & h->bucket_bitmask; + + if (unlikely(&h->buckets[prev_alt_bkt_idx] + != curr_bkt)) { + rte_xabort(RTE_XABORT_CUCKOO_PATH_INVALIDED); + } + + /* Need to swap current/alt sig to allow later + * Cuckoo insert to move elements back to its + * primary bucket if available + */ + curr_bkt->signatures[curr_slot].alt = + prev_bkt->signatures[prev_slot].current; + curr_bkt->signatures[curr_slot].current = + prev_bkt->signatures[prev_slot].alt; + curr_bkt->key_idx[curr_slot] + = prev_bkt->key_idx[prev_slot]; + + curr_slot = prev_slot; + curr_node = prev_node; + curr_bkt = curr_node->bkt; + } + + curr_bkt->signatures[curr_slot].current = sig; + curr_bkt->signatures[curr_slot].alt = alt_hash; + curr_bkt->key_idx[curr_slot] = new_idx; + + rte_xend(); + + return 0; + } + + /* If we abort we give up this cuckoo path, since most likely it's + * no longer valid as TSX detected data conflict + */ + try++; + rte_pause(); + } + + return -1; +} + +/* + * Make space for new key, using bfs Cuckoo Search and Multi-Writer safe + * Cuckoo + */ +static inline int 
+rte_hash_cuckoo_make_space_mw_tm(const struct rte_hash *h, + struct rte_hash_bucket *bkt, + hash_sig_t sig, hash_sig_t alt_hash, + uint32_t new_idx) +{ + unsigned i; + struct queue_node queue[RTE_HASH_BFS_QUEUE_MAX_LEN]; + struct queue_node *tail, *head; + struct rte_hash_bucket *curr_bkt, *alt_bkt; + + tail = queue; + head = queue + 1; + tail->bkt = bkt; + tail->prev = NULL; + tail->prev_slot = -1; + + /* Cuckoo bfs Search */ + while (likely(tail != head && head < + queue + RTE_HASH_BFS_QUEUE_MAX_LEN - 4)) { + curr_bkt = tail->bkt; + for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { + if (curr_bkt->signatures[i].sig == NULL_SIGNATURE) { + if (likely(rte_hash_cuckoo_move_insert_mw_tm(h, + tail, i, sig, + alt_hash, new_idx) == 0)) + return 0; + } + + /* Enqueue new node and keep prev node info */ + alt_bkt = &(h->buckets[curr_bkt->signatures[i].alt + & h->bucket_bitmask]); + head->bkt = alt_bkt; + head->prev = tail; + head->prev_slot = i; + head++; + } + tail++; + } + + return -ENOSPC; +} diff --git a/lib/librte_hash/rte_hash.h b/lib/librte_hash/rte_hash.h index ae00b658..c9612fbd 100644 --- a/lib/librte_hash/rte_hash.h +++ b/lib/librte_hash/rte_hash.h @@ -60,6 +60,9 @@ extern "C" { /** Enable Hardware transactional memory support. */ #define RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT 0x01 +/** Default behavior of insertion, single writer/multi writer */ +#define RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD 0x02 + /** Signature of key that is stored internally. */ typedef uint32_t hash_sig_t; @@ -362,8 +365,6 @@ rte_hash_lookup_with_hash(const struct rte_hash *h, hash_sig_t rte_hash_hash(const struct rte_hash *h, const void *key); -#define rte_hash_lookup_multi rte_hash_lookup_bulk -#define rte_hash_lookup_multi_data rte_hash_lookup_bulk_data /** * Find multiple keys in the hash table. * This operation is multi-thread safe. 
diff --git a/lib/librte_ip_frag/Makefile b/lib/librte_ip_frag/Makefile index 9d06780d..e97dfbd3 100644 --- a/lib/librte_ip_frag/Makefile +++ b/lib/librte_ip_frag/Makefile @@ -52,8 +52,9 @@ SRCS-$(CONFIG_RTE_LIBRTE_IP_FRAG) += ip_frag_internal.c # install this header file SYMLINK-$(CONFIG_RTE_LIBRTE_IP_FRAG)-include += rte_ip_frag.h - -# this library depends on rte_ether -DEPDIRS-$(CONFIG_RTE_LIBRTE_IP_FRAG) += lib/librte_mempool lib/librte_ether +DEPDIRS-$(CONFIG_RTE_LIBRTE_IP_FRAG) += lib/librte_eal +DEPDIRS-$(CONFIG_RTE_LIBRTE_IP_FRAG) += lib/librte_ether +DEPDIRS-$(CONFIG_RTE_LIBRTE_IP_FRAG) += lib/librte_mbuf +DEPDIRS-$(CONFIG_RTE_LIBRTE_IP_FRAG) += lib/librte_mempool include $(RTE_SDK)/mk/rte.lib.mk diff --git a/lib/librte_ip_frag/ip_frag_common.h b/lib/librte_ip_frag/ip_frag_common.h index cde6ed4a..835e4f93 100644 --- a/lib/librte_ip_frag/ip_frag_common.h +++ b/lib/librte_ip_frag/ip_frag_common.h @@ -38,17 +38,9 @@ /* logging macros. */ #ifdef RTE_LIBRTE_IP_FRAG_DEBUG - #define IP_FRAG_LOG(lvl, fmt, args...) RTE_LOG(lvl, USER1, fmt, ##args) - -#define IP_FRAG_ASSERT(exp) \ -if (!(exp)) { \ - rte_panic("function %s, line%d\tassert \"" #exp "\" failed\n", \ - __func__, __LINE__); \ -} #else #define IP_FRAG_LOG(lvl, fmt, args...) 
do {} while(0) -#define IP_FRAG_ASSERT(exp) do {} while (0) #endif /* IP_FRAG_DEBUG */ #define IPV4_KEYLEN 1 @@ -76,8 +68,8 @@ struct ip_frag_pkt * ip_frag_lookup(struct rte_ip_frag_tbl *tbl, struct ip_frag_pkt **free, struct ip_frag_pkt **stale); /* these functions need to be declared here as ip_frag_process relies on them */ -struct rte_mbuf * ipv4_frag_reassemble(const struct ip_frag_pkt *fp); -struct rte_mbuf * ipv6_frag_reassemble(const struct ip_frag_pkt *fp); +struct rte_mbuf *ipv4_frag_reassemble(struct ip_frag_pkt *fp); +struct rte_mbuf *ipv6_frag_reassemble(struct ip_frag_pkt *fp); diff --git a/lib/librte_ip_frag/rte_ipv4_fragmentation.c b/lib/librte_ip_frag/rte_ipv4_fragmentation.c index a4ed9238..a2259e80 100644 --- a/lib/librte_ip_frag/rte_ipv4_fragmentation.c +++ b/lib/librte_ip_frag/rte_ipv4_fragmentation.c @@ -107,7 +107,7 @@ rte_ipv4_fragment_packet(struct rte_mbuf *pkt_in, frag_size = (uint16_t)(mtu_size - sizeof(struct ipv4_hdr)); /* Fragment size should be a multiply of 8. */ - IP_FRAG_ASSERT((frag_size & IPV4_HDR_FO_MASK) == 0); + RTE_ASSERT((frag_size & IPV4_HDR_FO_MASK) == 0); in_hdr = rte_pktmbuf_mtod(pkt_in, struct ipv4_hdr *); flag_offset = rte_cpu_to_be_16(in_hdr->fragment_offset); diff --git a/lib/librte_ip_frag/rte_ipv4_reassembly.c b/lib/librte_ip_frag/rte_ipv4_reassembly.c index 26d07f9a..e084ca59 100644 --- a/lib/librte_ip_frag/rte_ipv4_reassembly.c +++ b/lib/librte_ip_frag/rte_ipv4_reassembly.c @@ -41,11 +41,12 @@ * Reassemble fragments into one packet. */ struct rte_mbuf * -ipv4_frag_reassemble(const struct ip_frag_pkt *fp) +ipv4_frag_reassemble(struct ip_frag_pkt *fp) { struct ipv4_hdr *ip_hdr; struct rte_mbuf *m, *prev; uint32_t i, n, ofs, first_len; + uint32_t curr_idx = 0; first_len = fp->frags[IP_FIRST_FRAG_IDX].len; n = fp->last_idx - 1; @@ -53,6 +54,7 @@ ipv4_frag_reassemble(const struct ip_frag_pkt *fp) /*start from the last fragment. 
*/ m = fp->frags[IP_LAST_FRAG_IDX].mb; ofs = fp->frags[IP_LAST_FRAG_IDX].ofs; + curr_idx = IP_LAST_FRAG_IDX; while (ofs != first_len) { @@ -67,6 +69,10 @@ ipv4_frag_reassemble(const struct ip_frag_pkt *fp) rte_pktmbuf_adj(m, (uint16_t)(m->l2_len + m->l3_len)); rte_pktmbuf_chain(fp->frags[i].mb, m); + /* this mbuf should not be accessed directly */ + fp->frags[curr_idx].mb = NULL; + curr_idx = i; + /* update our last fragment and offset. */ m = fp->frags[i].mb; ofs = fp->frags[i].ofs; diff --git a/lib/librte_ip_frag/rte_ipv6_fragmentation.c b/lib/librte_ip_frag/rte_ipv6_fragmentation.c index 1e30004f..db666bbf 100644 --- a/lib/librte_ip_frag/rte_ipv6_fragmentation.c +++ b/lib/librte_ip_frag/rte_ipv6_fragmentation.c @@ -110,7 +110,7 @@ rte_ipv6_fragment_packet(struct rte_mbuf *pkt_in, frag_size = (uint16_t)(mtu_size - sizeof(struct ipv6_hdr)); /* Fragment size should be a multiple of 8. */ - IP_FRAG_ASSERT((frag_size & ~RTE_IPV6_EHDR_FO_MASK) == 0); + RTE_ASSERT((frag_size & ~RTE_IPV6_EHDR_FO_MASK) == 0); /* Check that pkts_out is big enough to hold all fragments */ if (unlikely (frag_size * nb_pkts_out < diff --git a/lib/librte_ip_frag/rte_ipv6_reassembly.c b/lib/librte_ip_frag/rte_ipv6_reassembly.c index d29cb1da..21a5ef5d 100644 --- a/lib/librte_ip_frag/rte_ipv6_reassembly.c +++ b/lib/librte_ip_frag/rte_ipv6_reassembly.c @@ -59,13 +59,14 @@ ip_frag_memmove(char *dst, char *src, int len) * Reassemble fragments into one packet. 
*/ struct rte_mbuf * -ipv6_frag_reassemble(const struct ip_frag_pkt *fp) +ipv6_frag_reassemble(struct ip_frag_pkt *fp) { struct ipv6_hdr *ip_hdr; struct ipv6_extension_fragment *frag_hdr; struct rte_mbuf *m, *prev; uint32_t i, n, ofs, first_len; uint32_t last_len, move_len, payload_len; + uint32_t curr_idx = 0; first_len = fp->frags[IP_FIRST_FRAG_IDX].len; n = fp->last_idx - 1; @@ -74,6 +75,7 @@ ipv6_frag_reassemble(const struct ip_frag_pkt *fp) m = fp->frags[IP_LAST_FRAG_IDX].mb; ofs = fp->frags[IP_LAST_FRAG_IDX].ofs; last_len = fp->frags[IP_LAST_FRAG_IDX].len; + curr_idx = IP_LAST_FRAG_IDX; payload_len = ofs + last_len; @@ -90,6 +92,10 @@ ipv6_frag_reassemble(const struct ip_frag_pkt *fp) rte_pktmbuf_adj(m, (uint16_t)(m->l2_len + m->l3_len)); rte_pktmbuf_chain(fp->frags[i].mb, m); + /* this mbuf should not be accessed directly */ + fp->frags[curr_idx].mb = NULL; + curr_idx = i; + /* update our last fragment and offset. */ m = fp->frags[i].mb; ofs = fp->frags[i].ofs; diff --git a/lib/librte_ivshmem/Makefile b/lib/librte_ivshmem/Makefile index 16defdba..c099438c 100644 --- a/lib/librte_ivshmem/Makefile +++ b/lib/librte_ivshmem/Makefile @@ -46,7 +46,9 @@ SRCS-$(CONFIG_RTE_LIBRTE_IVSHMEM) := rte_ivshmem.c # install includes SYMLINK-$(CONFIG_RTE_LIBRTE_IVSHMEM)-include := rte_ivshmem.h -# this lib needs eal +# this lib needs EAL, ring and mempool +DEPDIRS-$(CONFIG_RTE_LIBRTE_IVSHMEM) += lib/librte_eal +DEPDIRS-$(CONFIG_RTE_LIBRTE_IVSHMEM) += lib/librte_ring DEPDIRS-$(CONFIG_RTE_LIBRTE_IVSHMEM) += lib/librte_mempool include $(RTE_SDK)/mk/rte.lib.mk diff --git a/lib/librte_ivshmem/rte_ivshmem.c b/lib/librte_ivshmem/rte_ivshmem.c index c8b332ce..c26edb61 100644 --- a/lib/librte_ivshmem/rte_ivshmem.c +++ b/lib/librte_ivshmem/rte_ivshmem.c @@ -548,26 +548,40 @@ add_ring_to_metadata(const struct rte_ring * r, } static int -add_mempool_to_metadata(const struct rte_mempool * mp, - struct ivshmem_config * config) +add_mempool_memzone_to_metadata(const void *addr, + struct 
ivshmem_config *config) { - struct rte_memzone * mz; - int ret; + struct rte_memzone *mz; - mz = get_memzone_by_addr(mp); - ret = 0; + mz = get_memzone_by_addr(addr); if (!mz) { RTE_LOG(ERR, EAL, "Cannot find memzone for mempool!\n"); return -1; } - /* mempool consists of memzone and ring */ - ret = add_memzone_to_metadata(mz, config); + return add_memzone_to_metadata(mz, config); +} + +static int +add_mempool_to_metadata(const struct rte_mempool *mp, + struct ivshmem_config *config) +{ + struct rte_mempool_memhdr *memhdr; + int ret; + + ret = add_mempool_memzone_to_metadata(mp, config); if (ret < 0) return -1; - return add_ring_to_metadata(mp->ring, config); + STAILQ_FOREACH(memhdr, &mp->mem_list, next) { + ret = add_mempool_memzone_to_metadata(memhdr->addr, config); + if (ret < 0) + return -1; + } + + /* mempool consists of memzone and ring */ + return add_ring_to_metadata(mp->pool_data, config); } int diff --git a/lib/librte_jobstats/rte_jobstats.h b/lib/librte_jobstats/rte_jobstats.h index c2b285f8..b3686030 100644 --- a/lib/librte_jobstats/rte_jobstats.h +++ b/lib/librte_jobstats/rte_jobstats.h @@ -85,7 +85,7 @@ struct rte_jobstats { /**< Minimum execute time. */ uint64_t max_exec_time; - /**< Minimum execute time. */ + /**< Maximum execute time. */ uint64_t exec_cnt; /**< Execute count. 
*/ diff --git a/lib/librte_kni/Makefile b/lib/librte_kni/Makefile index 1398164d..09474461 100644 --- a/lib/librte_kni/Makefile +++ b/lib/librte_kni/Makefile @@ -46,8 +46,9 @@ SRCS-$(CONFIG_RTE_LIBRTE_KNI) := rte_kni.c # install includes SYMLINK-$(CONFIG_RTE_LIBRTE_KNI)-include := rte_kni.h -# this lib needs eal -DEPDIRS-$(CONFIG_RTE_LIBRTE_KNI) += lib/librte_eal lib/librte_mbuf +DEPDIRS-$(CONFIG_RTE_LIBRTE_KNI) += lib/librte_eal +DEPDIRS-$(CONFIG_RTE_LIBRTE_KNI) += lib/librte_mbuf +DEPDIRS-$(CONFIG_RTE_LIBRTE_KNI) += lib/librte_mempool DEPDIRS-$(CONFIG_RTE_LIBRTE_KNI) += lib/librte_ether include $(RTE_SDK)/mk/rte.lib.mk diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c index ea9baf4c..3028fd43 100644 --- a/lib/librte_kni/rte_kni.c +++ b/lib/librte_kni/rte_kni.c @@ -323,6 +323,7 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool, char intf_name[RTE_KNI_NAMESIZE]; char mz_name[RTE_MEMZONE_NAMESIZE]; const struct rte_memzone *mz; + const struct rte_mempool *mp; struct rte_kni_memzone_slot *slot = NULL; if (!pktmbuf_pool || !conf || !conf->name[0]) @@ -415,12 +416,17 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool, /* MBUF mempool */ - snprintf(mz_name, sizeof(mz_name), RTE_MEMPOOL_OBJ_NAME, + snprintf(mz_name, sizeof(mz_name), RTE_MEMPOOL_MZ_FORMAT, pktmbuf_pool->name); mz = rte_memzone_lookup(mz_name); KNI_MEM_CHECK(mz == NULL); - dev_info.mbuf_va = mz->addr; - dev_info.mbuf_phys = mz->phys_addr; + mp = (struct rte_mempool *)mz->addr; + /* KNI currently requires to have only one memory chunk */ + if (mp->nb_mem_chunks != 1) + goto kni_fail; + + dev_info.mbuf_va = STAILQ_FIRST(&mp->mem_list)->addr; + dev_info.mbuf_phys = STAILQ_FIRST(&mp->mem_list)->phys_addr; ctx->pktmbuf_pool = pktmbuf_pool; ctx->group_id = conf->group_id; ctx->slot_id = slot->id; diff --git a/lib/librte_kni/rte_kni.h b/lib/librte_kni/rte_kni.h index 9899a170..7363e6cf 100644 --- a/lib/librte_kni/rte_kni.h +++ b/lib/librte_kni/rte_kni.h @@ -113,6 +113,9 @@ void rte_kni_init(unsigned 
int max_kni_ifaces); * The rte_kni_alloc shall not be called before rte_kni_init() has been * called. rte_kni_alloc is thread safe. * + * The mempool should have capacity of more than "2 x KNI_FIFO_COUNT_MAX" + * elements for each KNI interface allocated. + * * @param pktmbuf_pool * The mempool for allocting mbufs for packets. * @param conf @@ -160,8 +163,8 @@ int rte_kni_handle_request(struct rte_kni *kni); /** * Retrieve a burst of packets from a KNI interface. The retrieved packets are * stored in rte_mbuf structures whose pointers are supplied in the array of - * mbufs, and the maximum number is indicated by num. It handles the freeing of - * the mbufs in the free queue of KNI interface. + * mbufs, and the maximum number is indicated by num. It handles allocating + * the mbufs for KNI interface alloc queue. * * @param kni * The KNI interface context. @@ -179,8 +182,8 @@ unsigned rte_kni_rx_burst(struct rte_kni *kni, struct rte_mbuf **mbufs, /** * Send a burst of packets to a KNI interface. The packets to be sent out are * stored in rte_mbuf structures whose pointers are supplied in the array of - * mbufs, and the maximum number is indicated by num. It handles allocating - * the mbufs for KNI interface alloc queue. + * mbufs, and the maximum number is indicated by num. It handles the freeing of + * the mbufs in the free queue of KNI interface. * * @param kni * The KNI interface context. 
diff --git a/lib/librte_lpm/rte_lpm.c b/lib/librte_lpm/rte_lpm.c index 8bdf6065..6f65d1c2 100644 --- a/lib/librte_lpm/rte_lpm.c +++ b/lib/librte_lpm/rte_lpm.c @@ -373,7 +373,6 @@ rte_lpm_free_v20(struct rte_lpm_v20 *lpm) rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); - rte_free(lpm->rules_tbl); rte_free(lpm); rte_free(te); } diff --git a/lib/librte_lpm/rte_lpm6.c b/lib/librte_lpm/rte_lpm6.c index ba4353ce..32fdba01 100644 --- a/lib/librte_lpm/rte_lpm6.c +++ b/lib/librte_lpm/rte_lpm6.c @@ -601,7 +601,7 @@ int rte_lpm6_lookup(const struct rte_lpm6 *lpm, uint8_t *ip, uint8_t *next_hop) { const struct rte_lpm6_tbl_entry *tbl; - const struct rte_lpm6_tbl_entry *tbl_next; + const struct rte_lpm6_tbl_entry *tbl_next = NULL; int status; uint8_t first_byte; uint32_t tbl24_index; @@ -636,7 +636,7 @@ rte_lpm6_lookup_bulk_func(const struct rte_lpm6 *lpm, { unsigned i; const struct rte_lpm6_tbl_entry *tbl; - const struct rte_lpm6_tbl_entry *tbl_next; + const struct rte_lpm6_tbl_entry *tbl_next = NULL; uint32_t tbl24_index; uint8_t first_byte, next_hop; int status; diff --git a/lib/librte_lpm/rte_lpm6.h b/lib/librte_lpm/rte_lpm6.h index cedcea8d..13d027f9 100644 --- a/lib/librte_lpm/rte_lpm6.h +++ b/lib/librte_lpm/rte_lpm6.h @@ -38,6 +38,8 @@ * RTE Longest Prefix Match for IPv6 (LPM6) */ +#include <stdint.h> + #ifdef __cplusplus extern "C" { #endif diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c index dc0467c9..601e5282 100644 --- a/lib/librte_mbuf/rte_mbuf.c +++ b/lib/librte_mbuf/rte_mbuf.c @@ -86,7 +86,7 @@ rte_pktmbuf_pool_init(struct rte_mempool *mp, void *opaque_arg) struct rte_pktmbuf_pool_private default_mbp_priv; uint16_t roomsz; - RTE_MBUF_ASSERT(mp->elt_size >= sizeof(struct rte_mbuf)); + RTE_ASSERT(mp->elt_size >= sizeof(struct rte_mbuf)); /* if no structure is provided, assume no mbuf private area */ user_mbp_priv = opaque_arg; @@ -100,7 +100,7 @@ rte_pktmbuf_pool_init(struct rte_mempool *mp, void *opaque_arg) user_mbp_priv = &default_mbp_priv; } 
- RTE_MBUF_ASSERT(mp->elt_size >= sizeof(struct rte_mbuf) + + RTE_ASSERT(mp->elt_size >= sizeof(struct rte_mbuf) + user_mbp_priv->mbuf_data_room_size + user_mbp_priv->mbuf_priv_size); @@ -126,9 +126,9 @@ rte_pktmbuf_init(struct rte_mempool *mp, mbuf_size = sizeof(struct rte_mbuf) + priv_size; buf_len = rte_pktmbuf_data_room_size(mp); - RTE_MBUF_ASSERT(RTE_ALIGN(priv_size, RTE_MBUF_PRIV_ALIGN) == priv_size); - RTE_MBUF_ASSERT(mp->elt_size >= mbuf_size); - RTE_MBUF_ASSERT(buf_len <= UINT16_MAX); + RTE_ASSERT(RTE_ALIGN(priv_size, RTE_MBUF_PRIV_ALIGN) == priv_size); + RTE_ASSERT(mp->elt_size >= mbuf_size); + RTE_ASSERT(buf_len <= UINT16_MAX); memset(m, 0, mp->elt_size); @@ -153,6 +153,7 @@ rte_pktmbuf_pool_create(const char *name, unsigned n, unsigned cache_size, uint16_t priv_size, uint16_t data_room_size, int socket_id) { + struct rte_mempool *mp; struct rte_pktmbuf_pool_private mbp_priv; unsigned elt_size; @@ -167,10 +168,27 @@ rte_pktmbuf_pool_create(const char *name, unsigned n, mbp_priv.mbuf_data_room_size = data_room_size; mbp_priv.mbuf_priv_size = priv_size; - return rte_mempool_create(name, n, elt_size, - cache_size, sizeof(struct rte_pktmbuf_pool_private), - rte_pktmbuf_pool_init, &mbp_priv, rte_pktmbuf_init, NULL, - socket_id, 0); + mp = rte_mempool_create_empty(name, n, elt_size, cache_size, + sizeof(struct rte_pktmbuf_pool_private), socket_id, 0); + if (mp == NULL) + return NULL; + + rte_errno = rte_mempool_set_ops_byname(mp, + RTE_MBUF_DEFAULT_MEMPOOL_OPS, NULL); + if (rte_errno != 0) { + RTE_LOG(ERR, MBUF, "error setting mempool handler\n"); + return NULL; + } + rte_pktmbuf_pool_init(mp, &mbp_priv); + + if (rte_mempool_populate_default(mp) < 0) { + rte_mempool_free(mp); + return NULL; + } + + rte_mempool_obj_iter(mp, rte_pktmbuf_init, NULL); + + return mp; } /* do some sanity checks on a mbuf: panic if it fails */ @@ -218,7 +236,7 @@ rte_pktmbuf_dump(FILE *f, const struct rte_mbuf *m, unsigned dump_len) __rte_mbuf_sanity_check(m, 1); - fprintf(f, "dump 
mbuf at 0x%p, phys=%"PRIx64", buf_len=%u\n", + fprintf(f, "dump mbuf at %p, phys=%"PRIx64", buf_len=%u\n", m, (uint64_t)m->buf_physaddr, (unsigned)m->buf_len); fprintf(f, " pkt_len=%"PRIu32", ol_flags=%"PRIx64", nb_segs=%u, " "in_port=%u\n", m->pkt_len, m->ol_flags, @@ -228,7 +246,7 @@ rte_pktmbuf_dump(FILE *f, const struct rte_mbuf *m, unsigned dump_len) while (m && nb_segs != 0) { __rte_mbuf_sanity_check(m, 0); - fprintf(f, " segment at 0x%p, data=0x%p, data_len=%u\n", + fprintf(f, " segment at %p, data=%p, data_len=%u\n", m, rte_pktmbuf_mtod(m, void *), (unsigned)m->data_len); len = dump_len; if (len > m->data_len) @@ -254,12 +272,10 @@ const char *rte_get_rx_ol_flag_name(uint64_t mask) case PKT_RX_L4_CKSUM_BAD: return "PKT_RX_L4_CKSUM_BAD"; case PKT_RX_IP_CKSUM_BAD: return "PKT_RX_IP_CKSUM_BAD"; case PKT_RX_EIP_CKSUM_BAD: return "PKT_RX_EIP_CKSUM_BAD"; - /* case PKT_RX_OVERSIZE: return "PKT_RX_OVERSIZE"; */ - /* case PKT_RX_HBUF_OVERFLOW: return "PKT_RX_HBUF_OVERFLOW"; */ - /* case PKT_RX_RECIP_ERR: return "PKT_RX_RECIP_ERR"; */ - /* case PKT_RX_MAC_ERR: return "PKT_RX_MAC_ERR"; */ + case PKT_RX_VLAN_STRIPPED: return "PKT_RX_VLAN_STRIPPED"; case PKT_RX_IEEE1588_PTP: return "PKT_RX_IEEE1588_PTP"; case PKT_RX_IEEE1588_TMST: return "PKT_RX_IEEE1588_TMST"; + case PKT_RX_QINQ_STRIPPED: return "PKT_RX_QINQ_STRIPPED"; default: return NULL; } } diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h index 75a227d8..101485fb 100644 --- a/lib/librte_mbuf/rte_mbuf.h +++ b/lib/librte_mbuf/rte_mbuf.h @@ -65,10 +65,6 @@ extern "C" { #endif -/* deprecated options */ -#pragma GCC poison RTE_MBUF_SCATTER_GATHER -#pragma GCC poison RTE_MBUF_REFCNT - /* * Packet Offload Features Flags. It also carry packet type information. * Critical resources. Both rx/tx shared these bits. Be cautious on any change @@ -83,21 +79,51 @@ extern "C" { * Keep these flags synchronized with rte_get_rx_ol_flag_name() and * rte_get_tx_ol_flag_name(). 
*/ -#define PKT_RX_VLAN_PKT (1ULL << 0) /**< RX packet is a 802.1q VLAN packet. */ + +/** + * RX packet is a 802.1q VLAN packet. This flag was set by PMDs when + * the packet is recognized as a VLAN, but the behavior between PMDs + * was not the same. This flag is kept for some time to avoid breaking + * applications and should be replaced by PKT_RX_VLAN_STRIPPED. + */ +#define PKT_RX_VLAN_PKT (1ULL << 0) + #define PKT_RX_RSS_HASH (1ULL << 1) /**< RX packet with RSS hash result. */ #define PKT_RX_FDIR (1ULL << 2) /**< RX packet with FDIR match indicate. */ #define PKT_RX_L4_CKSUM_BAD (1ULL << 3) /**< L4 cksum of RX pkt. is not OK. */ #define PKT_RX_IP_CKSUM_BAD (1ULL << 4) /**< IP cksum of RX pkt. is not OK. */ #define PKT_RX_EIP_CKSUM_BAD (1ULL << 5) /**< External IP header checksum error. */ -#define PKT_RX_OVERSIZE (0ULL << 0) /**< Num of desc of an RX pkt oversize. */ -#define PKT_RX_HBUF_OVERFLOW (0ULL << 0) /**< Header buffer overflow. */ -#define PKT_RX_RECIP_ERR (0ULL << 0) /**< Hardware processing error. */ -#define PKT_RX_MAC_ERR (0ULL << 0) /**< MAC error. */ + +/** + * A vlan has been stripped by the hardware and its tci is saved in + * mbuf->vlan_tci. This can only happen if vlan stripping is enabled + * in the RX configuration of the PMD. + */ +#define PKT_RX_VLAN_STRIPPED (1ULL << 6) + +/* hole, some bits can be reused here */ + #define PKT_RX_IEEE1588_PTP (1ULL << 9) /**< RX IEEE1588 L2 Ethernet PT Packet. */ #define PKT_RX_IEEE1588_TMST (1ULL << 10) /**< RX IEEE1588 L2/L4 timestamped packet.*/ #define PKT_RX_FDIR_ID (1ULL << 13) /**< FD id reported if FDIR match. */ #define PKT_RX_FDIR_FLX (1ULL << 14) /**< Flexible bytes reported if FDIR match. */ -#define PKT_RX_QINQ_PKT (1ULL << 15) /**< RX packet with double VLAN stripped. */ + +/** + * The 2 vlans have been stripped by the hardware and their tci are + * saved in mbuf->vlan_tci (inner) and mbuf->vlan_tci_outer (outer). 
+ * This can only happen if vlan stripping is enabled in the RX + * configuration of the PMD. If this flag is set, PKT_RX_VLAN_STRIPPED + * must also be set. + */ +#define PKT_RX_QINQ_STRIPPED (1ULL << 15) + +/** + * Deprecated. + * RX packet with double VLAN stripped. + * This flag is replaced by PKT_RX_QINQ_STRIPPED. + */ +#define PKT_RX_QINQ_PKT PKT_RX_QINQ_STRIPPED + /* add new RX flags here */ /* add new TX flags here */ @@ -278,6 +304,13 @@ extern "C" { */ #define RTE_PTYPE_L2_ETHER_LLDP 0x00000004 /** + * NSH (Network Service Header) packet type. + * + * Packet format: + * <'ether type'=0x894F> + */ +#define RTE_PTYPE_L2_ETHER_NSH 0x00000005 +/** * Mask of layer 2 packet types. * It is used for outer packet for tunneling cases. */ @@ -765,7 +798,10 @@ struct rte_mbuf { /* * The packet type, which is the combination of outer/inner L2, L3, L4 - * and tunnel types. + * and tunnel types. The packet_type is about data really present in the + * mbuf. Example: if vlan stripping is enabled, a received vlan packet + * would have RTE_PTYPE_L2_ETHER and not RTE_PTYPE_L2_VLAN because the + * vlan is stripped from the data. */ union { uint32_t packet_type; /**< L2/L3/L4 and tunnel information. */ @@ -782,7 +818,8 @@ struct rte_mbuf { uint32_t pkt_len; /**< Total pkt len: sum of all segments. */ uint16_t data_len; /**< Amount of data in segment buffer. */ - uint16_t vlan_tci; /**< VLAN Tag Control Identifier (CPU order) */ + /** VLAN TCI (CPU order), valid if PKT_RX_VLAN_STRIPPED is set. */ + uint16_t vlan_tci; union { uint32_t rss; /**< RSS hash result if RSS enabled */ @@ -808,7 +845,8 @@ struct rte_mbuf { uint32_t seqn; /**< Sequence number. See also rte_reorder_insert() */ - uint16_t vlan_tci_outer; /**< Outer VLAN Tag Control Identifier (CPU order) */ + /** Outer VLAN TCI (CPU order), valid if PKT_RX_QINQ_STRIPPED is set. 
*/ + uint16_t vlan_tci_outer; /* second cache line - fields only used in slow path or on TX */ MARKER cacheline1 __rte_cache_min_aligned; @@ -846,6 +884,44 @@ struct rte_mbuf { uint16_t timesync; } __rte_cache_aligned; +/** + * Prefetch the first part of the mbuf + * + * The first 64 bytes of the mbuf corresponds to fields that are used early + * in the receive path. If the cache line of the architecture is higher than + * 64B, the second part will also be prefetched. + * + * @param m + * The pointer to the mbuf. + */ +static inline void +rte_mbuf_prefetch_part1(struct rte_mbuf *m) +{ + rte_prefetch0(&m->cacheline0); +} + +/** + * Prefetch the second part of the mbuf + * + * The next 64 bytes of the mbuf corresponds to fields that are used in the + * transmit path. If the cache line of the architecture is higher than 64B, + * this function does nothing as it is expected that the full mbuf is + * already in cache. + * + * @param m + * The pointer to the mbuf. + */ +static inline void +rte_mbuf_prefetch_part2(struct rte_mbuf *m) +{ +#if RTE_CACHE_LINE_SIZE == 64 + rte_prefetch0(&m->cacheline1); +#else + RTE_SET_USED(m); +#endif +} + + static inline uint16_t rte_pktmbuf_priv_size(struct rte_mempool *mp); /** @@ -936,29 +1012,11 @@ struct rte_pktmbuf_pool_private { /** check mbuf type in debug mode */ #define __rte_mbuf_sanity_check(m, is_h) rte_mbuf_sanity_check(m, is_h) -/** check mbuf type in debug mode if mbuf pointer is not null */ -#define __rte_mbuf_sanity_check_raw(m, is_h) do { \ - if ((m) != NULL) \ - rte_mbuf_sanity_check(m, is_h); \ -} while (0) - -/** MBUF asserts in debug mode */ -#define RTE_MBUF_ASSERT(exp) \ -if (!(exp)) { \ - rte_panic("line%d\tassert \"" #exp "\" failed\n", __LINE__); \ -} - #else /* RTE_LIBRTE_MBUF_DEBUG */ /** check mbuf type in debug mode */ #define __rte_mbuf_sanity_check(m, is_h) do { } while (0) -/** check mbuf type in debug mode if mbuf pointer is not null */ -#define __rte_mbuf_sanity_check_raw(m, is_h) do { } while (0) - 
-/** MBUF asserts in debug mode */ -#define RTE_MBUF_ASSERT(exp) do { } while (0) - #endif /* RTE_LIBRTE_MBUF_DEBUG */ #ifdef RTE_MBUF_REFCNT_ATOMIC @@ -1071,9 +1129,12 @@ void rte_mbuf_sanity_check(const struct rte_mbuf *m, int is_header); /** - * @internal Allocate a new mbuf from mempool *mp*. - * The use of that function is reserved for RTE internal needs. - * Please use rte_pktmbuf_alloc(). + * Allocate an unitialized mbuf from mempool *mp*. + * + * This function can be used by PMDs (especially in RX functions) to + * allocate an unitialized mbuf. The driver is responsible of + * initializing all the required fields. See rte_pktmbuf_reset(). + * For standard needs, prefer rte_pktmbuf_alloc(). * * @param mp * The mempool from which mbuf is allocated. @@ -1081,18 +1142,28 @@ rte_mbuf_sanity_check(const struct rte_mbuf *m, int is_header); * - The pointer to the new mbuf on success. * - NULL if allocation failed. */ -static inline struct rte_mbuf *__rte_mbuf_raw_alloc(struct rte_mempool *mp) +static inline struct rte_mbuf *rte_mbuf_raw_alloc(struct rte_mempool *mp) { struct rte_mbuf *m; void *mb = NULL; + if (rte_mempool_get(mp, &mb) < 0) return NULL; m = (struct rte_mbuf *)mb; - RTE_MBUF_ASSERT(rte_mbuf_refcnt_read(m) == 0); + RTE_ASSERT(rte_mbuf_refcnt_read(m) == 0); rte_mbuf_refcnt_set(m, 1); + __rte_mbuf_sanity_check(m, 0); + return m; } +/* compat with older versions */ +__rte_deprecated static inline struct rte_mbuf * +__rte_mbuf_raw_alloc(struct rte_mempool *mp) +{ + return rte_mbuf_raw_alloc(mp); +} + /** * @internal Put mbuf back into its original mempool. * The use of that function is reserved for RTE internal needs. 
@@ -1104,7 +1175,7 @@ static inline struct rte_mbuf *__rte_mbuf_raw_alloc(struct rte_mempool *mp) static inline void __attribute__((always_inline)) __rte_mbuf_raw_free(struct rte_mbuf *m) { - RTE_MBUF_ASSERT(rte_mbuf_refcnt_read(m) == 0); + RTE_ASSERT(rte_mbuf_refcnt_read(m) == 0); rte_mempool_put(m->pool, m); } @@ -1356,7 +1427,7 @@ static inline void rte_pktmbuf_reset(struct rte_mbuf *m) static inline struct rte_mbuf *rte_pktmbuf_alloc(struct rte_mempool *mp) { struct rte_mbuf *m; - if ((m = __rte_mbuf_raw_alloc(mp)) != NULL) + if ((m = rte_mbuf_raw_alloc(mp)) != NULL) rte_pktmbuf_reset(m); return m; } @@ -1392,22 +1463,22 @@ static inline int rte_pktmbuf_alloc_bulk(struct rte_mempool *pool, switch (count % 4) { case 0: while (idx != count) { - RTE_MBUF_ASSERT(rte_mbuf_refcnt_read(mbufs[idx]) == 0); + RTE_ASSERT(rte_mbuf_refcnt_read(mbufs[idx]) == 0); rte_mbuf_refcnt_set(mbufs[idx], 1); rte_pktmbuf_reset(mbufs[idx]); idx++; case 3: - RTE_MBUF_ASSERT(rte_mbuf_refcnt_read(mbufs[idx]) == 0); + RTE_ASSERT(rte_mbuf_refcnt_read(mbufs[idx]) == 0); rte_mbuf_refcnt_set(mbufs[idx], 1); rte_pktmbuf_reset(mbufs[idx]); idx++; case 2: - RTE_MBUF_ASSERT(rte_mbuf_refcnt_read(mbufs[idx]) == 0); + RTE_ASSERT(rte_mbuf_refcnt_read(mbufs[idx]) == 0); rte_mbuf_refcnt_set(mbufs[idx], 1); rte_pktmbuf_reset(mbufs[idx]); idx++; case 1: - RTE_MBUF_ASSERT(rte_mbuf_refcnt_read(mbufs[idx]) == 0); + RTE_ASSERT(rte_mbuf_refcnt_read(mbufs[idx]) == 0); rte_mbuf_refcnt_set(mbufs[idx], 1); rte_pktmbuf_reset(mbufs[idx]); idx++; @@ -1421,6 +1492,8 @@ static inline int rte_pktmbuf_alloc_bulk(struct rte_mempool *pool, * * After attachment we refer the mbuf we attached as 'indirect', * while mbuf we attached to as 'direct'. + * The direct mbuf's reference counter is incremented. + * * Right now, not supported: * - attachment for already indirect mbuf (e.g. - mi has to be direct). 
* - mbuf we trying to attach (mi) is used by someone else @@ -1435,7 +1508,7 @@ static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *m) { struct rte_mbuf *md; - RTE_MBUF_ASSERT(RTE_MBUF_DIRECT(mi) && + RTE_ASSERT(RTE_MBUF_DIRECT(mi) && rte_mbuf_refcnt_read(mi) == 1); /* if m is not direct, get the mbuf that embeds the data */ @@ -1474,13 +1547,17 @@ static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *m) * * - restore original mbuf address and length values. * - reset pktmbuf data and data_len to their default values. - * All other fields of the given packet mbuf will be left intact. + * - decrement the direct mbuf's reference counter. When the + * reference counter becomes 0, the direct mbuf is freed. + * + * All other fields of the given packet mbuf will be left intact. * * @param m * The indirect attached packet mbuf. */ static inline void rte_pktmbuf_detach(struct rte_mbuf *m) { + struct rte_mbuf *md = rte_mbuf_from_indirect(m); struct rte_mempool *mp = m->pool; uint32_t mbuf_size, buf_len, priv_size; @@ -1495,6 +1572,9 @@ static inline void rte_pktmbuf_detach(struct rte_mbuf *m) m->data_off = RTE_MIN(RTE_PKTMBUF_HEADROOM, (uint16_t)m->buf_len); m->data_len = 0; m->ol_flags = 0; + + if (rte_mbuf_refcnt_update(md, -1) == 0) + __rte_mbuf_raw_free(md); } static inline struct rte_mbuf* __attribute__((always_inline)) @@ -1503,17 +1583,9 @@ __rte_pktmbuf_prefree_seg(struct rte_mbuf *m) __rte_mbuf_sanity_check(m, 0); if (likely(rte_mbuf_refcnt_update(m, -1) == 0)) { - - /* if this is an indirect mbuf, then - * - detach mbuf - * - free attached mbuf segment - */ - if (RTE_MBUF_INDIRECT(m)) { - struct rte_mbuf *md = rte_mbuf_from_indirect(m); + /* if this is an indirect mbuf, it is detached. 
*/ + if (RTE_MBUF_INDIRECT(m)) rte_pktmbuf_detach(m); - if (rte_mbuf_refcnt_update(md, -1) == 0) - __rte_mbuf_raw_free(md); - } return m; } return NULL; diff --git a/lib/librte_mempool/Makefile b/lib/librte_mempool/Makefile index a6898eff..057a6ab4 100644 --- a/lib/librte_mempool/Makefile +++ b/lib/librte_mempool/Makefile @@ -38,13 +38,13 @@ CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 EXPORT_MAP := rte_mempool_version.map -LIBABIVER := 1 +LIBABIVER := 2 # all source are stored in SRCS-y SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) += rte_mempool.c -ifeq ($(CONFIG_RTE_LIBRTE_XEN_DOM0),y) -SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) += rte_dom0_mempool.c -endif +SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) += rte_mempool_ops.c +SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) += rte_mempool_ring.c +SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) += rte_mempool_stack.c # install includes SYMLINK-$(CONFIG_RTE_LIBRTE_MEMPOOL)-include := rte_mempool.h diff --git a/lib/librte_mempool/rte_dom0_mempool.c b/lib/librte_mempool/rte_dom0_mempool.c deleted file mode 100644 index 0d6d7504..00000000 --- a/lib/librte_mempool/rte_dom0_mempool.c +++ /dev/null @@ -1,133 +0,0 @@ -/*- - * BSD LICENSE - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include <stdio.h> -#include <string.h> -#include <stdint.h> -#include <unistd.h> -#include <stdarg.h> -#include <inttypes.h> -#include <errno.h> -#include <sys/queue.h> - -#include <rte_common.h> -#include <rte_log.h> -#include <rte_debug.h> -#include <rte_memory.h> -#include <rte_memzone.h> -#include <rte_atomic.h> -#include <rte_launch.h> -#include <rte_eal.h> -#include <rte_eal_memconfig.h> -#include <rte_per_lcore.h> -#include <rte_lcore.h> -#include <rte_branch_prediction.h> -#include <rte_ring.h> -#include <rte_errno.h> -#include <rte_string_fns.h> -#include <rte_spinlock.h> - -#include "rte_mempool.h" - -static void -get_phys_map(void *va, phys_addr_t pa[], uint32_t pg_num, - uint32_t pg_sz, uint32_t memseg_id) -{ - uint32_t i; - uint64_t virt_addr, mfn_id; - struct rte_mem_config *mcfg; - uint32_t page_size = getpagesize(); - - /* get pointer to global configuration */ - mcfg = rte_eal_get_configuration()->mem_config; - virt_addr = (uintptr_t) mcfg->memseg[memseg_id].addr; - - for (i = 0; i != pg_num; i++) { - mfn_id = ((uintptr_t)va + i * pg_sz - virt_addr) / RTE_PGSIZE_2M; - pa[i] = mcfg->memseg[memseg_id].mfn[mfn_id] * page_size; - } -} - -/* create the 
mempool for supporting Dom0 */ -struct rte_mempool * -rte_dom0_mempool_create(const char *name, unsigned elt_num, unsigned elt_size, - unsigned cache_size, unsigned private_data_size, - rte_mempool_ctor_t *mp_init, void *mp_init_arg, - rte_mempool_obj_ctor_t *obj_init, void *obj_init_arg, - int socket_id, unsigned flags) -{ - struct rte_mempool *mp = NULL; - phys_addr_t *pa; - char *va; - size_t sz; - uint32_t pg_num, pg_shift, pg_sz, total_size; - const struct rte_memzone *mz; - char mz_name[RTE_MEMZONE_NAMESIZE]; - int mz_flags = RTE_MEMZONE_1GB|RTE_MEMZONE_SIZE_HINT_ONLY; - - pg_sz = RTE_PGSIZE_2M; - - pg_shift = rte_bsf32(pg_sz); - total_size = rte_mempool_calc_obj_size(elt_size, flags, NULL); - - /* calc max memory size and max number of pages needed. */ - sz = rte_mempool_xmem_size(elt_num, total_size, pg_shift) + - RTE_PGSIZE_2M; - pg_num = sz >> pg_shift; - - /* extract physical mappings of the allocated memory. */ - pa = calloc(pg_num, sizeof (*pa)); - if (pa == NULL) - return mp; - - snprintf(mz_name, sizeof(mz_name), RTE_MEMPOOL_OBJ_NAME, name); - mz = rte_memzone_reserve(mz_name, sz, socket_id, mz_flags); - if (mz == NULL) { - free(pa); - return mp; - } - - va = (char *)RTE_ALIGN_CEIL((uintptr_t)mz->addr, RTE_PGSIZE_2M); - /* extract physical mappings of the allocated memory. */ - get_phys_map(va, pa, pg_num, pg_sz, mz->memseg_id); - - mp = rte_mempool_xmem_create(name, elt_num, elt_size, - cache_size, private_data_size, - mp_init, mp_init_arg, - obj_init, obj_init_arg, - socket_id, flags, va, pa, pg_num, pg_shift); - - free(pa); - - return mp; -} diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c index f8781e17..d78d02b7 100644 --- a/lib/librte_mempool/rte_mempool.c +++ b/lib/librte_mempool/rte_mempool.c @@ -2,6 +2,7 @@ * BSD LICENSE * * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * Copyright(c) 2016 6WIND S.A. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -39,6 +40,7 @@ #include <inttypes.h> #include <errno.h> #include <sys/queue.h> +#include <sys/mman.h> #include <rte_common.h> #include <rte_log.h> @@ -127,127 +129,63 @@ static unsigned optimize_object_size(unsigned obj_size) } static void -mempool_add_elem(struct rte_mempool *mp, void *obj, uint32_t obj_idx, - rte_mempool_obj_ctor_t *obj_init, void *obj_init_arg) +mempool_add_elem(struct rte_mempool *mp, void *obj, phys_addr_t physaddr) { struct rte_mempool_objhdr *hdr; struct rte_mempool_objtlr *tlr __rte_unused; - obj = (char *)obj + mp->header_size; - /* set mempool ptr in header */ hdr = RTE_PTR_SUB(obj, sizeof(*hdr)); hdr->mp = mp; + hdr->physaddr = physaddr; + STAILQ_INSERT_TAIL(&mp->elt_list, hdr, next); + mp->populated_size++; #ifdef RTE_LIBRTE_MEMPOOL_DEBUG hdr->cookie = RTE_MEMPOOL_HEADER_COOKIE2; tlr = __mempool_get_trailer(obj); tlr->cookie = RTE_MEMPOOL_TRAILER_COOKIE; #endif - /* call the initializer */ - if (obj_init) - obj_init(mp, obj_init_arg, obj, obj_idx); /* enqueue in ring */ - rte_ring_sp_enqueue(mp->ring, obj); + rte_mempool_ops_enqueue_bulk(mp, &obj, 1); } +/* call obj_cb() for each mempool element */ uint32_t -rte_mempool_obj_iter(void *vaddr, uint32_t elt_num, size_t elt_sz, size_t align, - const phys_addr_t paddr[], uint32_t pg_num, uint32_t pg_shift, - rte_mempool_obj_iter_t obj_iter, void *obj_iter_arg) +rte_mempool_obj_iter(struct rte_mempool *mp, + rte_mempool_obj_cb_t *obj_cb, void *obj_cb_arg) { - uint32_t i, j, k; - uint32_t pgn, pgf; - uintptr_t end, start, va; - uintptr_t pg_sz; - - pg_sz = (uintptr_t)1 << pg_shift; - va = (uintptr_t)vaddr; - - i = 0; - j = 0; - - while (i != elt_num && j != pg_num) { - - start = RTE_ALIGN_CEIL(va, align); - end = start + elt_sz; - - /* index of the first page for the next element. */ - pgf = (end >> pg_shift) - (start >> pg_shift); - - /* index of the last page for the current element. 
*/ - pgn = ((end - 1) >> pg_shift) - (start >> pg_shift); - pgn += j; - - /* do we have enough space left for the element. */ - if (pgn >= pg_num) - break; - - for (k = j; - k != pgn && - paddr[k] + pg_sz == paddr[k + 1]; - k++) - ; + struct rte_mempool_objhdr *hdr; + void *obj; + unsigned n = 0; - /* - * if next pgn chunks of memory physically continuous, - * use it to create next element. - * otherwise, just skip that chunk unused. - */ - if (k == pgn) { - if (obj_iter != NULL) - obj_iter(obj_iter_arg, (void *)start, - (void *)end, i); - va = end; - j += pgf; - i++; - } else { - va = RTE_ALIGN_CEIL((va + 1), pg_sz); - j++; - } + STAILQ_FOREACH(hdr, &mp->elt_list, next) { + obj = (char *)hdr + sizeof(*hdr); + obj_cb(mp, obj_cb_arg, obj, n); + n++; } - return i; -} - -/* - * Populate mempool with the objects. - */ - -struct mempool_populate_arg { - struct rte_mempool *mp; - rte_mempool_obj_ctor_t *obj_init; - void *obj_init_arg; -}; - -static void -mempool_obj_populate(void *arg, void *start, void *end, uint32_t idx) -{ - struct mempool_populate_arg *pa = arg; - - mempool_add_elem(pa->mp, start, idx, pa->obj_init, pa->obj_init_arg); - pa->mp->elt_va_end = (uintptr_t)end; + return n; } -static void -mempool_populate(struct rte_mempool *mp, size_t num, size_t align, - rte_mempool_obj_ctor_t *obj_init, void *obj_init_arg) +/* call mem_cb() for each mempool memory chunk */ +uint32_t +rte_mempool_mem_iter(struct rte_mempool *mp, + rte_mempool_mem_cb_t *mem_cb, void *mem_cb_arg) { - uint32_t elt_sz; - struct mempool_populate_arg arg; + struct rte_mempool_memhdr *hdr; + unsigned n = 0; - elt_sz = mp->elt_size + mp->header_size + mp->trailer_size; - arg.mp = mp; - arg.obj_init = obj_init; - arg.obj_init_arg = obj_init_arg; + STAILQ_FOREACH(hdr, &mp->mem_list, next) { + mem_cb(mp, mem_cb_arg, hdr, n); + n++; + } - mp->size = rte_mempool_obj_iter((void *)mp->elt_va_start, - num, elt_sz, align, - mp->elt_pa, mp->pg_num, mp->pg_shift, - mempool_obj_populate, &arg); + return n; 
} +/* get the header, trailer and total size of a mempool element. */ uint32_t rte_mempool_calc_obj_size(uint32_t elt_size, uint32_t flags, struct rte_mempool_objsz *sz) @@ -256,24 +194,13 @@ rte_mempool_calc_obj_size(uint32_t elt_size, uint32_t flags, sz = (sz != NULL) ? sz : &lsz; - /* - * In header, we have at least the pointer to the pool, and - * optionaly a 64 bits cookie. - */ - sz->header_size = 0; - sz->header_size += sizeof(struct rte_mempool *); /* ptr to pool */ -#ifdef RTE_LIBRTE_MEMPOOL_DEBUG - sz->header_size += sizeof(uint64_t); /* cookie */ -#endif + sz->header_size = sizeof(struct rte_mempool_objhdr); if ((flags & MEMPOOL_F_NO_CACHE_ALIGN) == 0) sz->header_size = RTE_ALIGN_CEIL(sz->header_size, RTE_MEMPOOL_ALIGN); - /* trailer contains the cookie in debug mode */ - sz->trailer_size = 0; -#ifdef RTE_LIBRTE_MEMPOOL_DEBUG - sz->trailer_size += sizeof(uint64_t); /* cookie */ -#endif + sz->trailer_size = sizeof(struct rte_mempool_objtlr); + /* element size is 8 bytes-aligned at least */ sz->elt_size = RTE_ALIGN_CEIL(elt_size, sizeof(uint64_t)); @@ -297,23 +224,6 @@ rte_mempool_calc_obj_size(uint32_t elt_size, uint32_t flags, sz->trailer_size = new_size - sz->header_size - sz->elt_size; } - if (! 
rte_eal_has_hugepages()) { - /* - * compute trailer size so that pool elements fit exactly in - * a standard page - */ - int page_size = getpagesize(); - int new_size = page_size - sz->header_size - sz->elt_size; - if (new_size < 0 || (unsigned int)new_size < sz->trailer_size) { - printf("When hugepages are disabled, pool objects " - "can't exceed PAGE_SIZE: %d + %d + %d > %d\n", - sz->header_size, sz->elt_size, sz->trailer_size, - page_size); - return 0; - } - sz->trailer_size = new_size; - } - /* this is the size of an object, including header and trailer */ sz->total_size = sz->header_size + sz->elt_size + sz->trailer_size; @@ -325,139 +235,514 @@ rte_mempool_calc_obj_size(uint32_t elt_size, uint32_t flags, * Calculate maximum amount of memory required to store given number of objects. */ size_t -rte_mempool_xmem_size(uint32_t elt_num, size_t elt_sz, uint32_t pg_shift) +rte_mempool_xmem_size(uint32_t elt_num, size_t total_elt_sz, uint32_t pg_shift) { - size_t n, pg_num, pg_sz, sz; + size_t obj_per_page, pg_num, pg_sz; - pg_sz = (size_t)1 << pg_shift; + if (total_elt_sz == 0) + return 0; - if ((n = pg_sz / elt_sz) > 0) { - pg_num = (elt_num + n - 1) / n; - sz = pg_num << pg_shift; - } else { - sz = RTE_ALIGN_CEIL(elt_sz, pg_sz) * elt_num; - } + if (pg_shift == 0) + return total_elt_sz * elt_num; - return sz; + pg_sz = (size_t)1 << pg_shift; + obj_per_page = pg_sz / total_elt_sz; + if (obj_per_page == 0) + return RTE_ALIGN_CEIL(total_elt_sz, pg_sz) * elt_num; + + pg_num = (elt_num + obj_per_page - 1) / obj_per_page; + return pg_num << pg_shift; } /* * Calculate how much memory would be actually required with the * given memory footprint to store required number of elements. 
*/ +ssize_t +rte_mempool_xmem_usage(__rte_unused void *vaddr, uint32_t elt_num, + size_t total_elt_sz, const phys_addr_t paddr[], uint32_t pg_num, + uint32_t pg_shift) +{ + uint32_t elt_cnt = 0; + phys_addr_t start, end; + uint32_t paddr_idx; + size_t pg_sz = (size_t)1 << pg_shift; + + /* if paddr is NULL, assume contiguous memory */ + if (paddr == NULL) { + start = 0; + end = pg_sz * pg_num; + paddr_idx = pg_num; + } else { + start = paddr[0]; + end = paddr[0] + pg_sz; + paddr_idx = 1; + } + while (elt_cnt < elt_num) { + + if (end - start >= total_elt_sz) { + /* enough contiguous memory, add an object */ + start += total_elt_sz; + elt_cnt++; + } else if (paddr_idx < pg_num) { + /* no room to store one obj, add a page */ + if (end == paddr[paddr_idx]) { + end += pg_sz; + } else { + start = paddr[paddr_idx]; + end = paddr[paddr_idx] + pg_sz; + } + paddr_idx++; + + } else { + /* no more page, return how many elements fit */ + return -(size_t)elt_cnt; + } + } + + return (size_t)paddr_idx << pg_shift; +} + +/* free a memchunk allocated with rte_memzone_reserve() */ static void -mempool_lelem_iter(void *arg, __rte_unused void *start, void *end, - __rte_unused uint32_t idx) +rte_mempool_memchunk_mz_free(__rte_unused struct rte_mempool_memhdr *memhdr, + void *opaque) { - *(uintptr_t *)arg = (uintptr_t)end; + const struct rte_memzone *mz = opaque; + rte_memzone_free(mz); } -ssize_t -rte_mempool_xmem_usage(void *vaddr, uint32_t elt_num, size_t elt_sz, - const phys_addr_t paddr[], uint32_t pg_num, uint32_t pg_shift) +/* Free memory chunks used by a mempool. 
Objects must be in pool */ +static void +rte_mempool_free_memchunks(struct rte_mempool *mp) { - uint32_t n; - uintptr_t va, uv; - size_t pg_sz, usz; + struct rte_mempool_memhdr *memhdr; + void *elt; + + while (!STAILQ_EMPTY(&mp->elt_list)) { + rte_mempool_ops_dequeue_bulk(mp, &elt, 1); + (void)elt; + STAILQ_REMOVE_HEAD(&mp->elt_list, next); + mp->populated_size--; + } - pg_sz = (size_t)1 << pg_shift; - va = (uintptr_t)vaddr; - uv = va; + while (!STAILQ_EMPTY(&mp->mem_list)) { + memhdr = STAILQ_FIRST(&mp->mem_list); + STAILQ_REMOVE_HEAD(&mp->mem_list, next); + if (memhdr->free_cb != NULL) + memhdr->free_cb(memhdr, memhdr->opaque); + rte_free(memhdr); + mp->nb_mem_chunks--; + } +} + +/* Add objects in the pool, using a physically contiguous memory + * zone. Return the number of objects added, or a negative value + * on error. + */ +int +rte_mempool_populate_phys(struct rte_mempool *mp, char *vaddr, + phys_addr_t paddr, size_t len, rte_mempool_memchunk_free_cb_t *free_cb, + void *opaque) +{ + unsigned total_elt_sz; + unsigned i = 0; + size_t off; + struct rte_mempool_memhdr *memhdr; + int ret; + + /* create the internal ring if not already done */ + if ((mp->flags & MEMPOOL_F_POOL_CREATED) == 0) { + ret = rte_mempool_ops_alloc(mp); + if (ret != 0) + return ret; + mp->flags |= MEMPOOL_F_POOL_CREATED; + } + + /* mempool is already populated */ + if (mp->populated_size >= mp->size) + return -ENOSPC; + + total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size; - if ((n = rte_mempool_obj_iter(vaddr, elt_num, elt_sz, 1, - paddr, pg_num, pg_shift, mempool_lelem_iter, - &uv)) != elt_num) { - return -(ssize_t)n; + memhdr = rte_zmalloc("MEMPOOL_MEMHDR", sizeof(*memhdr), 0); + if (memhdr == NULL) + return -ENOMEM; + + memhdr->mp = mp; + memhdr->addr = vaddr; + memhdr->phys_addr = paddr; + memhdr->len = len; + memhdr->free_cb = free_cb; + memhdr->opaque = opaque; + + if (mp->flags & MEMPOOL_F_NO_CACHE_ALIGN) + off = RTE_PTR_ALIGN_CEIL(vaddr, 8) - vaddr; + else + off = 
RTE_PTR_ALIGN_CEIL(vaddr, RTE_CACHE_LINE_SIZE) - vaddr; + + while (off + total_elt_sz <= len && mp->populated_size < mp->size) { + off += mp->header_size; + if (paddr == RTE_BAD_PHYS_ADDR) + mempool_add_elem(mp, (char *)vaddr + off, + RTE_BAD_PHYS_ADDR); + else + mempool_add_elem(mp, (char *)vaddr + off, paddr + off); + off += mp->elt_size + mp->trailer_size; + i++; } - uv = RTE_ALIGN_CEIL(uv, pg_sz); - usz = uv - va; - return usz; + /* not enough room to store one object */ + if (i == 0) + return -EINVAL; + + STAILQ_INSERT_TAIL(&mp->mem_list, memhdr, next); + mp->nb_mem_chunks++; + return i; } -#ifndef RTE_LIBRTE_XEN_DOM0 -/* stub if DOM0 support not configured */ -struct rte_mempool * -rte_dom0_mempool_create(const char *name __rte_unused, - unsigned n __rte_unused, - unsigned elt_size __rte_unused, - unsigned cache_size __rte_unused, - unsigned private_data_size __rte_unused, - rte_mempool_ctor_t *mp_init __rte_unused, - void *mp_init_arg __rte_unused, - rte_mempool_obj_ctor_t *obj_init __rte_unused, - void *obj_init_arg __rte_unused, - int socket_id __rte_unused, - unsigned flags __rte_unused) -{ - rte_errno = EINVAL; - return NULL; +/* Add objects in the pool, using a table of physical pages. Return the + * number of objects added, or a negative value on error. 
+ */ +int +rte_mempool_populate_phys_tab(struct rte_mempool *mp, char *vaddr, + const phys_addr_t paddr[], uint32_t pg_num, uint32_t pg_shift, + rte_mempool_memchunk_free_cb_t *free_cb, void *opaque) +{ + uint32_t i, n; + int ret, cnt = 0; + size_t pg_sz = (size_t)1 << pg_shift; + + /* mempool must not be populated */ + if (mp->nb_mem_chunks != 0) + return -EEXIST; + + if (mp->flags & MEMPOOL_F_NO_PHYS_CONTIG) + return rte_mempool_populate_phys(mp, vaddr, RTE_BAD_PHYS_ADDR, + pg_num * pg_sz, free_cb, opaque); + + for (i = 0; i < pg_num && mp->populated_size < mp->size; i += n) { + + /* populate with the largest group of contiguous pages */ + for (n = 1; (i + n) < pg_num && + paddr[i] + pg_sz == paddr[i+n]; n++) + ; + + ret = rte_mempool_populate_phys(mp, vaddr + i * pg_sz, + paddr[i], n * pg_sz, free_cb, opaque); + if (ret < 0) { + rte_mempool_free_memchunks(mp); + return ret; + } + /* no need to call the free callback for next chunks */ + free_cb = NULL; + cnt += ret; + } + return cnt; } -#endif -/* create the mempool */ -struct rte_mempool * -rte_mempool_create(const char *name, unsigned n, unsigned elt_size, - unsigned cache_size, unsigned private_data_size, - rte_mempool_ctor_t *mp_init, void *mp_init_arg, - rte_mempool_obj_ctor_t *obj_init, void *obj_init_arg, - int socket_id, unsigned flags) -{ - if (rte_xen_dom0_supported()) - return rte_dom0_mempool_create(name, n, elt_size, - cache_size, private_data_size, - mp_init, mp_init_arg, - obj_init, obj_init_arg, - socket_id, flags); - else - return rte_mempool_xmem_create(name, n, elt_size, - cache_size, private_data_size, - mp_init, mp_init_arg, - obj_init, obj_init_arg, - socket_id, flags, - NULL, NULL, MEMPOOL_PG_NUM_DEFAULT, - MEMPOOL_PG_SHIFT_MAX); +/* Populate the mempool with a virtual area. Return the number of + * objects added, or a negative value on error. 
+ */ +int +rte_mempool_populate_virt(struct rte_mempool *mp, char *addr, + size_t len, size_t pg_sz, rte_mempool_memchunk_free_cb_t *free_cb, + void *opaque) +{ + phys_addr_t paddr; + size_t off, phys_len; + int ret, cnt = 0; + + /* mempool must not be populated */ + if (mp->nb_mem_chunks != 0) + return -EEXIST; + /* address and len must be page-aligned */ + if (RTE_PTR_ALIGN_CEIL(addr, pg_sz) != addr) + return -EINVAL; + if (RTE_ALIGN_CEIL(len, pg_sz) != len) + return -EINVAL; + + if (mp->flags & MEMPOOL_F_NO_PHYS_CONTIG) + return rte_mempool_populate_phys(mp, addr, RTE_BAD_PHYS_ADDR, + len, free_cb, opaque); + + for (off = 0; off + pg_sz <= len && + mp->populated_size < mp->size; off += phys_len) { + + paddr = rte_mem_virt2phy(addr + off); + /* required for xen_dom0 to get the machine address */ + paddr = rte_mem_phy2mch(-1, paddr); + + if (paddr == RTE_BAD_PHYS_ADDR) { + ret = -EINVAL; + goto fail; + } + + /* populate with the largest group of contiguous pages */ + for (phys_len = pg_sz; off + phys_len < len; phys_len += pg_sz) { + phys_addr_t paddr_tmp; + + paddr_tmp = rte_mem_virt2phy(addr + off + phys_len); + paddr_tmp = rte_mem_phy2mch(-1, paddr_tmp); + + if (paddr_tmp != paddr + phys_len) + break; + } + + ret = rte_mempool_populate_phys(mp, addr + off, paddr, + phys_len, free_cb, opaque); + if (ret < 0) + goto fail; + /* no need to call the free callback for next chunks */ + free_cb = NULL; + cnt += ret; + } + + return cnt; + + fail: + rte_mempool_free_memchunks(mp); + return ret; +} + +/* Default function to populate the mempool: allocate memory in memzones, + * and populate them. Return the number of objects added, or a negative + * value on error. 
+ */ +int +rte_mempool_populate_default(struct rte_mempool *mp) +{ + int mz_flags = RTE_MEMZONE_1GB|RTE_MEMZONE_SIZE_HINT_ONLY; + char mz_name[RTE_MEMZONE_NAMESIZE]; + const struct rte_memzone *mz; + size_t size, total_elt_sz, align, pg_sz, pg_shift; + phys_addr_t paddr; + unsigned mz_id, n; + int ret; + + /* mempool must not be populated */ + if (mp->nb_mem_chunks != 0) + return -EEXIST; + + if (rte_eal_has_hugepages()) { + pg_shift = 0; /* not needed, zone is physically contiguous */ + pg_sz = 0; + align = RTE_CACHE_LINE_SIZE; + } else { + pg_sz = getpagesize(); + pg_shift = rte_bsf32(pg_sz); + align = pg_sz; + } + + total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size; + for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) { + size = rte_mempool_xmem_size(n, total_elt_sz, pg_shift); + + ret = snprintf(mz_name, sizeof(mz_name), + RTE_MEMPOOL_MZ_FORMAT "_%d", mp->name, mz_id); + if (ret < 0 || ret >= (int)sizeof(mz_name)) { + ret = -ENAMETOOLONG; + goto fail; + } + + mz = rte_memzone_reserve_aligned(mz_name, size, + mp->socket_id, mz_flags, align); + /* not enough memory, retry with the biggest zone we have */ + if (mz == NULL) + mz = rte_memzone_reserve_aligned(mz_name, 0, + mp->socket_id, mz_flags, align); + if (mz == NULL) { + ret = -rte_errno; + goto fail; + } + + if (mp->flags & MEMPOOL_F_NO_PHYS_CONTIG) + paddr = RTE_BAD_PHYS_ADDR; + else + paddr = mz->phys_addr; + + if (rte_eal_has_hugepages() && !rte_xen_dom0_supported()) + ret = rte_mempool_populate_phys(mp, mz->addr, + paddr, mz->len, + rte_mempool_memchunk_mz_free, + (void *)(uintptr_t)mz); + else + ret = rte_mempool_populate_virt(mp, mz->addr, + mz->len, pg_sz, + rte_mempool_memchunk_mz_free, + (void *)(uintptr_t)mz); + if (ret < 0) + goto fail; + } + + return mp->size; + + fail: + rte_mempool_free_memchunks(mp); + return ret; +} + +/* return the memory size required for mempool objects in anonymous mem */ +static size_t +get_anon_size(const struct rte_mempool *mp) +{ + size_t size, 
total_elt_sz, pg_sz, pg_shift; + + pg_sz = getpagesize(); + pg_shift = rte_bsf32(pg_sz); + total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size; + size = rte_mempool_xmem_size(mp->size, total_elt_sz, pg_shift); + + return size; +} + +/* unmap a memory zone mapped by rte_mempool_populate_anon() */ +static void +rte_mempool_memchunk_anon_free(struct rte_mempool_memhdr *memhdr, + void *opaque) +{ + munmap(opaque, get_anon_size(memhdr->mp)); +} + +/* populate the mempool with an anonymous mapping */ +int +rte_mempool_populate_anon(struct rte_mempool *mp) +{ + size_t size; + int ret; + char *addr; + + /* mempool is already populated, error */ + if (!STAILQ_EMPTY(&mp->mem_list)) { + rte_errno = EINVAL; + return 0; + } + + /* get chunk of virtually continuous memory */ + size = get_anon_size(mp); + addr = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (addr == MAP_FAILED) { + rte_errno = errno; + return 0; + } + /* can't use MMAP_LOCKED, it does not exist on BSD */ + if (mlock(addr, size) < 0) { + rte_errno = errno; + munmap(addr, size); + return 0; + } + + ret = rte_mempool_populate_virt(mp, addr, size, getpagesize(), + rte_mempool_memchunk_anon_free, addr); + if (ret == 0) + goto fail; + + return mp->populated_size; + + fail: + rte_mempool_free_memchunks(mp); + return 0; +} + +/* free a mempool */ +void +rte_mempool_free(struct rte_mempool *mp) +{ + struct rte_mempool_list *mempool_list = NULL; + struct rte_tailq_entry *te; + + if (mp == NULL) + return; + + mempool_list = RTE_TAILQ_CAST(rte_mempool_tailq.head, rte_mempool_list); + rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); + /* find out tailq entry */ + TAILQ_FOREACH(te, mempool_list, next) { + if (te->data == (void *)mp) + break; + } + + if (te != NULL) { + TAILQ_REMOVE(mempool_list, te, next); + rte_free(te); + } + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + + rte_mempool_free_memchunks(mp); + rte_mempool_ops_free(mp); + rte_memzone_free(mp->mz); +} + +static 
void +mempool_cache_init(struct rte_mempool_cache *cache, uint32_t size) +{ + cache->size = size; + cache->flushthresh = CALC_CACHE_FLUSHTHRESH(size); + cache->len = 0; } /* - * Create the mempool over already allocated chunk of memory. - * That external memory buffer can consists of physically disjoint pages. - * Setting vaddr to NULL, makes mempool to fallback to original behaviour - * and allocate space for mempool and it's elements as one big chunk of - * physically continuos memory. - * */ + * Create and initialize a cache for objects that are retrieved from and + * returned to an underlying mempool. This structure is identical to the + * local_cache[lcore_id] pointed to by the mempool structure. + */ +struct rte_mempool_cache * +rte_mempool_cache_create(uint32_t size, int socket_id) +{ + struct rte_mempool_cache *cache; + + if (size == 0 || size > RTE_MEMPOOL_CACHE_MAX_SIZE) { + rte_errno = EINVAL; + return NULL; + } + + cache = rte_zmalloc_socket("MEMPOOL_CACHE", sizeof(*cache), + RTE_CACHE_LINE_SIZE, socket_id); + if (cache == NULL) { + RTE_LOG(ERR, MEMPOOL, "Cannot allocate mempool cache.\n"); + rte_errno = ENOMEM; + return NULL; + } + + mempool_cache_init(cache, size); + + return cache; +} + +/* + * Free a cache. It's the responsibility of the user to make sure that any + * remaining objects in the cache are flushed to the corresponding + * mempool. 
+ */ +void +rte_mempool_cache_free(struct rte_mempool_cache *cache) +{ + rte_free(cache); +} + +/* create an empty mempool */ struct rte_mempool * -rte_mempool_xmem_create(const char *name, unsigned n, unsigned elt_size, - unsigned cache_size, unsigned private_data_size, - rte_mempool_ctor_t *mp_init, void *mp_init_arg, - rte_mempool_obj_ctor_t *obj_init, void *obj_init_arg, - int socket_id, unsigned flags, void *vaddr, - const phys_addr_t paddr[], uint32_t pg_num, uint32_t pg_shift) +rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size, + unsigned cache_size, unsigned private_data_size, + int socket_id, unsigned flags) { char mz_name[RTE_MEMZONE_NAMESIZE]; - char rg_name[RTE_RING_NAMESIZE]; struct rte_mempool_list *mempool_list; struct rte_mempool *mp = NULL; struct rte_tailq_entry *te = NULL; - struct rte_ring *r = NULL; - const struct rte_memzone *mz; + const struct rte_memzone *mz = NULL; size_t mempool_size; int mz_flags = RTE_MEMZONE_1GB|RTE_MEMZONE_SIZE_HINT_ONLY; - int rg_flags = 0; - void *obj; struct rte_mempool_objsz objsz; - void *startaddr; - int page_size = getpagesize(); + unsigned lcore_id; + int ret; /* compilation-time checks */ RTE_BUILD_BUG_ON((sizeof(struct rte_mempool) & RTE_CACHE_LINE_MASK) != 0); -#if RTE_MEMPOOL_CACHE_MAX_SIZE > 0 RTE_BUILD_BUG_ON((sizeof(struct rte_mempool_cache) & RTE_CACHE_LINE_MASK) != 0); - RTE_BUILD_BUG_ON((offsetof(struct rte_mempool, local_cache) & - RTE_CACHE_LINE_MASK) != 0); -#endif #ifdef RTE_LIBRTE_MEMPOOL_DEBUG RTE_BUILD_BUG_ON((sizeof(struct rte_mempool_debug_stats) & RTE_CACHE_LINE_MASK) != 0); @@ -474,28 +759,10 @@ rte_mempool_xmem_create(const char *name, unsigned n, unsigned elt_size, return NULL; } - /* check that we have both VA and PA */ - if (vaddr != NULL && paddr == NULL) { - rte_errno = EINVAL; - return NULL; - } - - /* Check that pg_num and pg_shift parameters are valid. 
*/ - if (pg_num < RTE_DIM(mp->elt_pa) || pg_shift > MEMPOOL_PG_SHIFT_MAX) { - rte_errno = EINVAL; - return NULL; - } - /* "no cache align" imply "no spread" */ if (flags & MEMPOOL_F_NO_CACHE_ALIGN) flags |= MEMPOOL_F_NO_SPREAD; - /* ring flags */ - if (flags & MEMPOOL_F_SP_PUT) - rg_flags |= RING_F_SP_ENQ; - if (flags & MEMPOOL_F_SC_GET) - rg_flags |= RING_F_SC_DEQ; - /* calculate mempool object sizes. */ if (!rte_mempool_calc_obj_size(elt_size, flags, &objsz)) { rte_errno = EINVAL; @@ -504,15 +771,6 @@ rte_mempool_xmem_create(const char *name, unsigned n, unsigned elt_size, rte_rwlock_write_lock(RTE_EAL_MEMPOOL_RWLOCK); - /* allocate the ring that will be used to store objects */ - /* Ring functions will return appropriate errors if we are - * running as a secondary process etc., so no checks made - * in this function for that condition */ - snprintf(rg_name, sizeof(rg_name), RTE_MEMPOOL_MZ_FORMAT, name); - r = rte_ring_create(rg_name, rte_align32pow2(n+1), socket_id, rg_flags); - if (r == NULL) - goto exit_unlock; - /* * reserve a memory zone for this mempool: private data is * cache-aligned @@ -520,17 +778,6 @@ rte_mempool_xmem_create(const char *name, unsigned n, unsigned elt_size, private_data_size = (private_data_size + RTE_MEMPOOL_ALIGN_MASK) & (~RTE_MEMPOOL_ALIGN_MASK); - if (! rte_eal_has_hugepages()) { - /* - * expand private data size to a whole page, so that the - * first pool element will start on a new standard page - */ - int head = sizeof(struct rte_mempool); - int new_size = (private_data_size + head) % page_size; - if (new_size) { - private_data_size += page_size - new_size; - } - } /* try to allocate tailq entry */ te = rte_zmalloc("MEMPOOL_TAILQ_ENTRY", sizeof(*te), 0); @@ -539,89 +786,57 @@ rte_mempool_xmem_create(const char *name, unsigned n, unsigned elt_size, goto exit_unlock; } - /* - * If user provided an external memory buffer, then use it to - * store mempool objects. 
Otherwise reserve a memzone that is large - * enough to hold mempool header and metadata plus mempool objects. - */ - mempool_size = MEMPOOL_HEADER_SIZE(mp, pg_num) + private_data_size; + mempool_size = MEMPOOL_HEADER_SIZE(mp, cache_size); + mempool_size += private_data_size; mempool_size = RTE_ALIGN_CEIL(mempool_size, RTE_MEMPOOL_ALIGN); - if (vaddr == NULL) - mempool_size += (size_t)objsz.total_size * n; - if (! rte_eal_has_hugepages()) { - /* - * we want the memory pool to start on a page boundary, - * because pool elements crossing page boundaries would - * result in discontiguous physical addresses - */ - mempool_size += page_size; + ret = snprintf(mz_name, sizeof(mz_name), RTE_MEMPOOL_MZ_FORMAT, name); + if (ret < 0 || ret >= (int)sizeof(mz_name)) { + rte_errno = ENAMETOOLONG; + goto exit_unlock; } - snprintf(mz_name, sizeof(mz_name), RTE_MEMPOOL_MZ_FORMAT, name); - mz = rte_memzone_reserve(mz_name, mempool_size, socket_id, mz_flags); if (mz == NULL) goto exit_unlock; - if (rte_eal_has_hugepages()) { - startaddr = (void*)mz->addr; - } else { - /* align memory pool start address on a page boundary */ - unsigned long addr = (unsigned long)mz->addr; - if (addr & (page_size - 1)) { - addr += page_size; - addr &= ~(page_size - 1); - } - startaddr = (void*)addr; - } - /* init the mempool structure */ - mp = startaddr; - memset(mp, 0, sizeof(*mp)); - snprintf(mp->name, sizeof(mp->name), "%s", name); - mp->phys_addr = mz->phys_addr; - mp->ring = r; + mp = mz->addr; + memset(mp, 0, MEMPOOL_HEADER_SIZE(mp, cache_size)); + ret = snprintf(mp->name, sizeof(mp->name), "%s", name); + if (ret < 0 || ret >= (int)sizeof(mp->name)) { + rte_errno = ENAMETOOLONG; + goto exit_unlock; + } + mp->mz = mz; + mp->socket_id = socket_id; mp->size = n; mp->flags = flags; + mp->socket_id = socket_id; mp->elt_size = objsz.elt_size; mp->header_size = objsz.header_size; mp->trailer_size = objsz.trailer_size; + /* Size of default caches, zero means disabled. 
*/ mp->cache_size = cache_size; - mp->cache_flushthresh = CALC_CACHE_FLUSHTHRESH(cache_size); mp->private_data_size = private_data_size; + STAILQ_INIT(&mp->elt_list); + STAILQ_INIT(&mp->mem_list); - /* calculate address of the first element for continuous mempool. */ - obj = (char *)mp + MEMPOOL_HEADER_SIZE(mp, pg_num) + - private_data_size; - obj = RTE_PTR_ALIGN_CEIL(obj, RTE_MEMPOOL_ALIGN); - - /* populate address translation fields. */ - mp->pg_num = pg_num; - mp->pg_shift = pg_shift; - mp->pg_mask = RTE_LEN2MASK(mp->pg_shift, typeof(mp->pg_mask)); - - /* mempool elements allocated together with mempool */ - if (vaddr == NULL) { - mp->elt_va_start = (uintptr_t)obj; - mp->elt_pa[0] = mp->phys_addr + - (mp->elt_va_start - (uintptr_t)mp); + /* + * local_cache pointer is set even if cache_size is zero. + * The local_cache points to just past the elt_pa[] array. + */ + mp->local_cache = (struct rte_mempool_cache *) + RTE_PTR_ADD(mp, MEMPOOL_HEADER_SIZE(mp, 0)); - /* mempool elements in a separate chunk of memory. */ - } else { - mp->elt_va_start = (uintptr_t)vaddr; - memcpy(mp->elt_pa, paddr, sizeof (mp->elt_pa[0]) * pg_num); + /* Init all default caches. 
*/ + if (cache_size != 0) { + for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) + mempool_cache_init(&mp->local_cache[lcore_id], + cache_size); } - mp->elt_va_end = mp->elt_va_start; - - /* call the initializer */ - if (mp_init) - mp_init(mp, mp_init_arg); - - mempool_populate(mp, n, 1, obj_init, obj_init_arg); - - te->data = (void *) mp; + te->data = mp; rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); TAILQ_INSERT_TAIL(mempool_list, te, next); @@ -632,30 +847,132 @@ rte_mempool_xmem_create(const char *name, unsigned n, unsigned elt_size, exit_unlock: rte_rwlock_write_unlock(RTE_EAL_MEMPOOL_RWLOCK); - rte_ring_free(r); rte_free(te); + rte_mempool_free(mp); + return NULL; +} + +/* create the mempool */ +struct rte_mempool * +rte_mempool_create(const char *name, unsigned n, unsigned elt_size, + unsigned cache_size, unsigned private_data_size, + rte_mempool_ctor_t *mp_init, void *mp_init_arg, + rte_mempool_obj_cb_t *obj_init, void *obj_init_arg, + int socket_id, unsigned flags) +{ + struct rte_mempool *mp; + + mp = rte_mempool_create_empty(name, n, elt_size, cache_size, + private_data_size, socket_id, flags); + if (mp == NULL) + return NULL; + + /* + * Since we have 4 combinations of the SP/SC/MP/MC examine the flags to + * set the correct index into the table of ops structs. 
+ */ + if (flags & (MEMPOOL_F_SP_PUT | MEMPOOL_F_SC_GET)) + rte_mempool_set_ops_byname(mp, "ring_sp_sc", NULL); + else if (flags & MEMPOOL_F_SP_PUT) + rte_mempool_set_ops_byname(mp, "ring_sp_mc", NULL); + else if (flags & MEMPOOL_F_SC_GET) + rte_mempool_set_ops_byname(mp, "ring_mp_sc", NULL); + else + rte_mempool_set_ops_byname(mp, "ring_mp_mc", NULL); + + /* call the mempool priv initializer */ + if (mp_init) + mp_init(mp, mp_init_arg); + + if (rte_mempool_populate_default(mp) < 0) + goto fail; + + /* call the object initializers */ + if (obj_init) + rte_mempool_obj_iter(mp, obj_init, obj_init_arg); + + return mp; + fail: + rte_mempool_free(mp); + return NULL; +} + +/* + * Create the mempool over already allocated chunk of memory. + * That external memory buffer can consists of physically disjoint pages. + * Setting vaddr to NULL, makes mempool to fallback to original behaviour + * and allocate space for mempool and it's elements as one big chunk of + * physically continuos memory. + */ +struct rte_mempool * +rte_mempool_xmem_create(const char *name, unsigned n, unsigned elt_size, + unsigned cache_size, unsigned private_data_size, + rte_mempool_ctor_t *mp_init, void *mp_init_arg, + rte_mempool_obj_cb_t *obj_init, void *obj_init_arg, + int socket_id, unsigned flags, void *vaddr, + const phys_addr_t paddr[], uint32_t pg_num, uint32_t pg_shift) +{ + struct rte_mempool *mp = NULL; + int ret; + + /* no virtual address supplied, use rte_mempool_create() */ + if (vaddr == NULL) + return rte_mempool_create(name, n, elt_size, cache_size, + private_data_size, mp_init, mp_init_arg, + obj_init, obj_init_arg, socket_id, flags); + + /* check that we have both VA and PA */ + if (paddr == NULL) { + rte_errno = EINVAL; + return NULL; + } + + /* Check that pg_shift parameter is valid. 
*/ + if (pg_shift > MEMPOOL_PG_SHIFT_MAX) { + rte_errno = EINVAL; + return NULL; + } + + mp = rte_mempool_create_empty(name, n, elt_size, cache_size, + private_data_size, socket_id, flags); + if (mp == NULL) + return NULL; + + /* call the mempool priv initializer */ + if (mp_init) + mp_init(mp, mp_init_arg); + + ret = rte_mempool_populate_phys_tab(mp, vaddr, paddr, pg_num, pg_shift, + NULL, NULL); + if (ret < 0 || ret != (int)mp->size) + goto fail; + + /* call the object initializers */ + if (obj_init) + rte_mempool_obj_iter(mp, obj_init, obj_init_arg); + + return mp; + + fail: + rte_mempool_free(mp); return NULL; } /* Return the number of entries in the mempool */ -unsigned -rte_mempool_count(const struct rte_mempool *mp) +unsigned int +rte_mempool_avail_count(const struct rte_mempool *mp) { unsigned count; + unsigned lcore_id; - count = rte_ring_count(mp->ring); + count = rte_mempool_ops_get_count(mp); -#if RTE_MEMPOOL_CACHE_MAX_SIZE > 0 - { - unsigned lcore_id; - if (mp->cache_size == 0) - return count; + if (mp->cache_size == 0) + return count; - for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) - count += mp->local_cache[lcore_id].len; - } -#endif + for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) + count += mp->local_cache[lcore_id].len; /* * due to race condition (access to len is not locked), the @@ -666,116 +983,168 @@ rte_mempool_count(const struct rte_mempool *mp) return count; } +/* return the number of entries allocated from the mempool */ +unsigned int +rte_mempool_in_use_count(const struct rte_mempool *mp) +{ + return mp->size - rte_mempool_avail_count(mp); +} + +unsigned int +rte_mempool_count(const struct rte_mempool *mp) +{ + return rte_mempool_avail_count(mp); +} + /* dump the cache status */ static unsigned rte_mempool_dump_cache(FILE *f, const struct rte_mempool *mp) { -#if RTE_MEMPOOL_CACHE_MAX_SIZE > 0 unsigned lcore_id; unsigned count = 0; unsigned cache_count; - fprintf(f, " cache infos:\n"); + fprintf(f, " internal cache 
infos:\n"); fprintf(f, " cache_size=%"PRIu32"\n", mp->cache_size); + + if (mp->cache_size == 0) + return count; + for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { cache_count = mp->local_cache[lcore_id].len; - fprintf(f, " cache_count[%u]=%u\n", lcore_id, cache_count); + fprintf(f, " cache_count[%u]=%"PRIu32"\n", + lcore_id, cache_count); count += cache_count; } fprintf(f, " total_cache_count=%u\n", count); return count; -#else - RTE_SET_USED(mp); - fprintf(f, " cache disabled\n"); - return 0; -#endif } -#ifdef RTE_LIBRTE_MEMPOOL_DEBUG -/* check cookies before and after objects */ #ifndef __INTEL_COMPILER #pragma GCC diagnostic ignored "-Wcast-qual" #endif -struct mempool_audit_arg { - const struct rte_mempool *mp; - uintptr_t obj_end; - uint32_t obj_num; -}; - -static void -mempool_obj_audit(void *arg, void *start, void *end, uint32_t idx) +/* check and update cookies or panic (internal) */ +void rte_mempool_check_cookies(const struct rte_mempool *mp, + void * const *obj_table_const, unsigned n, int free) { - struct mempool_audit_arg *pa = arg; +#ifdef RTE_LIBRTE_MEMPOOL_DEBUG + struct rte_mempool_objhdr *hdr; + struct rte_mempool_objtlr *tlr; + uint64_t cookie; + void *tmp; void *obj; - - obj = (char *)start + pa->mp->header_size; - pa->obj_end = (uintptr_t)end; - pa->obj_num = idx + 1; - __mempool_check_cookies(pa->mp, &obj, 1, 2); + void **obj_table; + + /* Force to drop the "const" attribute. 
This is done only when + * DEBUG is enabled */ + tmp = (void *) obj_table_const; + obj_table = (void **) tmp; + + while (n--) { + obj = obj_table[n]; + + if (rte_mempool_from_obj(obj) != mp) + rte_panic("MEMPOOL: object is owned by another " + "mempool\n"); + + hdr = __mempool_get_header(obj); + cookie = hdr->cookie; + + if (free == 0) { + if (cookie != RTE_MEMPOOL_HEADER_COOKIE1) { + RTE_LOG(CRIT, MEMPOOL, + "obj=%p, mempool=%p, cookie=%" PRIx64 "\n", + obj, (const void *) mp, cookie); + rte_panic("MEMPOOL: bad header cookie (put)\n"); + } + hdr->cookie = RTE_MEMPOOL_HEADER_COOKIE2; + } else if (free == 1) { + if (cookie != RTE_MEMPOOL_HEADER_COOKIE2) { + RTE_LOG(CRIT, MEMPOOL, + "obj=%p, mempool=%p, cookie=%" PRIx64 "\n", + obj, (const void *) mp, cookie); + rte_panic("MEMPOOL: bad header cookie (get)\n"); + } + hdr->cookie = RTE_MEMPOOL_HEADER_COOKIE1; + } else if (free == 2) { + if (cookie != RTE_MEMPOOL_HEADER_COOKIE1 && + cookie != RTE_MEMPOOL_HEADER_COOKIE2) { + RTE_LOG(CRIT, MEMPOOL, + "obj=%p, mempool=%p, cookie=%" PRIx64 "\n", + obj, (const void *) mp, cookie); + rte_panic("MEMPOOL: bad header cookie (audit)\n"); + } + } + tlr = __mempool_get_trailer(obj); + cookie = tlr->cookie; + if (cookie != RTE_MEMPOOL_TRAILER_COOKIE) { + RTE_LOG(CRIT, MEMPOOL, + "obj=%p, mempool=%p, cookie=%" PRIx64 "\n", + obj, (const void *) mp, cookie); + rte_panic("MEMPOOL: bad trailer cookie\n"); + } + } +#else + RTE_SET_USED(mp); + RTE_SET_USED(obj_table_const); + RTE_SET_USED(n); + RTE_SET_USED(free); +#endif } +#ifdef RTE_LIBRTE_MEMPOOL_DEBUG static void -mempool_audit_cookies(const struct rte_mempool *mp) +mempool_obj_audit(struct rte_mempool *mp, __rte_unused void *opaque, + void *obj, __rte_unused unsigned idx) { - uint32_t elt_sz, num; - struct mempool_audit_arg arg; - - elt_sz = mp->elt_size + mp->header_size + mp->trailer_size; - - arg.mp = mp; - arg.obj_end = mp->elt_va_start; - arg.obj_num = 0; + __mempool_check_cookies(mp, &obj, 1, 2); +} - num = 
rte_mempool_obj_iter((void *)mp->elt_va_start, - mp->size, elt_sz, 1, - mp->elt_pa, mp->pg_num, mp->pg_shift, - mempool_obj_audit, &arg); +static void +mempool_audit_cookies(struct rte_mempool *mp) +{ + unsigned num; + num = rte_mempool_obj_iter(mp, mempool_obj_audit, NULL); if (num != mp->size) { - rte_panic("rte_mempool_obj_iter(mempool=%p, size=%u) " + rte_panic("rte_mempool_obj_iter(mempool=%p, size=%u) " "iterated only over %u elements\n", mp, mp->size, num); - } else if (arg.obj_end != mp->elt_va_end || arg.obj_num != mp->size) { - rte_panic("rte_mempool_obj_iter(mempool=%p, size=%u) " - "last callback va_end: %#tx (%#tx expeceted), " - "num of objects: %u (%u expected)\n", - mp, mp->size, - arg.obj_end, mp->elt_va_end, - arg.obj_num, mp->size); } } +#else +#define mempool_audit_cookies(mp) do {} while(0) +#endif #ifndef __INTEL_COMPILER #pragma GCC diagnostic error "-Wcast-qual" #endif -#else -#define mempool_audit_cookies(mp) do {} while(0) -#endif -#if RTE_MEMPOOL_CACHE_MAX_SIZE > 0 /* check cookies before and after objects */ static void mempool_audit_cache(const struct rte_mempool *mp) { /* check cache size consistency */ unsigned lcore_id; + + if (mp->cache_size == 0) + return; + for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { - if (mp->local_cache[lcore_id].len > mp->cache_flushthresh) { + const struct rte_mempool_cache *cache; + cache = &mp->local_cache[lcore_id]; + if (cache->len > cache->flushthresh) { RTE_LOG(CRIT, MEMPOOL, "badness on cache[%u]\n", lcore_id); rte_panic("MEMPOOL: invalid cache len\n"); } } } -#else -#define mempool_audit_cache(mp) do {} while(0) -#endif - /* check the consistency of mempool (size, cookies, ...) 
*/ void -rte_mempool_audit(const struct rte_mempool *mp) +rte_mempool_audit(struct rte_mempool *mp) { mempool_audit_cache(mp); mempool_audit_cookies(mp); @@ -786,23 +1155,27 @@ rte_mempool_audit(const struct rte_mempool *mp) /* dump the status of the mempool on the console */ void -rte_mempool_dump(FILE *f, const struct rte_mempool *mp) +rte_mempool_dump(FILE *f, struct rte_mempool *mp) { #ifdef RTE_LIBRTE_MEMPOOL_DEBUG struct rte_mempool_debug_stats sum; unsigned lcore_id; #endif + struct rte_mempool_memhdr *memhdr; unsigned common_count; unsigned cache_count; + size_t mem_len = 0; - RTE_VERIFY(f != NULL); - RTE_VERIFY(mp != NULL); + RTE_ASSERT(f != NULL); + RTE_ASSERT(mp != NULL); fprintf(f, "mempool <%s>@%p\n", mp->name, mp); fprintf(f, " flags=%x\n", mp->flags); - fprintf(f, " ring=<%s>@%p\n", mp->ring->name, mp->ring); - fprintf(f, " phys_addr=0x%" PRIx64 "\n", mp->phys_addr); + fprintf(f, " pool=%p\n", mp->pool_data); + fprintf(f, " phys_addr=0x%" PRIx64 "\n", mp->mz->phys_addr); + fprintf(f, " nb_mem_chunks=%u\n", mp->nb_mem_chunks); fprintf(f, " size=%"PRIu32"\n", mp->size); + fprintf(f, " populated_size=%"PRIu32"\n", mp->populated_size); fprintf(f, " header_size=%"PRIu32"\n", mp->header_size); fprintf(f, " elt_size=%"PRIu32"\n", mp->elt_size); fprintf(f, " trailer_size=%"PRIu32"\n", mp->trailer_size); @@ -810,20 +1183,16 @@ rte_mempool_dump(FILE *f, const struct rte_mempool *mp) mp->header_size + mp->elt_size + mp->trailer_size); fprintf(f, " private_data_size=%"PRIu32"\n", mp->private_data_size); - fprintf(f, " pg_num=%"PRIu32"\n", mp->pg_num); - fprintf(f, " pg_shift=%"PRIu32"\n", mp->pg_shift); - fprintf(f, " pg_mask=%#tx\n", mp->pg_mask); - fprintf(f, " elt_va_start=%#tx\n", mp->elt_va_start); - fprintf(f, " elt_va_end=%#tx\n", mp->elt_va_end); - fprintf(f, " elt_pa[0]=0x%" PRIx64 "\n", mp->elt_pa[0]); - - if (mp->size != 0) + + STAILQ_FOREACH(memhdr, &mp->mem_list, next) + mem_len += memhdr->len; + if (mem_len != 0) { fprintf(f, " avg 
bytes/object=%#Lf\n", - (long double)(mp->elt_va_end - mp->elt_va_start) / - mp->size); + (long double)mem_len / mp->size); + } cache_count = rte_mempool_dump_cache(f, mp); - common_count = rte_ring_count(mp->ring); + common_count = rte_mempool_ops_get_count(mp); if ((cache_count + common_count) > mp->size) common_count = mp->size - cache_count; fprintf(f, " common_pool_count=%u\n", common_count); @@ -857,7 +1226,7 @@ rte_mempool_dump(FILE *f, const struct rte_mempool *mp) void rte_mempool_list_dump(FILE *f) { - const struct rte_mempool *mp = NULL; + struct rte_mempool *mp = NULL; struct rte_tailq_entry *te; struct rte_mempool_list *mempool_list; @@ -901,7 +1270,7 @@ rte_mempool_lookup(const char *name) return mp; } -void rte_mempool_walk(void (*func)(const struct rte_mempool *, void *), +void rte_mempool_walk(void (*func)(struct rte_mempool *, void *), void *arg) { struct rte_tailq_entry *te = NULL; diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h index 9745bf0d..fb7052e1 100644 --- a/lib/librte_mempool/rte_mempool.h +++ b/lib/librte_mempool/rte_mempool.h @@ -2,6 +2,7 @@ * BSD LICENSE * * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * Copyright(c) 2016 6WIND S.A. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -66,12 +67,14 @@ #include <inttypes.h> #include <sys/queue.h> +#include <rte_spinlock.h> #include <rte_log.h> #include <rte_debug.h> #include <rte_lcore.h> #include <rte_memory.h> #include <rte_branch_prediction.h> #include <rte_ring.h> +#include <rte_memcpy.h> #ifdef __cplusplus extern "C" { @@ -95,19 +98,19 @@ struct rte_mempool_debug_stats { } __rte_cache_aligned; #endif -#if RTE_MEMPOOL_CACHE_MAX_SIZE > 0 /** * A structure that stores a per-core object cache. 
*/ struct rte_mempool_cache { - unsigned len; /**< Cache len */ + uint32_t size; /**< Size of the cache */ + uint32_t flushthresh; /**< Threshold before we flush excess elements */ + uint32_t len; /**< Current cache count */ /* * Cache is allocated to this size to allow it to overflow in certain * cases to avoid needless emptying of cache. */ void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */ } __rte_cache_aligned; -#endif /* RTE_MEMPOOL_CACHE_MAX_SIZE > 0 */ /** * A structure that stores the size of mempool elements. @@ -126,17 +129,6 @@ struct rte_mempool_objsz { /* "MP_<name>" */ #define RTE_MEMPOOL_MZ_FORMAT RTE_MEMPOOL_MZ_PREFIX "%s" -#ifdef RTE_LIBRTE_XEN_DOM0 - -/* "<name>_MP_elt" */ -#define RTE_MEMPOOL_OBJ_NAME "%s_" RTE_MEMPOOL_MZ_PREFIX "elt" - -#else - -#define RTE_MEMPOOL_OBJ_NAME RTE_MEMPOOL_MZ_FORMAT - -#endif /* RTE_LIBRTE_XEN_DOM0 */ - #define MEMPOOL_PG_SHIFT_MAX (sizeof(uintptr_t) * CHAR_BIT - 1) /** Mempool over one chunk of physically continuous memory */ @@ -152,18 +144,26 @@ struct rte_mempool_objsz { * Mempool object header structure * * Each object stored in mempools are prefixed by this header structure, - * it allows to retrieve the mempool pointer from the object. When debug - * is enabled, a cookie is also added in this structure preventing - * corruptions and double-frees. + * it allows to retrieve the mempool pointer from the object and to + * iterate on all objects attached to a mempool. When debug is enabled, + * a cookie is also added in this structure preventing corruptions and + * double-frees. */ struct rte_mempool_objhdr { + STAILQ_ENTRY(rte_mempool_objhdr) next; /**< Next in list. */ struct rte_mempool *mp; /**< The mempool owning the object. */ + phys_addr_t physaddr; /**< Physical address of the object. */ #ifdef RTE_LIBRTE_MEMPOOL_DEBUG uint64_t cookie; /**< Debug cookie. 
*/ #endif }; /** + * A list of object headers type + */ +STAILQ_HEAD(rte_mempool_objhdr_list, rte_mempool_objhdr); + +/** * Mempool object trailer structure * * In debug mode, each object stored in mempools are suffixed by this @@ -176,53 +176,82 @@ struct rte_mempool_objtlr { }; /** + * A list of memory where objects are stored + */ +STAILQ_HEAD(rte_mempool_memhdr_list, rte_mempool_memhdr); + +/** + * Callback used to free a memory chunk + */ +typedef void (rte_mempool_memchunk_free_cb_t)(struct rte_mempool_memhdr *memhdr, + void *opaque); + +/** + * Mempool objects memory header structure + * + * The memory chunks where objects are stored. Each chunk is virtually + * and physically contiguous. + */ +struct rte_mempool_memhdr { + STAILQ_ENTRY(rte_mempool_memhdr) next; /**< Next in list. */ + struct rte_mempool *mp; /**< The mempool owning the chunk */ + void *addr; /**< Virtual address of the chunk */ + phys_addr_t phys_addr; /**< Physical address of the chunk */ + size_t len; /**< length of the chunk */ + rte_mempool_memchunk_free_cb_t *free_cb; /**< Free callback */ + void *opaque; /**< Argument passed to the free callback */ +}; + +/** * The RTE mempool structure. */ struct rte_mempool { char name[RTE_MEMPOOL_NAMESIZE]; /**< Name of mempool. */ - struct rte_ring *ring; /**< Ring to store objects. */ - phys_addr_t phys_addr; /**< Phys. addr. of mempool struct. */ + union { + void *pool_data; /**< Ring or pool to store objects. */ + uint64_t pool_id; /**< External mempool identifier. */ + }; + void *pool_config; /**< optional args for ops alloc. */ + const struct rte_memzone *mz; /**< Memzone where pool is alloc'd. */ int flags; /**< Flags of the mempool. */ - uint32_t size; /**< Size of the mempool. */ - uint32_t cache_size; /**< Size of per-lcore local cache. */ - uint32_t cache_flushthresh; - /**< Threshold before we flush excess elements. */ + int socket_id; /**< Socket id passed at create. */ + uint32_t size; /**< Max size of the mempool. 
*/ + uint32_t cache_size; + /**< Size of per-lcore default local cache. */ uint32_t elt_size; /**< Size of an element. */ uint32_t header_size; /**< Size of header (before elt). */ uint32_t trailer_size; /**< Size of trailer (after elt). */ unsigned private_data_size; /**< Size of private data. */ + /** + * Index into rte_mempool_ops_table array of mempool ops + * structs, which contain callback function pointers. + * We're using an index here rather than pointers to the callbacks + * to facilitate any secondary processes that may want to use + * this mempool. + */ + int32_t ops_index; -#if RTE_MEMPOOL_CACHE_MAX_SIZE > 0 - /** Per-lcore local cache. */ - struct rte_mempool_cache local_cache[RTE_MAX_LCORE]; -#endif + struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */ + + uint32_t populated_size; /**< Number of populated objects. */ + struct rte_mempool_objhdr_list elt_list; /**< List of objects in pool */ + uint32_t nb_mem_chunks; /**< Number of memory chunks */ + struct rte_mempool_memhdr_list mem_list; /**< List of memory chunks */ #ifdef RTE_LIBRTE_MEMPOOL_DEBUG /** Per-lcore statistics. */ struct rte_mempool_debug_stats stats[RTE_MAX_LCORE]; #endif - - /* Address translation support, starts from next cache line. */ - - /** Number of elements in the elt_pa array. */ - uint32_t pg_num __rte_cache_aligned; - uint32_t pg_shift; /**< LOG2 of the physical pages. */ - uintptr_t pg_mask; /**< physical page mask value. */ - uintptr_t elt_va_start; - /**< Virtual address of the first mempool object. */ - uintptr_t elt_va_end; - /**< Virtual address of the <size + 1> mempool object. */ - phys_addr_t elt_pa[MEMPOOL_PG_NUM_DEFAULT]; - /**< Array of physical page addresses for the mempool objects buffer. */ - } __rte_cache_aligned; -#define MEMPOOL_F_NO_SPREAD 0x0001 /**< Do not spread in memory. */ +#define MEMPOOL_F_NO_SPREAD 0x0001 /**< Do not spread among memory channels. 
*/ #define MEMPOOL_F_NO_CACHE_ALIGN 0x0002 /**< Do not align objs on cache lines.*/ #define MEMPOOL_F_SP_PUT 0x0004 /**< Default put is "single-producer".*/ #define MEMPOOL_F_SC_GET 0x0008 /**< Default get is "single-consumer".*/ +#define MEMPOOL_F_POOL_CREATED 0x0010 /**< Internal: pool is created. */ +#define MEMPOOL_F_NO_PHYS_CONTIG 0x0020 /**< Don't need physically contiguous objs. */ /** * @internal When debug is enabled, store some statistics. @@ -251,24 +280,18 @@ struct rte_mempool { * * @param mp * Pointer to the memory pool. - * @param pgn - * Number of pages used to store mempool objects. - */ -#define MEMPOOL_HEADER_SIZE(mp, pgn) (sizeof(*(mp)) + \ - RTE_ALIGN_CEIL(((pgn) - RTE_DIM((mp)->elt_pa)) * \ - sizeof ((mp)->elt_pa[0]), RTE_CACHE_LINE_SIZE)) - -/** - * Return true if the whole mempool is in contiguous memory. + * @param cs + * Size of the per-lcore cache. */ -#define MEMPOOL_IS_CONTIG(mp) \ - ((mp)->pg_num == MEMPOOL_PG_NUM_DEFAULT && \ - (mp)->phys_addr == (mp)->elt_pa[0]) +#define MEMPOOL_HEADER_SIZE(mp, cs) \ + (sizeof(*(mp)) + (((cs) == 0) ? 
0 : \ + (sizeof(struct rte_mempool_cache) * RTE_MAX_LCORE))) /* return the header of a mempool object (internal) */ static inline struct rte_mempool_objhdr *__mempool_get_header(void *obj) { - return (struct rte_mempool_objhdr *)RTE_PTR_SUB(obj, sizeof(struct rte_mempool_objhdr)); + return (struct rte_mempool_objhdr *)RTE_PTR_SUB(obj, + sizeof(struct rte_mempool_objhdr)); } /** @@ -307,141 +330,242 @@ static inline struct rte_mempool_objtlr *__mempool_get_trailer(void *obj) * - 1: object is supposed to be free, mark it as allocated * - 2: just check that cookie is valid (free or allocated) */ +void rte_mempool_check_cookies(const struct rte_mempool *mp, + void * const *obj_table_const, unsigned n, int free); + #ifdef RTE_LIBRTE_MEMPOOL_DEBUG -#ifndef __INTEL_COMPILER -#pragma GCC diagnostic ignored "-Wcast-qual" -#endif -static inline void __mempool_check_cookies(const struct rte_mempool *mp, - void * const *obj_table_const, - unsigned n, int free) -{ - struct rte_mempool_objhdr *hdr; - struct rte_mempool_objtlr *tlr; - uint64_t cookie; - void *tmp; - void *obj; - void **obj_table; - - /* Force to drop the "const" attribute. 
This is done only when - * DEBUG is enabled */ - tmp = (void *) obj_table_const; - obj_table = (void **) tmp; - - while (n--) { - obj = obj_table[n]; - - if (rte_mempool_from_obj(obj) != mp) - rte_panic("MEMPOOL: object is owned by another " - "mempool\n"); - - hdr = __mempool_get_header(obj); - cookie = hdr->cookie; - - if (free == 0) { - if (cookie != RTE_MEMPOOL_HEADER_COOKIE1) { - rte_log_set_history(0); - RTE_LOG(CRIT, MEMPOOL, - "obj=%p, mempool=%p, cookie=%" PRIx64 "\n", - obj, (const void *) mp, cookie); - rte_panic("MEMPOOL: bad header cookie (put)\n"); - } - hdr->cookie = RTE_MEMPOOL_HEADER_COOKIE2; - } - else if (free == 1) { - if (cookie != RTE_MEMPOOL_HEADER_COOKIE2) { - rte_log_set_history(0); - RTE_LOG(CRIT, MEMPOOL, - "obj=%p, mempool=%p, cookie=%" PRIx64 "\n", - obj, (const void *) mp, cookie); - rte_panic("MEMPOOL: bad header cookie (get)\n"); - } - hdr->cookie = RTE_MEMPOOL_HEADER_COOKIE1; - } - else if (free == 2) { - if (cookie != RTE_MEMPOOL_HEADER_COOKIE1 && - cookie != RTE_MEMPOOL_HEADER_COOKIE2) { - rte_log_set_history(0); - RTE_LOG(CRIT, MEMPOOL, - "obj=%p, mempool=%p, cookie=%" PRIx64 "\n", - obj, (const void *) mp, cookie); - rte_panic("MEMPOOL: bad header cookie (audit)\n"); - } - } - tlr = __mempool_get_trailer(obj); - cookie = tlr->cookie; - if (cookie != RTE_MEMPOOL_TRAILER_COOKIE) { - rte_log_set_history(0); - RTE_LOG(CRIT, MEMPOOL, - "obj=%p, mempool=%p, cookie=%" PRIx64 "\n", - obj, (const void *) mp, cookie); - rte_panic("MEMPOOL: bad trailer cookie\n"); - } - } -} -#ifndef __INTEL_COMPILER -#pragma GCC diagnostic error "-Wcast-qual" -#endif +#define __mempool_check_cookies(mp, obj_table_const, n, free) \ + rte_mempool_check_cookies(mp, obj_table_const, n, free) #else #define __mempool_check_cookies(mp, obj_table_const, n, free) do {} while(0) #endif /* RTE_LIBRTE_MEMPOOL_DEBUG */ +#define RTE_MEMPOOL_OPS_NAMESIZE 32 /**< Max length of ops struct name. 
*/ + +/** + * Prototype for implementation specific data provisioning function. + * + * The function should provide the implementation specific memory + * for use by the other mempool ops functions in a given mempool ops struct. + * E.g. the default ops provides an instance of the rte_ring for this purpose. + * It will most likely point to a different type of data structure, and + * will be transparent to the application programmer. + * This function should set mp->pool_data. + */ +typedef int (*rte_mempool_alloc_t)(struct rte_mempool *mp); + +/** + * Free the opaque private data pointed to by mp->pool_data pointer. + */ +typedef void (*rte_mempool_free_t)(struct rte_mempool *mp); + +/** + * Enqueue an object into the external pool. + */ +typedef int (*rte_mempool_enqueue_t)(struct rte_mempool *mp, + void * const *obj_table, unsigned int n); + /** - * A mempool object iterator callback function. + * Dequeue an object from the external pool. */ -typedef void (*rte_mempool_obj_iter_t)(void * /*obj_iter_arg*/, - void * /*obj_start*/, - void * /*obj_end*/, - uint32_t /*obj_index */); +typedef int (*rte_mempool_dequeue_t)(struct rte_mempool *mp, + void **obj_table, unsigned int n); /** - * Call a function for each mempool object in a memory chunk + * Return the number of available objects in the external pool. + */ +typedef unsigned (*rte_mempool_get_count)(const struct rte_mempool *mp); + +/** Structure defining mempool operations structure */ +struct rte_mempool_ops { + char name[RTE_MEMPOOL_OPS_NAMESIZE]; /**< Name of mempool ops struct. */ + rte_mempool_alloc_t alloc; /**< Allocate private data. */ + rte_mempool_free_t free; /**< Free the external pool. */ + rte_mempool_enqueue_t enqueue; /**< Enqueue an object. */ + rte_mempool_dequeue_t dequeue; /**< Dequeue an object. */ + rte_mempool_get_count get_count; /**< Get qty of available objs.
*/ +} __rte_cache_aligned; + +#define RTE_MEMPOOL_MAX_OPS_IDX 16 /**< Max registered ops structs */ + +/** + * Structure storing the table of registered ops structs, each of which contain + * the function pointers for the mempool ops functions. + * Each process has its own storage for this ops struct array so that + * the mempools can be shared across primary and secondary processes. + * The indices used to access the array are valid across processes, whereas + * any function pointers stored directly in the mempool struct would not be. + * This results in us simply having "ops_index" in the mempool struct. + */ +struct rte_mempool_ops_table { + rte_spinlock_t sl; /**< Spinlock for add/delete. */ + uint32_t num_ops; /**< Number of used ops structs in the table. */ + /** + * Storage for all possible ops structs. + */ + struct rte_mempool_ops ops[RTE_MEMPOOL_MAX_OPS_IDX]; +} __rte_cache_aligned; + +/** Array of registered ops structs. */ +extern struct rte_mempool_ops_table rte_mempool_ops_table; + +/** + * @internal Get the mempool ops struct from its index. * - * Iterate across objects of the given size and alignment in the - * provided chunk of memory. The given memory buffer can consist of - * disjointed physical pages. + * @param ops_index + * The index of the ops struct in the ops struct table. It must be a valid + * index: (0 <= idx < num_ops). + * @return + * The pointer to the ops struct in the table. + */ +static inline struct rte_mempool_ops * +rte_mempool_get_ops(int ops_index) +{ + RTE_VERIFY((ops_index >= 0) && (ops_index < RTE_MEMPOOL_MAX_OPS_IDX)); + + return &rte_mempool_ops_table.ops[ops_index]; +} + +/** + * @internal Wrapper for mempool_ops alloc callback. * - * For each object, call the provided callback (if any). This function - * is used to populate a mempool, or walk through all the elements of a - * mempool, or estimate how many elements of the given size could be - * created in the given memory buffer. 
+ * @param mp + * Pointer to the memory pool. + * @return + * - 0: Success; successfully allocated mempool pool_data. + * - <0: Error; code of alloc function. + */ +int +rte_mempool_ops_alloc(struct rte_mempool *mp); + +/** + * @internal Wrapper for mempool_ops dequeue callback. * - * @param vaddr - * Virtual address of the memory buffer. - * @param elt_num - * Maximum number of objects to iterate through. - * @param elt_sz - * Size of each object. - * @param align - * Alignment of each object. - * @param paddr - * Array of physical addresses of the pages that comprises given memory - * buffer. - * @param pg_num - * Number of elements in the paddr array. - * @param pg_shift - * LOG2 of the physical pages size. - * @param obj_iter - * Object iterator callback function (could be NULL). - * @param obj_iter_arg - * User defined parameter for the object iterator callback function. + * @param mp + * Pointer to the memory pool. + * @param obj_table + * Pointer to a table of void * pointers (objects). + * @param n + * Number of objects to get. + * @return + * - 0: Success; got n objects. + * - <0: Error; code of dequeue function. + */ +static inline int +rte_mempool_ops_dequeue_bulk(struct rte_mempool *mp, + void **obj_table, unsigned n) +{ + struct rte_mempool_ops *ops; + + ops = rte_mempool_get_ops(mp->ops_index); + return ops->dequeue(mp, obj_table, n); +} + +/** + * @internal wrapper for mempool_ops enqueue callback. * + * @param mp + * Pointer to the memory pool. + * @param obj_table + * Pointer to a table of void * pointers (objects). + * @param n + * Number of objects to put. * @return - * Number of objects iterated through. + * - 0: Success; n objects supplied. + * - <0: Error; code of enqueue function. 
*/ -uint32_t rte_mempool_obj_iter(void *vaddr, - uint32_t elt_num, size_t elt_sz, size_t align, - const phys_addr_t paddr[], uint32_t pg_num, uint32_t pg_shift, - rte_mempool_obj_iter_t obj_iter, void *obj_iter_arg); +static inline int +rte_mempool_ops_enqueue_bulk(struct rte_mempool *mp, void * const *obj_table, + unsigned n) +{ + struct rte_mempool_ops *ops; + + ops = rte_mempool_get_ops(mp->ops_index); + return ops->enqueue(mp, obj_table, n); +} /** - * An object constructor callback function for mempool. + * @internal wrapper for mempool_ops get_count callback. * - * Arguments are the mempool, the opaque pointer given by the user in - * rte_mempool_create(), the pointer to the element and the index of - * the element in the pool. + * @param mp + * Pointer to the memory pool. + * @return + * The number of available objects in the external pool. */ -typedef void (rte_mempool_obj_ctor_t)(struct rte_mempool *, void *, - void *, unsigned); +unsigned +rte_mempool_ops_get_count(const struct rte_mempool *mp); + +/** + * @internal wrapper for mempool_ops free callback. + * + * @param mp + * Pointer to the memory pool. + */ +void +rte_mempool_ops_free(struct rte_mempool *mp); + +/** + * Set the ops of a mempool. + * + * This can only be done on a mempool that is not populated, i.e. just after + * a call to rte_mempool_create_empty(). + * + * @param mp + * Pointer to the memory pool. + * @param name + * Name of the ops structure to use for this mempool. + * @param pool_config + * Opaque data that can be passed by the application to the ops functions. + * @return + * - 0: Success; the mempool is now using the requested ops functions. + * - -EINVAL - Invalid ops struct name provided. + * - -EEXIST - mempool already has an ops struct assigned. + */ +int +rte_mempool_set_ops_byname(struct rte_mempool *mp, const char *name, + void *pool_config); + +/** + * Register mempool operations. + * + * @param ops + * Pointer to an ops structure to register. 
+ * @return + * - >=0: Success; return the index of the ops struct in the table. + * - -EINVAL - some missing callbacks while registering ops struct. + * - -ENOSPC - the maximum number of ops structs has been reached. + */ +int rte_mempool_register_ops(const struct rte_mempool_ops *ops); + +/** + * Macro to statically register the ops of a mempool handler. + * Note that the rte_mempool_register_ops fails silently here when + * more than RTE_MEMPOOL_MAX_OPS_IDX ops structs are registered. + */ +#define MEMPOOL_REGISTER_OPS(ops) \ + void mp_hdlr_init_##ops(void); \ + void __attribute__((constructor, used)) mp_hdlr_init_##ops(void)\ + { \ + rte_mempool_register_ops(&ops); \ + } + +/** + * An object callback function for mempool. + * + * Used by rte_mempool_create() and rte_mempool_obj_iter(). + */ +typedef void (rte_mempool_obj_cb_t)(struct rte_mempool *mp, + void *opaque, void *obj, unsigned obj_idx); +typedef rte_mempool_obj_cb_t rte_mempool_obj_ctor_t; /* compat */ + +/** + * A memory callback function for mempool. + * + * Used by rte_mempool_mem_iter(). + */ +typedef void (rte_mempool_mem_cb_t)(struct rte_mempool *mp, + void *opaque, struct rte_mempool_memhdr *memhdr, + unsigned mem_idx); /** * A mempool constructor callback function. @@ -522,6 +646,8 @@ typedef void (rte_mempool_ctor_t)(struct rte_mempool *, void *); * - MEMPOOL_F_SC_GET: If this flag is set, the default behavior * when using rte_mempool_get() or rte_mempool_get_bulk() is * "single-consumer". Otherwise, it is "multi-consumers". + * - MEMPOOL_F_NO_PHYS_CONTIG: If set, allocated objects won't + * necessarily be contiguous in physical memory. * @return * The pointer to the new allocated mempool, on success. NULL on error * with rte_errno set appropriately.
Possible rte_errno values include: @@ -536,14 +662,15 @@ struct rte_mempool * rte_mempool_create(const char *name, unsigned n, unsigned elt_size, unsigned cache_size, unsigned private_data_size, rte_mempool_ctor_t *mp_init, void *mp_init_arg, - rte_mempool_obj_ctor_t *obj_init, void *obj_init_arg, + rte_mempool_obj_cb_t *obj_init, void *obj_init_arg, int socket_id, unsigned flags); /** * Create a new mempool named *name* in memory. * - * This function uses ``memzone_reserve()`` to allocate memory. The - * pool contains n elements of elt_size. Its size is set to n. + * The pool contains n elements of elt_size. Its size is set to n. + * This function uses ``memzone_reserve()`` to allocate the mempool header + * (and the objects if vaddr is NULL). * Depending on the input parameters, mempool elements can be either allocated * together with the mempool header, or an externally provided memory buffer * could be used to store mempool objects. In later case, that external @@ -558,18 +685,7 @@ rte_mempool_create(const char *name, unsigned n, unsigned elt_size, * @param elt_size * The size of each element. * @param cache_size - * If cache_size is non-zero, the rte_mempool library will try to - * limit the accesses to the common lockless pool, by maintaining a - * per-lcore object cache. This argument must be lower or equal to - * CONFIG_RTE_MEMPOOL_CACHE_MAX_SIZE. It is advised to choose - * cache_size to have "n modulo cache_size == 0": if this is - * not the case, some elements will always stay in the pool and will - * never be used. The access to the per-lcore table is of course - * faster than the multi-producer/consumer pool. The cache can be - * disabled if the cache_size argument is set to 0; it can be useful to - * avoid losing objects in cache. Note that even if not used, the - * memory space for cache is always reserved in a mempool structure, - * except if CONFIG_RTE_MEMPOOL_CACHE_MAX_SIZE is set to 0. + * Size of the cache. See rte_mempool_create() for details. 
* @param private_data_size * The size of the private data appended after the mempool * structure. This is useful for storing some private data after the @@ -583,35 +699,17 @@ rte_mempool_create(const char *name, unsigned n, unsigned elt_size, * An opaque pointer to data that can be used in the mempool * constructor function. * @param obj_init - * A function pointer that is called for each object at - * initialization of the pool. The user can set some meta data in - * objects if needed. This parameter can be NULL if not needed. - * The obj_init() function takes the mempool pointer, the init_arg, - * the object pointer and the object number as parameters. + * A function called for each object at initialization of the pool. + * See rte_mempool_create() for details. * @param obj_init_arg - * An opaque pointer to data that can be used as an argument for - * each call to the object constructor function. + * An opaque pointer passed to the object constructor function. * @param socket_id * The *socket_id* argument is the socket identifier in the case of * NUMA. The value can be *SOCKET_ID_ANY* if there is no NUMA * constraint for the reserved zone. * @param flags - * The *flags* arguments is an OR of following flags: - * - MEMPOOL_F_NO_SPREAD: By default, objects addresses are spread - * between channels in RAM: the pool allocator will add padding - * between objects depending on the hardware configuration. See - * Memory alignment constraints for details. If this flag is set, - * the allocator will just align them to a cache line. - * - MEMPOOL_F_NO_CACHE_ALIGN: By default, the returned objects are - * cache-aligned. This flag removes this constraint, and no - * padding will be present between objects. This flag implies - * MEMPOOL_F_NO_SPREAD. - * - MEMPOOL_F_SP_PUT: If this flag is set, the default behavior - * when using rte_mempool_put() or rte_mempool_put_bulk() is - * "single-producer". Otherwise, it is "multi-producers". 
- * - MEMPOOL_F_SC_GET: If this flag is set, the default behavior - * when using rte_mempool_get() or rte_mempool_get_bulk() is - * "single-consumer". Otherwise, it is "multi-consumers". + * Flags controlling the behavior of the mempool. See + * rte_mempool_create() for details. * @param vaddr * Virtual address of the externally allocated memory buffer. * Will be used to store mempool objects. @@ -624,110 +722,219 @@ rte_mempool_create(const char *name, unsigned n, unsigned elt_size, * LOG2 of the physical pages size. * @return * The pointer to the new allocated mempool, on success. NULL on error - * with rte_errno set appropriately. Possible rte_errno values include: - * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure - * - E_RTE_SECONDARY - function was called from a secondary process instance - * - EINVAL - cache size provided is too large - * - ENOSPC - the maximum number of memzones has already been allocated - * - EEXIST - a memzone with the same name already exists - * - ENOMEM - no appropriate memory area found in which to create memzone + * with rte_errno set appropriately. See rte_mempool_create() for details. */ struct rte_mempool * rte_mempool_xmem_create(const char *name, unsigned n, unsigned elt_size, unsigned cache_size, unsigned private_data_size, rte_mempool_ctor_t *mp_init, void *mp_init_arg, - rte_mempool_obj_ctor_t *obj_init, void *obj_init_arg, + rte_mempool_obj_cb_t *obj_init, void *obj_init_arg, int socket_id, unsigned flags, void *vaddr, const phys_addr_t paddr[], uint32_t pg_num, uint32_t pg_shift); /** - * Create a new mempool named *name* in memory on Xen Dom0. + * Create an empty mempool * - * This function uses ``rte_mempool_xmem_create()`` to allocate memory. The - * pool contains n elements of elt_size. Its size is set to n. - * All elements of the mempool are allocated together with the mempool header, - * and memory buffer can consist of set of disjoint physical pages. 
+ * The mempool is allocated and initialized, but it is not populated: no + * memory is allocated for the mempool elements. The user has to call + * rte_mempool_populate_*() to add memory chunks to the pool. Once + * populated, the user may also want to initialize each object with + * rte_mempool_obj_iter(). * * @param name * The name of the mempool. * @param n - * The number of elements in the mempool. The optimum size (in terms of - * memory usage) for a mempool is when n is a power of two minus one: - * n = (2^q - 1). + * The maximum number of elements that can be added in the mempool. + * The optimum size (in terms of memory usage) for a mempool is when n + * is a power of two minus one: n = (2^q - 1). * @param elt_size * The size of each element. * @param cache_size - * If cache_size is non-zero, the rte_mempool library will try to - * limit the accesses to the common lockless pool, by maintaining a - * per-lcore object cache. This argument must be lower or equal to - * CONFIG_RTE_MEMPOOL_CACHE_MAX_SIZE. It is advised to choose - * cache_size to have "n modulo cache_size == 0": if this is - * not the case, some elements will always stay in the pool and will - * never be used. The access to the per-lcore table is of course - * faster than the multi-producer/consumer pool. The cache can be - * disabled if the cache_size argument is set to 0; it can be useful to - * avoid losing objects in cache. Note that even if not used, the - * memory space for cache is always reserved in a mempool structure, - * except if CONFIG_RTE_MEMPOOL_CACHE_MAX_SIZE is set to 0. + * Size of the cache. See rte_mempool_create() for details. * @param private_data_size * The size of the private data appended after the mempool * structure. This is useful for storing some private data after the * mempool structure, as is done for rte_mbuf_pool for example. - * @param mp_init - * A function pointer that is called for initialization of the pool, - * before object initialization.
The user can initialize the private - * data in this function if needed. This parameter can be NULL if - * not needed. - * @param mp_init_arg - * An opaque pointer to data that can be used in the mempool - * constructor function. - * @param obj_init - * A function pointer that is called for each object at - * initialization of the pool. The user can set some meta data in - * objects if needed. This parameter can be NULL if not needed. - * The obj_init() function takes the mempool pointer, the init_arg, - * the object pointer and the object number as parameters. - * @param obj_init_arg - * An opaque pointer to data that can be used as an argument for - * each call to the object constructor function. * @param socket_id * The *socket_id* argument is the socket identifier in the case of * NUMA. The value can be *SOCKET_ID_ANY* if there is no NUMA * constraint for the reserved zone. * @param flags - * The *flags* arguments is an OR of following flags: - * - MEMPOOL_F_NO_SPREAD: By default, objects addresses are spread - * between channels in RAM: the pool allocator will add padding - * between objects depending on the hardware configuration. See - * Memory alignment constraints for details. If this flag is set, - * the allocator will just align them to a cache line. - * - MEMPOOL_F_NO_CACHE_ALIGN: By default, the returned objects are - * cache-aligned. This flag removes this constraint, and no - * padding will be present between objects. This flag implies - * MEMPOOL_F_NO_SPREAD. - * - MEMPOOL_F_SP_PUT: If this flag is set, the default behavior - * when using rte_mempool_put() or rte_mempool_put_bulk() is - * "single-producer". Otherwise, it is "multi-producers". - * - MEMPOOL_F_SC_GET: If this flag is set, the default behavior - * when using rte_mempool_get() or rte_mempool_get_bulk() is - * "single-consumer". Otherwise, it is "multi-consumers". + * Flags controlling the behavior of the mempool. See + * rte_mempool_create() for details. 
* @return * The pointer to the new allocated mempool, on success. NULL on error - * with rte_errno set appropriately. Possible rte_errno values include: - * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure - * - E_RTE_SECONDARY - function was called from a secondary process instance - * - EINVAL - cache size provided is too large - * - ENOSPC - the maximum number of memzones has already been allocated - * - EEXIST - a memzone with the same name already exists - * - ENOMEM - no appropriate memory area found in which to create memzone + * with rte_errno set appropriately. See rte_mempool_create() for details. */ struct rte_mempool * -rte_dom0_mempool_create(const char *name, unsigned n, unsigned elt_size, - unsigned cache_size, unsigned private_data_size, - rte_mempool_ctor_t *mp_init, void *mp_init_arg, - rte_mempool_obj_ctor_t *obj_init, void *obj_init_arg, - int socket_id, unsigned flags); +rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size, + unsigned cache_size, unsigned private_data_size, + int socket_id, unsigned flags); +/** + * Free a mempool + * + * Unlink the mempool from global list, free the memory chunks, and all + * memory referenced by the mempool. The objects must not be used by + * other cores as they will be freed. + * + * @param mp + * A pointer to the mempool structure. + */ +void +rte_mempool_free(struct rte_mempool *mp); + +/** + * Add physically contiguous memory for objects in the pool at init + * + * Add a virtually and physically contiguous memory chunk in the pool + * where objects can be instantiated. + * + * @param mp + * A pointer to the mempool structure. + * @param vaddr + * The virtual address of memory that should be used to store objects. + * @param paddr + * The physical address + * @param len + * The length of memory in bytes. + * @param free_cb + * The callback used to free this chunk when destroying the mempool. + * @param opaque + * An opaque argument passed to free_cb.
+ * @return + * The number of objects added on success. + * On error, the chunk is not added in the memory list of the + * mempool and a negative errno is returned. + */ +int rte_mempool_populate_phys(struct rte_mempool *mp, char *vaddr, + phys_addr_t paddr, size_t len, rte_mempool_memchunk_free_cb_t *free_cb, + void *opaque); +/** + * Add physical memory for objects in the pool at init + * + * Add a virtually contiguous memory chunk in the pool where objects can + * be instantiated. The physical addresses corresponding to the virtual + * area are described in paddr[], pg_num, pg_shift. + * + * @param mp + * A pointer to the mempool structure. + * @param vaddr + * The virtual address of memory that should be used to store objects. + * @param paddr + * An array of physical addresses of each page composing the virtual + * area. + * @param pg_num + * Number of elements in the paddr array. + * @param pg_shift + * LOG2 of the physical pages size. + * @param free_cb + * The callback used to free this chunk when destroying the mempool. + * @param opaque + * An opaque argument passed to free_cb. + * @return + * The number of objects added on success. + * On error, the chunks are not added in the memory list of the + * mempool and a negative errno is returned. + */ +int rte_mempool_populate_phys_tab(struct rte_mempool *mp, char *vaddr, + const phys_addr_t paddr[], uint32_t pg_num, uint32_t pg_shift, + rte_mempool_memchunk_free_cb_t *free_cb, void *opaque); + +/** + * Add virtually contiguous memory for objects in the pool at init + * + * Add a virtually contiguous memory chunk in the pool where objects can + * be instantiated. + * + * @param mp + * A pointer to the mempool structure. + * @param addr + * The virtual address of memory that should be used to store objects. + * Must be page-aligned. + * @param len + * The length of memory in bytes. Must be page-aligned. + * @param pg_sz + * The size of memory pages in this virtual area.
+ * @param free_cb + * The callback used to free this chunk when destroying the mempool. + * @param opaque + * An opaque argument passed to free_cb. + * @return + * The number of objects added on success. + * On error, the chunk is not added in the memory list of the + * mempool and a negative errno is returned. + */ +int +rte_mempool_populate_virt(struct rte_mempool *mp, char *addr, + size_t len, size_t pg_sz, rte_mempool_memchunk_free_cb_t *free_cb, + void *opaque); + +/** + * Add memory for objects in the pool at init + * + * This is the default function used by rte_mempool_create() to populate + * the mempool. It adds memory allocated using rte_memzone_reserve(). + * + * @param mp + * A pointer to the mempool structure. + * @return + * The number of objects added on success. + * On error, the chunk is not added in the memory list of the + * mempool and a negative errno is returned. + */ +int rte_mempool_populate_default(struct rte_mempool *mp); + +/** + * Add memory from anonymous mapping for objects in the pool at init + * + * This function mmaps an anonymous memory zone that is locked in + * memory to store the objects of the mempool. + * + * @param mp + * A pointer to the mempool structure. + * @return + * The number of objects added on success. + * On error, the chunk is not added in the memory list of the + * mempool and a negative errno is returned. + */ +int rte_mempool_populate_anon(struct rte_mempool *mp); + +/** + * Call a function for each mempool element + * + * Iterate across all objects attached to a rte_mempool and call the + * callback function on it. + * + * @param mp + * A pointer to an initialized mempool. + * @param obj_cb + * A function pointer that is called for each object. + * @param obj_cb_arg + * An opaque pointer passed to the callback function. + * @return + * Number of objects iterated.
+ */ +uint32_t rte_mempool_obj_iter(struct rte_mempool *mp, + rte_mempool_obj_cb_t *obj_cb, void *obj_cb_arg); + +/** + * Call a function for each mempool memory chunk + * + * Iterate across all memory chunks attached to a rte_mempool and call + * the callback function on it. + * + * @param mp + * A pointer to an initialized mempool. + * @param mem_cb + * A function pointer that is called for each memory chunk. + * @param mem_cb_arg + * An opaque pointer passed to the callback function. + * @return + * Number of memory chunks iterated. + */ +uint32_t rte_mempool_mem_iter(struct rte_mempool *mp, + rte_mempool_mem_cb_t *mem_cb, void *mem_cb_arg); /** * Dump the status of the mempool to the console. @@ -737,7 +944,71 @@ rte_dom0_mempool_create(const char *name, unsigned n, unsigned elt_size, * @param mp * A pointer to the mempool structure. */ -void rte_mempool_dump(FILE *f, const struct rte_mempool *mp); +void rte_mempool_dump(FILE *f, struct rte_mempool *mp); + +/** + * Create a user-owned mempool cache. + * + * This can be used by non-EAL threads to enable caching when they + * interact with a mempool. + * + * @param size + * The size of the mempool cache. See rte_mempool_create()'s cache_size + * parameter description for more information. The same limits and + * considerations apply here too. + * @param socket_id + * The socket identifier in the case of NUMA. The value can be + * SOCKET_ID_ANY if there is no NUMA constraint for the reserved zone. + */ +struct rte_mempool_cache * +rte_mempool_cache_create(uint32_t size, int socket_id); + +/** + * Free a user-owned mempool cache. + * + * @param cache + * A pointer to the mempool cache. + */ +void +rte_mempool_cache_free(struct rte_mempool_cache *cache); + +/** + * Flush a user-owned mempool cache to the specified mempool. + * + * @param cache + * A pointer to the mempool cache. + * @param mp + * A pointer to the mempool. 
+ */ +static inline void __attribute__((always_inline)) +rte_mempool_cache_flush(struct rte_mempool_cache *cache, + struct rte_mempool *mp) +{ + rte_mempool_ops_enqueue_bulk(mp, cache->objs, cache->len); + cache->len = 0; +} + +/** + * Get a pointer to the per-lcore default mempool cache. + * + * @param mp + * A pointer to the mempool structure. + * @param lcore_id + * The logical core id. + * @return + * A pointer to the mempool cache or NULL if disabled or non-EAL thread. + */ +static inline struct rte_mempool_cache *__attribute__((always_inline)) +rte_mempool_default_cache(struct rte_mempool *mp, unsigned lcore_id) +{ + if (mp->cache_size == 0) + return NULL; + + if (lcore_id >= RTE_MAX_LCORE) + return NULL; + + return &mp->local_cache[lcore_id]; +} /** * @internal Put several objects back in the mempool; used internally. @@ -748,36 +1019,29 @@ void rte_mempool_dump(FILE *f, const struct rte_mempool *mp); * @param n * The number of objects to store back in the mempool, must be strictly * positive. - * @param is_mp - * Mono-producer (0) or multi-producers (1). + * @param cache + * A pointer to a mempool cache structure. May be NULL if not needed. + * @param flags + * The flags used for the mempool creation. + * Single-producer (MEMPOOL_F_SP_PUT flag) or multi-producers. 
*/ static inline void __attribute__((always_inline)) -__mempool_put_bulk(struct rte_mempool *mp, void * const *obj_table, - unsigned n, int is_mp) +__mempool_generic_put(struct rte_mempool *mp, void * const *obj_table, + unsigned n, struct rte_mempool_cache *cache, int flags) { -#if RTE_MEMPOOL_CACHE_MAX_SIZE > 0 - struct rte_mempool_cache *cache; - uint32_t index; void **cache_objs; - unsigned lcore_id = rte_lcore_id(); - uint32_t cache_size = mp->cache_size; - uint32_t flushthresh = mp->cache_flushthresh; -#endif /* RTE_MEMPOOL_CACHE_MAX_SIZE > 0 */ /* increment stat now, adding in mempool always success */ __MEMPOOL_STAT_ADD(mp, put, n); -#if RTE_MEMPOOL_CACHE_MAX_SIZE > 0 - /* cache is not enabled or single producer or non-EAL thread */ - if (unlikely(cache_size == 0 || is_mp == 0 || - lcore_id >= RTE_MAX_LCORE)) + /* No cache provided or single producer */ + if (unlikely(cache == NULL || flags & MEMPOOL_F_SP_PUT)) goto ring_enqueue; /* Go straight to ring if put would overflow mem allocated for cache */ if (unlikely(n > RTE_MEMPOOL_CACHE_MAX_SIZE)) goto ring_enqueue; - cache = &mp->local_cache[lcore_id]; cache_objs = &cache->objs[cache->len]; /* @@ -788,42 +1052,55 @@ __mempool_put_bulk(struct rte_mempool *mp, void * const *obj_table, */ /* Add elements back into the cache */ - for (index = 0; index < n; ++index, obj_table++) - cache_objs[index] = *obj_table; + rte_memcpy(&cache_objs[0], obj_table, sizeof(void *) * n); cache->len += n; - if (cache->len >= flushthresh) { - rte_ring_mp_enqueue_bulk(mp->ring, &cache->objs[cache_size], - cache->len - cache_size); - cache->len = cache_size; + if (cache->len >= cache->flushthresh) { + rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache->size], + cache->len - cache->size); + cache->len = cache->size; } return; ring_enqueue: -#endif /* RTE_MEMPOOL_CACHE_MAX_SIZE > 0 */ /* push remaining objects in ring */ #ifdef RTE_LIBRTE_MEMPOOL_DEBUG - if (is_mp) { - if (rte_ring_mp_enqueue_bulk(mp->ring, obj_table, n) < 0) - 
rte_panic("cannot put objects in mempool\n"); - } - else { - if (rte_ring_sp_enqueue_bulk(mp->ring, obj_table, n) < 0) - rte_panic("cannot put objects in mempool\n"); - } + if (rte_mempool_ops_enqueue_bulk(mp, obj_table, n) < 0) + rte_panic("cannot put objects in mempool\n"); #else - if (is_mp) - rte_ring_mp_enqueue_bulk(mp->ring, obj_table, n); - else - rte_ring_sp_enqueue_bulk(mp->ring, obj_table, n); + rte_mempool_ops_enqueue_bulk(mp, obj_table, n); #endif } /** + * Put several objects back in the mempool. + * + * @param mp + * A pointer to the mempool structure. + * @param obj_table + * A pointer to a table of void * pointers (objects). + * @param n + * The number of objects to add in the mempool from the obj_table. + * @param cache + * A pointer to a mempool cache structure. May be NULL if not needed. + * @param flags + * The flags used for the mempool creation. + * Single-producer (MEMPOOL_F_SP_PUT flag) or multi-producers. + */ +static inline void __attribute__((always_inline)) +rte_mempool_generic_put(struct rte_mempool *mp, void * const *obj_table, + unsigned n, struct rte_mempool_cache *cache, int flags) +{ + __mempool_check_cookies(mp, obj_table, n, 0); + __mempool_generic_put(mp, obj_table, n, cache, flags); +} + +/** + * @deprecated * Put several objects back in the mempool (multi-producers safe). * * @param mp @@ -833,15 +1110,18 @@ ring_enqueue: * @param n * The number of objects to add in the mempool from the obj_table. */ +__rte_deprecated static inline void __attribute__((always_inline)) rte_mempool_mp_put_bulk(struct rte_mempool *mp, void * const *obj_table, unsigned n) { - __mempool_check_cookies(mp, obj_table, n, 0); - __mempool_put_bulk(mp, obj_table, n, 1); + struct rte_mempool_cache *cache; + cache = rte_mempool_default_cache(mp, rte_lcore_id()); + rte_mempool_generic_put(mp, obj_table, n, cache, 0); } /** + * @deprecated * Put several objects back in the mempool (NOT multi-producers safe). 
* * @param mp @@ -851,12 +1131,12 @@ rte_mempool_mp_put_bulk(struct rte_mempool *mp, void * const *obj_table, * @param n * The number of objects to add in the mempool from obj_table. */ -static inline void +__rte_deprecated +static inline void __attribute__((always_inline)) rte_mempool_sp_put_bulk(struct rte_mempool *mp, void * const *obj_table, unsigned n) { - __mempool_check_cookies(mp, obj_table, n, 0); - __mempool_put_bulk(mp, obj_table, n, 0); + rte_mempool_generic_put(mp, obj_table, n, NULL, MEMPOOL_F_SP_PUT); } /** @@ -877,11 +1157,13 @@ static inline void __attribute__((always_inline)) rte_mempool_put_bulk(struct rte_mempool *mp, void * const *obj_table, unsigned n) { - __mempool_check_cookies(mp, obj_table, n, 0); - __mempool_put_bulk(mp, obj_table, n, !(mp->flags & MEMPOOL_F_SP_PUT)); + struct rte_mempool_cache *cache; + cache = rte_mempool_default_cache(mp, rte_lcore_id()); + rte_mempool_generic_put(mp, obj_table, n, cache, mp->flags); } /** + * @deprecated * Put one object in the mempool (multi-producers safe). * * @param mp @@ -889,13 +1171,17 @@ rte_mempool_put_bulk(struct rte_mempool *mp, void * const *obj_table, * @param obj * A pointer to the object to be added. */ +__rte_deprecated static inline void __attribute__((always_inline)) rte_mempool_mp_put(struct rte_mempool *mp, void *obj) { - rte_mempool_mp_put_bulk(mp, &obj, 1); + struct rte_mempool_cache *cache; + cache = rte_mempool_default_cache(mp, rte_lcore_id()); + rte_mempool_generic_put(mp, &obj, 1, cache, 0); } /** + * @deprecated * Put one object back in the mempool (NOT multi-producers safe). * * @param mp @@ -903,10 +1189,11 @@ rte_mempool_mp_put(struct rte_mempool *mp, void *obj) * @param obj * A pointer to the object to be added. 
*/ +__rte_deprecated static inline void __attribute__((always_inline)) rte_mempool_sp_put(struct rte_mempool *mp, void *obj) { - rte_mempool_sp_put_bulk(mp, &obj, 1); + rte_mempool_generic_put(mp, &obj, 1, NULL, MEMPOOL_F_SP_PUT); } /** @@ -935,39 +1222,38 @@ rte_mempool_put(struct rte_mempool *mp, void *obj) * A pointer to a table of void * pointers (objects). * @param n * The number of objects to get, must be strictly positive. - * @param is_mc - * Mono-consumer (0) or multi-consumers (1). + * @param cache + * A pointer to a mempool cache structure. May be NULL if not needed. + * @param flags + * The flags used for the mempool creation. + * Single-consumer (MEMPOOL_F_SC_GET flag) or multi-consumers. * @return * - >=0: Success; number of objects supplied. * - <0: Error; code of ring dequeue function. */ static inline int __attribute__((always_inline)) -__mempool_get_bulk(struct rte_mempool *mp, void **obj_table, - unsigned n, int is_mc) +__mempool_generic_get(struct rte_mempool *mp, void **obj_table, + unsigned n, struct rte_mempool_cache *cache, int flags) { int ret; -#if RTE_MEMPOOL_CACHE_MAX_SIZE > 0 - struct rte_mempool_cache *cache; uint32_t index, len; void **cache_objs; - unsigned lcore_id = rte_lcore_id(); - uint32_t cache_size = mp->cache_size; - /* cache is not enabled or single consumer */ - if (unlikely(cache_size == 0 || is_mc == 0 || - n >= cache_size || lcore_id >= RTE_MAX_LCORE)) + /* No cache provided or single consumer */ + if (unlikely(cache == NULL || flags & MEMPOOL_F_SC_GET || + n >= cache->size)) goto ring_dequeue; - cache = &mp->local_cache[lcore_id]; cache_objs = cache->objs; /* Can this be satisfied from the cache? */ if (cache->len < n) { /* No. Backfill the cache first, and then fill from it */ - uint32_t req = n + (cache_size - cache->len); + uint32_t req = n + (cache->size - cache->len); /* How many do we require i.e. 
number to fill the cache + the request */ - ret = rte_ring_mc_dequeue_bulk(mp->ring, &cache->objs[cache->len], req); + ret = rte_mempool_ops_dequeue_bulk(mp, + &cache->objs[cache->len], req); if (unlikely(ret < 0)) { /* * In the offchance that we are buffer constrained, @@ -992,13 +1278,9 @@ __mempool_get_bulk(struct rte_mempool *mp, void **obj_table, return 0; ring_dequeue: -#endif /* RTE_MEMPOOL_CACHE_MAX_SIZE > 0 */ /* get remaining objects from ring */ - if (is_mc) - ret = rte_ring_mc_dequeue_bulk(mp->ring, obj_table, n); - else - ret = rte_ring_sc_dequeue_bulk(mp->ring, obj_table, n); + ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, n); if (ret < 0) __MEMPOOL_STAT_ADD(mp, get_fail, n); @@ -1009,7 +1291,7 @@ ring_dequeue: } /** - * Get several objects from the mempool (multi-consumers safe). + * Get several objects from the mempool. * * If cache is enabled, objects will be retrieved first from cache, * subsequently from the common pool. Note that it can return -ENOENT when @@ -1022,21 +1304,56 @@ ring_dequeue: * A pointer to a table of void * pointers (objects) that will be filled. * @param n * The number of objects to get from mempool to obj_table. + * @param cache + * A pointer to a mempool cache structure. May be NULL if not needed. + * @param flags + * The flags used for the mempool creation. + * Single-consumer (MEMPOOL_F_SC_GET flag) or multi-consumers. * @return * - 0: Success; objects taken. * - -ENOENT: Not enough entries in the mempool; no object is retrieved. 
*/ static inline int __attribute__((always_inline)) -rte_mempool_mc_get_bulk(struct rte_mempool *mp, void **obj_table, unsigned n) +rte_mempool_generic_get(struct rte_mempool *mp, void **obj_table, unsigned n, + struct rte_mempool_cache *cache, int flags) { int ret; - ret = __mempool_get_bulk(mp, obj_table, n, 1); + ret = __mempool_generic_get(mp, obj_table, n, cache, flags); if (ret == 0) __mempool_check_cookies(mp, obj_table, n, 1); return ret; } /** + * @deprecated + * Get several objects from the mempool (multi-consumers safe). + * + * If cache is enabled, objects will be retrieved first from cache, + * subsequently from the common pool. Note that it can return -ENOENT when + * the local cache and common pool are empty, even if cache from other + * lcores are full. + * + * @param mp + * A pointer to the mempool structure. + * @param obj_table + * A pointer to a table of void * pointers (objects) that will be filled. + * @param n + * The number of objects to get from mempool to obj_table. + * @return + * - 0: Success; objects taken. + * - -ENOENT: Not enough entries in the mempool; no object is retrieved. + */ +__rte_deprecated +static inline int __attribute__((always_inline)) +rte_mempool_mc_get_bulk(struct rte_mempool *mp, void **obj_table, unsigned n) +{ + struct rte_mempool_cache *cache; + cache = rte_mempool_default_cache(mp, rte_lcore_id()); + return rte_mempool_generic_get(mp, obj_table, n, cache, 0); +} + +/** + * @deprecated * Get several objects from the mempool (NOT multi-consumers safe). * * If cache is enabled, objects will be retrieved first from cache, @@ -1055,14 +1372,12 @@ rte_mempool_mc_get_bulk(struct rte_mempool *mp, void **obj_table, unsigned n) * - -ENOENT: Not enough entries in the mempool; no object is * retrieved. 
*/ +__rte_deprecated static inline int __attribute__((always_inline)) rte_mempool_sc_get_bulk(struct rte_mempool *mp, void **obj_table, unsigned n) { - int ret; - ret = __mempool_get_bulk(mp, obj_table, n, 0); - if (ret == 0) - __mempool_check_cookies(mp, obj_table, n, 1); - return ret; + return rte_mempool_generic_get(mp, obj_table, n, NULL, + MEMPOOL_F_SC_GET); } /** @@ -1090,15 +1405,13 @@ rte_mempool_sc_get_bulk(struct rte_mempool *mp, void **obj_table, unsigned n) static inline int __attribute__((always_inline)) rte_mempool_get_bulk(struct rte_mempool *mp, void **obj_table, unsigned n) { - int ret; - ret = __mempool_get_bulk(mp, obj_table, n, - !(mp->flags & MEMPOOL_F_SC_GET)); - if (ret == 0) - __mempool_check_cookies(mp, obj_table, n, 1); - return ret; + struct rte_mempool_cache *cache; + cache = rte_mempool_default_cache(mp, rte_lcore_id()); + return rte_mempool_generic_get(mp, obj_table, n, cache, mp->flags); } /** + * @deprecated * Get one object from the mempool (multi-consumers safe). * * If cache is enabled, objects will be retrieved first from cache, @@ -1114,13 +1427,17 @@ rte_mempool_get_bulk(struct rte_mempool *mp, void **obj_table, unsigned n) * - 0: Success; objects taken. * - -ENOENT: Not enough entries in the mempool; no object is retrieved. */ +__rte_deprecated static inline int __attribute__((always_inline)) rte_mempool_mc_get(struct rte_mempool *mp, void **obj_p) { - return rte_mempool_mc_get_bulk(mp, obj_p, 1); + struct rte_mempool_cache *cache; + cache = rte_mempool_default_cache(mp, rte_lcore_id()); + return rte_mempool_generic_get(mp, obj_p, 1, cache, 0); } /** + * @deprecated * Get one object from the mempool (NOT multi-consumers safe). * * If cache is enabled, objects will be retrieved first from cache, @@ -1136,10 +1453,11 @@ rte_mempool_mc_get(struct rte_mempool *mp, void **obj_p) * - 0: Success; objects taken. * - -ENOENT: Not enough entries in the mempool; no object is retrieved. 
*/ +__rte_deprecated static inline int __attribute__((always_inline)) rte_mempool_sc_get(struct rte_mempool *mp, void **obj_p) { - return rte_mempool_sc_get_bulk(mp, obj_p, 1); + return rte_mempool_generic_get(mp, obj_p, 1, NULL, MEMPOOL_F_SC_GET); } /** @@ -1173,6 +1491,21 @@ rte_mempool_get(struct rte_mempool *mp, void **obj_p) * * When cache is enabled, this function has to browse the length of * all lcores, so it should not be used in a data path, but only for + * debug purposes. User-owned mempool caches are not accounted for. + * + * @param mp + * A pointer to the mempool structure. + * @return + * The number of entries in the mempool. + */ +unsigned int rte_mempool_avail_count(const struct rte_mempool *mp); + +/** + * @deprecated + * Return the number of entries in the mempool. + * + * When cache is enabled, this function has to browse the length of + * all lcores, so it should not be used in a data path, but only for * debug purposes. * * @param mp @@ -1180,9 +1513,26 @@ rte_mempool_get(struct rte_mempool *mp, void **obj_p) * @return * The number of entries in the mempool. */ +__rte_deprecated unsigned rte_mempool_count(const struct rte_mempool *mp); /** + * Return the number of elements which have been allocated from the mempool + * + * When cache is enabled, this function has to browse the length of + * all lcores, so it should not be used in a data path, but only for + * debug purposes. + * + * @param mp + * A pointer to the mempool structure. + * @return + * The number of free entries in the mempool. + */ +unsigned int +rte_mempool_in_use_count(const struct rte_mempool *mp); + +/** + * @deprecated * Return the number of free entries in the mempool ring. * i.e. how many entries can be freed back to the mempool. * @@ -1192,17 +1542,18 @@ unsigned rte_mempool_count(const struct rte_mempool *mp); * * When cache is enabled, this function has to browse the length of * all lcores, so it should not be used in a data path, but only for - * debug purposes. 
+ * debug purposes. User-owned mempool caches are not accounted for. * * @param mp * A pointer to the mempool structure. * @return * The number of free entries in the mempool. */ +__rte_deprecated static inline unsigned rte_mempool_free_count(const struct rte_mempool *mp) { - return mp->size - rte_mempool_count(mp); + return rte_mempool_in_use_count(mp); } /** @@ -1210,7 +1561,7 @@ rte_mempool_free_count(const struct rte_mempool *mp) * * When cache is enabled, this function has to browse the length of all * lcores, so it should not be used in a data path, but only for debug - * purposes. + * purposes. User-owned mempool caches are not accounted for. * * @param mp * A pointer to the mempool structure. @@ -1221,7 +1572,7 @@ rte_mempool_free_count(const struct rte_mempool *mp) static inline int rte_mempool_full(const struct rte_mempool *mp) { - return !!(rte_mempool_count(mp) == mp->size); + return !!(rte_mempool_avail_count(mp) == mp->size); } /** @@ -1229,7 +1580,7 @@ rte_mempool_full(const struct rte_mempool *mp) * * When cache is enabled, this function has to browse the length of all * lcores, so it should not be used in a data path, but only for debug - * purposes. + * purposes. User-owned mempool caches are not accounted for. * * @param mp * A pointer to the mempool structure. @@ -1240,7 +1591,7 @@ rte_mempool_full(const struct rte_mempool *mp) static inline int rte_mempool_empty(const struct rte_mempool *mp) { - return !!(rte_mempool_count(mp) == 0); + return !!(rte_mempool_avail_count(mp) == 0); } /** @@ -1252,23 +1603,16 @@ rte_mempool_empty(const struct rte_mempool *mp) * A pointer (virtual address) to the element of the pool. * @return * The physical address of the elt element. + * If the mempool was created with MEMPOOL_F_NO_PHYS_CONTIG, the + * returned value is RTE_BAD_PHYS_ADDR. 
*/ static inline phys_addr_t -rte_mempool_virt2phy(const struct rte_mempool *mp, const void *elt) +rte_mempool_virt2phy(__rte_unused const struct rte_mempool *mp, const void *elt) { - if (rte_eal_has_hugepages()) { - uintptr_t off; - - off = (const char *)elt - (const char *)mp->elt_va_start; - return mp->elt_pa[off >> mp->pg_shift] + (off & mp->pg_mask); - } else { - /* - * If huge pages are disabled, we cannot assume the - * memory region to be physically contiguous. - * Lookup for each element. - */ - return rte_mem_virt2phy(elt); - } + const struct rte_mempool_objhdr *hdr; + hdr = (const struct rte_mempool_objhdr *)RTE_PTR_SUB(elt, + sizeof(*hdr)); + return hdr->physaddr; } /** @@ -1281,7 +1625,7 @@ rte_mempool_virt2phy(const struct rte_mempool *mp, const void *elt) * @param mp * A pointer to the mempool structure. */ -void rte_mempool_audit(const struct rte_mempool *mp); +void rte_mempool_audit(struct rte_mempool *mp); /** * Return a pointer to the private data in an mempool structure. @@ -1293,7 +1637,8 @@ void rte_mempool_audit(const struct rte_mempool *mp); */ static inline void *rte_mempool_get_priv(struct rte_mempool *mp) { - return (char *)mp + MEMPOOL_HEADER_SIZE(mp, mp->pg_num); + return (char *)mp + + MEMPOOL_HEADER_SIZE(mp, mp->cache_size); } /** @@ -1325,7 +1670,7 @@ struct rte_mempool *rte_mempool_lookup(const char *name); * calculates header, trailer, body and total sizes of the mempool object. * * @param elt_size - * The size of each element. + * The size of each element, without header and trailer. * @param flags * The flags used for the mempool creation. * Consult rte_mempool_create() for more information about possible values. @@ -1351,14 +1696,15 @@ uint32_t rte_mempool_calc_obj_size(uint32_t elt_size, uint32_t flags, * * @param elt_num * Number of elements. - * @param elt_sz - * The size of each element. + * @param total_elt_sz + * The size of each element, including header and trailer, as returned + * by rte_mempool_calc_obj_size(). 
* @param pg_shift - * LOG2 of the physical pages size. + * LOG2 of the physical pages size. If set to 0, ignore page boundaries. * @return * Required memory size aligned at page boundary. */ -size_t rte_mempool_xmem_size(uint32_t elt_num, size_t elt_sz, +size_t rte_mempool_xmem_size(uint32_t elt_num, size_t total_elt_sz, uint32_t pg_shift); /** @@ -1372,8 +1718,9 @@ size_t rte_mempool_xmem_size(uint32_t elt_num, size_t elt_sz, * Will be used to store mempool objects. * @param elt_num * Number of elements. - * @param elt_sz - * The size of each element. + * @param total_elt_sz + * The size of each element, including header and trailer, as returned + * by rte_mempool_calc_obj_size(). * @param paddr * Array of physical addresses of the pages that comprises given memory * buffer. @@ -1387,8 +1734,9 @@ size_t rte_mempool_xmem_size(uint32_t elt_num, size_t elt_sz, * buffer is too small, return a negative value whose absolute value * is the actual number of elements that can be stored in that buffer. */ -ssize_t rte_mempool_xmem_usage(void *vaddr, uint32_t elt_num, size_t elt_sz, - const phys_addr_t paddr[], uint32_t pg_num, uint32_t pg_shift); +ssize_t rte_mempool_xmem_usage(void *vaddr, uint32_t elt_num, + size_t total_elt_sz, const phys_addr_t paddr[], uint32_t pg_num, + uint32_t pg_shift); /** * Walk list of all memory pools @@ -1398,7 +1746,7 @@ ssize_t rte_mempool_xmem_usage(void *vaddr, uint32_t elt_num, size_t elt_sz, * @param arg * Argument passed to iterator */ -void rte_mempool_walk(void (*func)(const struct rte_mempool *, void *arg), +void rte_mempool_walk(void (*func)(struct rte_mempool *, void *arg), void *arg); #ifdef __cplusplus diff --git a/lib/librte_mempool/rte_mempool_ops.c b/lib/librte_mempool/rte_mempool_ops.c new file mode 100644 index 00000000..fd0b64cf --- /dev/null +++ b/lib/librte_mempool/rte_mempool_ops.c @@ -0,0 +1,151 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. All rights reserved. + * Copyright(c) 2016 6WIND S.A. 
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <stdio.h> +#include <string.h> + +#include <rte_mempool.h> +#include <rte_errno.h> + +/* indirect jump table to support external memory pools. */ +struct rte_mempool_ops_table rte_mempool_ops_table = { + .sl = RTE_SPINLOCK_INITIALIZER, + .num_ops = 0 +}; + +/* add a new ops struct in rte_mempool_ops_table, return its index. 
*/ +int +rte_mempool_register_ops(const struct rte_mempool_ops *h) +{ + struct rte_mempool_ops *ops; + int16_t ops_index; + + rte_spinlock_lock(&rte_mempool_ops_table.sl); + + if (rte_mempool_ops_table.num_ops >= + RTE_MEMPOOL_MAX_OPS_IDX) { + rte_spinlock_unlock(&rte_mempool_ops_table.sl); + RTE_LOG(ERR, MEMPOOL, + "Maximum number of mempool ops structs exceeded\n"); + return -ENOSPC; + } + + if (h->alloc == NULL || h->enqueue == NULL || + h->dequeue == NULL || h->get_count == NULL) { + rte_spinlock_unlock(&rte_mempool_ops_table.sl); + RTE_LOG(ERR, MEMPOOL, + "Missing callback while registering mempool ops\n"); + return -EINVAL; + } + + if (strlen(h->name) >= sizeof(ops->name) - 1) { + rte_spinlock_unlock(&rte_mempool_ops_table.sl); + RTE_LOG(DEBUG, EAL, "%s(): mempool_ops <%s>: name too long\n", + __func__, h->name); + rte_errno = EEXIST; + return -EEXIST; + } + + ops_index = rte_mempool_ops_table.num_ops++; + ops = &rte_mempool_ops_table.ops[ops_index]; + snprintf(ops->name, sizeof(ops->name), "%s", h->name); + ops->alloc = h->alloc; + ops->enqueue = h->enqueue; + ops->dequeue = h->dequeue; + ops->get_count = h->get_count; + + rte_spinlock_unlock(&rte_mempool_ops_table.sl); + + return ops_index; +} + +/* wrapper to allocate an external mempool's private (pool) data. */ +int +rte_mempool_ops_alloc(struct rte_mempool *mp) +{ + struct rte_mempool_ops *ops; + + ops = rte_mempool_get_ops(mp->ops_index); + return ops->alloc(mp); +} + +/* wrapper to free an external pool ops. */ +void +rte_mempool_ops_free(struct rte_mempool *mp) +{ + struct rte_mempool_ops *ops; + + ops = rte_mempool_get_ops(mp->ops_index); + if (ops->free == NULL) + return; + ops->free(mp); +} + +/* wrapper to get available objects in an external mempool. 
 */
+unsigned int
+rte_mempool_ops_get_count(const struct rte_mempool *mp)
+{
+	struct rte_mempool_ops *ops;
+
+	/* dispatch to the get_count callback of the ops selected for mp */
+	ops = rte_mempool_get_ops(mp->ops_index);
+	return ops->get_count(mp);
+}
+
+/* sets mempool ops previously registered by rte_mempool_register_ops. */
+int
+rte_mempool_set_ops_byname(struct rte_mempool *mp, const char *name,
+	void *pool_config)
+{
+	struct rte_mempool_ops *ops = NULL;
+	unsigned i;
+
+	/* too late, the mempool is already populated. */
+	if (mp->flags & MEMPOOL_F_POOL_CREATED)
+		return -EEXIST;
+
+	/* linear search of the registered ops for an exact name match */
+	for (i = 0; i < rte_mempool_ops_table.num_ops; i++) {
+		if (!strcmp(name,
+				rte_mempool_ops_table.ops[i].name)) {
+			ops = &rte_mempool_ops_table.ops[i];
+			break;
+		}
+	}
+
+	/* no ops are registered under that name */
+	if (ops == NULL)
+		return -EINVAL;
+
+	/* the loop was left through break, so i is the matching table slot */
+	mp->ops_index = i;
+	mp->pool_config = pool_config;
+	return 0;
+}
diff --git a/lib/librte_mempool/rte_mempool_ring.c b/lib/librte_mempool/rte_mempool_ring.c
new file mode 100644
index 00000000..b9aa64dd
--- /dev/null
+++ b/lib/librte_mempool/rte_mempool_ring.c
@@ -0,0 +1,161 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include <rte_errno.h>
+#include <rte_ring.h>
+#include <rte_mempool.h>
+
+/* enqueue callback: multi-producer bulk put into the ring in pool_data */
+static int
+common_ring_mp_enqueue(struct rte_mempool *mp, void * const *obj_table,
+		unsigned n)
+{
+	return rte_ring_mp_enqueue_bulk(mp->pool_data, obj_table, n);
+}
+
+/* enqueue callback: single-producer bulk put into the ring in pool_data */
+static int
+common_ring_sp_enqueue(struct rte_mempool *mp, void * const *obj_table,
+		unsigned n)
+{
+	return rte_ring_sp_enqueue_bulk(mp->pool_data, obj_table, n);
+}
+
+/* dequeue callback: multi-consumer bulk get from the ring in pool_data */
+static int
+common_ring_mc_dequeue(struct rte_mempool *mp, void **obj_table, unsigned n)
+{
+	return rte_ring_mc_dequeue_bulk(mp->pool_data, obj_table, n);
+}
+
+/* dequeue callback: single-consumer bulk get from the ring in pool_data */
+static int
+common_ring_sc_dequeue(struct rte_mempool *mp, void **obj_table, unsigned n)
+{
+	return rte_ring_sc_dequeue_bulk(mp->pool_data, obj_table, n);
+}
+
+/* get_count callback: number of entries currently stored in the ring */
+static unsigned
+common_ring_get_count(const struct rte_mempool *mp)
+{
+	return rte_ring_count(mp->pool_data);
+}
+
+
+/* alloc callback: create the backing ring and store it in mp->pool_data */
+static int
+common_ring_alloc(struct rte_mempool *mp)
+{
+	int rg_flags = 0, ret;
+	char rg_name[RTE_RING_NAMESIZE];
+	struct rte_ring *r;
+
+	/* the ring name is derived from the mempool name; reject truncation */
+	ret = snprintf(rg_name, sizeof(rg_name),
+		RTE_MEMPOOL_MZ_FORMAT, mp->name);
+	if (ret < 0 || ret >= (int)sizeof(rg_name)) {
+		rte_errno =
ENAMETOOLONG;
+		return -rte_errno;
+	}
+
+	/* ring flags */
+	if (mp->flags & MEMPOOL_F_SP_PUT)
+		rg_flags |= RING_F_SP_ENQ;
+	if (mp->flags & MEMPOOL_F_SC_GET)
+		rg_flags |= RING_F_SC_DEQ;
+
+	/*
+	 * Allocate the ring that will be used to store objects.
+	 * Ring functions will return appropriate errors if we are
+	 * running as a secondary process etc., so no checks made
+	 * in this function for that condition.
+	 */
+	r = rte_ring_create(rg_name, rte_align32pow2(mp->size + 1),
+		mp->socket_id, rg_flags);
+	if (r == NULL)
+		return -rte_errno;
+
+	mp->pool_data = r;
+
+	return 0;
+}
+
+/* free callback: release the ring stored in mp->pool_data */
+static void
+common_ring_free(struct rte_mempool *mp)
+{
+	rte_ring_free(mp->pool_data);
+}
+
+/*
+ * The following 4 declarations of mempool ops structs address
+ * the need for the backward compatible mempool handlers for
+ * single/multi producers and single/multi consumers as dictated by the
+ * flags provided to the rte_mempool_create function
+ */
+/* multi-producer enqueue, multi-consumer dequeue */
+static const struct rte_mempool_ops ops_mp_mc = {
+	.name = "ring_mp_mc",
+	.alloc = common_ring_alloc,
+	.free = common_ring_free,
+	.enqueue = common_ring_mp_enqueue,
+	.dequeue = common_ring_mc_dequeue,
+	.get_count = common_ring_get_count,
+};
+
+/* single-producer enqueue, single-consumer dequeue */
+static const struct rte_mempool_ops ops_sp_sc = {
+	.name = "ring_sp_sc",
+	.alloc = common_ring_alloc,
+	.free = common_ring_free,
+	.enqueue = common_ring_sp_enqueue,
+	.dequeue = common_ring_sc_dequeue,
+	.get_count = common_ring_get_count,
+};
+
+/* multi-producer enqueue, single-consumer dequeue */
+static const struct rte_mempool_ops ops_mp_sc = {
+	.name = "ring_mp_sc",
+	.alloc = common_ring_alloc,
+	.free = common_ring_free,
+	.enqueue = common_ring_mp_enqueue,
+	.dequeue = common_ring_sc_dequeue,
+	.get_count = common_ring_get_count,
+};
+
+/* single-producer enqueue, multi-consumer dequeue */
+static const struct rte_mempool_ops ops_sp_mc = {
+	.name = "ring_sp_mc",
+	.alloc = common_ring_alloc,
+	.free = common_ring_free,
+	.enqueue = common_ring_sp_enqueue,
+	.dequeue = common_ring_mc_dequeue,
+	.get_count = common_ring_get_count,
+};
+
+MEMPOOL_REGISTER_OPS(ops_mp_mc);
+MEMPOOL_REGISTER_OPS(ops_sp_sc); +MEMPOOL_REGISTER_OPS(ops_mp_sc); +MEMPOOL_REGISTER_OPS(ops_sp_mc); diff --git a/lib/librte_mempool/rte_mempool_stack.c b/lib/librte_mempool/rte_mempool_stack.c new file mode 100644 index 00000000..5fd8af24 --- /dev/null +++ b/lib/librte_mempool/rte_mempool_stack.c @@ -0,0 +1,147 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+#include <stdio.h>
+#include <rte_mempool.h>
+#include <rte_malloc.h>
+
+/* LIFO object store protected by a single spinlock. */
+struct rte_mempool_stack {
+	rte_spinlock_t sl;
+
+	uint32_t size;	/* capacity, set from mp->size in stack_alloc() */
+	uint32_t len;	/* number of object pointers currently stored */
+	void *objs[];
+};
+
+/* alloc callback: allocate the stack on the mempool's socket. */
+static int
+stack_alloc(struct rte_mempool *mp)
+{
+	struct rte_mempool_stack *s;
+	unsigned n = mp->size;
+	/* computed in size_t so a large pool cannot overflow an int */
+	size_t size = sizeof(*s) + ((size_t)n + 16) * sizeof(void *);
+
+	/* Allocate our local memory structure */
+	s = rte_zmalloc_socket("mempool-stack",
+			size,
+			RTE_CACHE_LINE_SIZE,
+			mp->socket_id);
+	if (s == NULL) {
+		RTE_LOG(ERR, MEMPOOL, "Cannot allocate stack!\n");
+		return -ENOMEM;
+	}
+
+	rte_spinlock_init(&s->sl);
+
+	s->size = n;
+	mp->pool_data = s;
+
+	return 0;
+}
+
+/*
+ * enqueue callback: push n objects.
+ * Returns 0 on success, -ENOBUFS when they do not all fit (nothing is
+ * stored in that case).
+ */
+static int
+stack_enqueue(struct rte_mempool *mp, void * const *obj_table,
+		unsigned n)
+{
+	struct rte_mempool_stack *s = mp->pool_data;
+	void **cache_objs;
+	unsigned index;
+
+	rte_spinlock_lock(&s->sl);
+	cache_objs = &s->objs[s->len];
+
+	/* Is there sufficient space in the stack ? */
+	if ((s->len + n) > s->size) {
+		rte_spinlock_unlock(&s->sl);
+		return -ENOBUFS;
+	}
+
+	/* Add elements back into the cache */
+	for (index = 0; index < n; ++index, obj_table++)
+		cache_objs[index] = *obj_table;
+
+	s->len += n;
+
+	rte_spinlock_unlock(&s->sl);
+	return 0;
+}
+
+/*
+ * dequeue callback: pop n objects, most recently pushed first.
+ * Returns 0 on success, -ENOENT when fewer than n objects are stored
+ * (nothing is removed in that case).
+ */
+static int
+stack_dequeue(struct rte_mempool *mp, void **obj_table,
+		unsigned n)
+{
+	struct rte_mempool_stack *s = mp->pool_data;
+	void **cache_objs;
+	unsigned index, len;
+
+	rte_spinlock_lock(&s->sl);
+
+	if (unlikely(n > s->len)) {
+		rte_spinlock_unlock(&s->sl);
+		return -ENOENT;
+	}
+
+	cache_objs = s->objs;
+
+	for (index = 0, len = s->len - 1; index < n;
+			++index, len--, obj_table++)
+		*obj_table = cache_objs[len];
+
+	s->len -= n;
+	rte_spinlock_unlock(&s->sl);
+	/*
+	 * Success is reported as 0, like stack_enqueue() above, rather
+	 * than as the object count, so a caller comparing the result of a
+	 * bulk get against 0 does not mistake success for failure.
+	 */
+	return 0;
+}
+
+/* get_count callback: number of objects currently stored (read unlocked). */
+static unsigned
+stack_get_count(const struct rte_mempool *mp)
+{
+	struct rte_mempool_stack *s = mp->pool_data;
+
+	return s->len;
+}
+
+/* free callback: release the stack allocated by stack_alloc(). */
+static void
+stack_free(struct rte_mempool *mp)
+{
+	rte_free((void *)(mp->pool_data));
+}
+
+static struct rte_mempool_ops
ops_stack = { + .name = "stack", + .alloc = stack_alloc, + .free = stack_free, + .enqueue = stack_enqueue, + .dequeue = stack_dequeue, + .get_count = stack_get_count +}; + +MEMPOOL_REGISTER_OPS(ops_stack); diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map index 17151e08..dee1c990 100644 --- a/lib/librte_mempool/rte_mempool_version.map +++ b/lib/librte_mempool/rte_mempool_version.map @@ -1,7 +1,6 @@ DPDK_2.0 { global: - rte_dom0_mempool_create; rte_mempool_audit; rte_mempool_calc_obj_size; rte_mempool_count; @@ -9,7 +8,6 @@ DPDK_2.0 { rte_mempool_dump; rte_mempool_list_dump; rte_mempool_lookup; - rte_mempool_obj_iter; rte_mempool_walk; rte_mempool_xmem_create; rte_mempool_xmem_size; @@ -17,3 +15,30 @@ DPDK_2.0 { local: *; }; + +DPDK_16.07 { + global: + + rte_mempool_avail_count; + rte_mempool_cache_create; + rte_mempool_cache_flush; + rte_mempool_cache_free; + rte_mempool_check_cookies; + rte_mempool_create_empty; + rte_mempool_default_cache; + rte_mempool_free; + rte_mempool_generic_get; + rte_mempool_generic_put; + rte_mempool_in_use_count; + rte_mempool_mem_iter; + rte_mempool_obj_iter; + rte_mempool_ops_table; + rte_mempool_populate_anon; + rte_mempool_populate_default; + rte_mempool_populate_phys; + rte_mempool_populate_phys_tab; + rte_mempool_populate_virt; + rte_mempool_register_ops; + rte_mempool_set_ops_byname; + +} DPDK_2.0; diff --git a/lib/librte_pdump/Makefile b/lib/librte_pdump/Makefile new file mode 100644 index 00000000..166441a2 --- /dev/null +++ b/lib/librte_pdump/Makefile @@ -0,0 +1,57 @@ +# BSD LICENSE +# +# Copyright(c) 2016 Intel Corporation. All rights reserved. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_pdump.a + +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 +CFLAGS += -D_GNU_SOURCE +LDLIBS += -lpthread + +EXPORT_MAP := rte_pdump_version.map + +LIBABIVER := 1 + +# all source are stored in SRCS-y +SRCS-$(CONFIG_RTE_LIBRTE_PDUMP) := rte_pdump.c + +# install this header file +SYMLINK-$(CONFIG_RTE_LIBRTE_PDUMP)-include := rte_pdump.h + +# this lib depends upon: +DEPDIRS-$(CONFIG_RTE_LIBRTE_PDUMP) += lib/librte_mbuf +DEPDIRS-$(CONFIG_RTE_LIBRTE_PDUMP) += lib/librte_mempool +DEPDIRS-$(CONFIG_RTE_LIBRTE_PDUMP) += lib/librte_eal +DEPDIRS-$(CONFIG_RTE_LIBRTE_PDUMP) += lib/librte_ether + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/lib/librte_pdump/rte_pdump.c b/lib/librte_pdump/rte_pdump.c new file mode 100644 index 00000000..ee566cb2 --- /dev/null +++ b/lib/librte_pdump/rte_pdump.c @@ -0,0 +1,959 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/socket.h> +#include <sys/un.h> +#include <sys/stat.h> +#include <unistd.h> +#include <sys/types.h> +#include <pthread.h> +#include <stdbool.h> +#include <stdio.h> + +#include <rte_memcpy.h> +#include <rte_mbuf.h> +#include <rte_ethdev.h> +#include <rte_lcore.h> +#include <rte_log.h> +#include <rte_errno.h> +#include <rte_pci.h> + +#include "rte_pdump.h" + +#define SOCKET_PATH_VAR_RUN "/var/run" +#define SOCKET_PATH_HOME "HOME" +#define DPDK_DIR "/.dpdk" +#define SOCKET_DIR "/pdump_sockets" +#define SERVER_SOCKET "%s/pdump_server_socket" +#define CLIENT_SOCKET "%s/pdump_client_socket_%d_%u" +#define DEVICE_ID_SIZE 64 +/* Macros for printing using RTE_LOG */ +#define RTE_LOGTYPE_PDUMP RTE_LOGTYPE_USER1 + +enum pdump_operation { + DISABLE = 1, + ENABLE = 2 +}; + +enum pdump_version { + V1 = 1 +}; + +static pthread_t pdump_thread; +static int pdump_socket_fd; +static char server_socket_dir[PATH_MAX]; +static char client_socket_dir[PATH_MAX]; + +struct pdump_request { + uint16_t ver; + uint16_t op; + uint32_t flags; + union pdump_data { + struct enable_v1 { + char device[DEVICE_ID_SIZE]; + uint16_t queue; + struct rte_ring *ring; + struct rte_mempool *mp; + void *filter; + } en_v1; + struct disable_v1 { + char device[DEVICE_ID_SIZE]; + uint16_t queue; + struct rte_ring *ring; + struct rte_mempool *mp; + void *filter; + } dis_v1; + } data; +}; + +struct pdump_response { + uint16_t ver; + uint16_t res_op; + int32_t 
err_value;
+};
+
+/* Per port/queue capture state handed to the rx/tx callbacks. */
+static struct pdump_rxtx_cbs {
+	struct rte_ring *ring;
+	struct rte_mempool *mp;
+	struct rte_eth_rxtx_callback *cb;
+	void *filter;
+} rx_cbs[RTE_MAX_ETHPORTS][RTE_MAX_QUEUES_PER_PORT],
+tx_cbs[RTE_MAX_ETHPORTS][RTE_MAX_QUEUES_PER_PORT];
+
+/*
+ * Copy the metadata and the data of one segment of m into seg.
+ * Returns 0 on success, -EINVAL when seg has less tailroom than
+ * m->data_len.
+ */
+static inline int
+pdump_pktmbuf_copy_data(struct rte_mbuf *seg, const struct rte_mbuf *m)
+{
+	if (rte_pktmbuf_tailroom(seg) < m->data_len) {
+		RTE_LOG(ERR, PDUMP,
+			"User mempool: insufficient data_len of mbuf\n");
+		return -EINVAL;
+	}
+
+	seg->port = m->port;
+	seg->vlan_tci = m->vlan_tci;
+	seg->hash = m->hash;
+	seg->tx_offload = m->tx_offload;
+	seg->ol_flags = m->ol_flags;
+	seg->packet_type = m->packet_type;
+	seg->vlan_tci_outer = m->vlan_tci_outer;
+	seg->data_len = m->data_len;
+	seg->pkt_len = seg->data_len;
+	rte_memcpy(rte_pktmbuf_mtod(seg, void *),
+			rte_pktmbuf_mtod(m, void *),
+			rte_pktmbuf_data_len(seg));
+
+	return 0;
+}
+
+/*
+ * Duplicate the whole segment chain of m using mbufs taken from mp.
+ * Returns the head of the copy, or NULL when an mbuf cannot be allocated
+ * or a segment does not fit; everything allocated so far is released.
+ */
+static inline struct rte_mbuf *
+pdump_pktmbuf_copy(struct rte_mbuf *m, struct rte_mempool *mp)
+{
+	struct rte_mbuf *m_dup, *seg, **prev;
+	uint32_t pktlen;
+	uint8_t nseg;
+
+	m_dup = rte_pktmbuf_alloc(mp);
+	if (unlikely(m_dup == NULL))
+		return NULL;
+
+	seg = m_dup;
+	prev = &seg->next;
+	pktlen = m->pkt_len;
+	nseg = 0;
+
+	do {
+		nseg++;
+		if (pdump_pktmbuf_copy_data(seg, m) < 0) {
+			/*
+			 * seg has not been linked yet and *prev may still
+			 * hold the temporary self reference written by the
+			 * first iteration: terminate the chain first, then
+			 * release the orphan segment on its own.
+			 */
+			*prev = NULL;
+			if (seg != m_dup)
+				rte_pktmbuf_free_seg(seg);
+			rte_pktmbuf_free(m_dup);
+			return NULL;
+		}
+		*prev = seg;
+		prev = &seg->next;
+	} while ((m = m->next) != NULL &&
+			(seg = rte_pktmbuf_alloc(mp)) != NULL);
+
+	*prev = NULL;
+	m_dup->nb_segs = nseg;
+	m_dup->pkt_len = pktlen;
+
+	/* Allocation of new indirect segment failed */
+	if (unlikely(seg == NULL)) {
+		rte_pktmbuf_free(m_dup);
+		return NULL;
+	}
+
+	__rte_mbuf_sanity_check(m_dup, 1);
+	return m_dup;
+}
+
+static inline void
+pdump_copy(struct rte_mbuf **pkts, uint16_t nb_pkts, void *user_params)
+{
+	unsigned i;
+	int ring_enq;
+	uint16_t d_pkts = 0;
+	struct rte_mbuf *dup_bufs[nb_pkts];
+	struct pdump_rxtx_cbs *cbs;
+	struct rte_ring *ring;
+	struct rte_mempool *mp;
+	
struct rte_mbuf *p;
+
+	cbs = user_params;
+	ring = cbs->ring;
+	mp = cbs->mp;
+	/* duplicate every packet; packets that cannot be copied are skipped */
+	for (i = 0; i < nb_pkts; i++) {
+		p = pdump_pktmbuf_copy(pkts[i], mp);
+		if (p)
+			dup_bufs[d_pkts++] = p;
+	}
+
+	ring_enq = rte_ring_enqueue_burst(ring, (void *)dup_bufs, d_pkts);
+	/* free the copies the ring did not accept */
+	if (unlikely(ring_enq < d_pkts)) {
+		RTE_LOG(DEBUG, PDUMP,
+			"only %d of packets enqueued to ring\n", ring_enq);
+		do {
+			rte_pktmbuf_free(dup_bufs[ring_enq]);
+		} while (++ring_enq < d_pkts);
+	}
+}
+
+/* rx callback: copy the burst to the capture ring, pass it on unchanged */
+static uint16_t
+pdump_rx(uint8_t port __rte_unused, uint16_t qidx __rte_unused,
+	struct rte_mbuf **pkts, uint16_t nb_pkts,
+	uint16_t max_pkts __rte_unused,
+	void *user_params)
+{
+	pdump_copy(pkts, nb_pkts, user_params);
+	return nb_pkts;
+}
+
+/* tx callback: copy the burst to the capture ring, pass it on unchanged */
+static uint16_t
+pdump_tx(uint8_t port __rte_unused, uint16_t qidx __rte_unused,
+		struct rte_mbuf **pkts, uint16_t nb_pkts, void *user_params)
+{
+	pdump_copy(pkts, nb_pkts, user_params);
+	return nb_pkts;
+}
+
+/*
+ * Parse device_id as a PCI address and write it back, formatted, to domBDF.
+ * Returns -1 when device_id does not parse, otherwise the snprintf() result.
+ */
+static int
+pdump_get_dombdf(char *device_id, char *domBDF, size_t len)
+{
+	int ret;
+	struct rte_pci_addr dev_addr = {0};
+
+	/* identify if device_id is pci address or name */
+	ret = eal_parse_pci_DomBDF(device_id, &dev_addr);
+	if (ret < 0)
+		return -1;
+
+	/* the domain is only printed when it is non-zero */
+	if (dev_addr.domain)
+		ret = snprintf(domBDF, len, "%u:%u:%u.%u", dev_addr.domain,
+				dev_addr.bus, dev_addr.devid,
+				dev_addr.function);
+	else
+		ret = snprintf(domBDF, len, "%u:%u.%u", dev_addr.bus,
+				dev_addr.devid,
+				dev_addr.function);
+
+	return ret;
+}
+
+static int
+pdump_regitser_rx_callbacks(uint16_t end_q, uint8_t port, uint16_t queue,
+				struct rte_ring *ring, struct rte_mempool *mp,
+				uint16_t operation)
+{
+	uint16_t qid;
+	struct pdump_rxtx_cbs *cbs = NULL;
+
+	qid = (queue == RTE_PDUMP_ALL_QUEUES) ?
0 : queue;
+	/* walk the requested queue, or every queue below end_q */
+	for (; qid < end_q; qid++) {
+		cbs = &rx_cbs[port][qid];
+		if (cbs && operation == ENABLE) {
+			if (cbs->cb) {
+				RTE_LOG(ERR, PDUMP,
+					"failed to add rx callback for port=%d "
+					"and queue=%d, callback already exists\n",
+					port, qid);
+				return -EEXIST;
+			}
+			cbs->ring = ring;
+			cbs->mp = mp;
+			cbs->cb = rte_eth_add_first_rx_callback(port, qid,
+								pdump_rx, cbs);
+			if (cbs->cb == NULL) {
+				RTE_LOG(ERR, PDUMP,
+					"failed to add rx callback, errno=%d\n",
+					rte_errno);
+				/*
+				 * Return a negative value like every other
+				 * error path here: the caller only treats
+				 * ret < 0 as a failure.
+				 */
+				return -rte_errno;
+			}
+		}
+		if (cbs && operation == DISABLE) {
+			int ret;
+
+			if (cbs->cb == NULL) {
+				RTE_LOG(ERR, PDUMP,
+					"failed to delete non existing rx "
+					"callback for port=%d and queue=%d\n",
+					port, qid);
+				return -EINVAL;
+			}
+			ret = rte_eth_remove_rx_callback(port, qid, cbs->cb);
+			if (ret < 0) {
+				RTE_LOG(ERR, PDUMP,
+					"failed to remove rx callback, errno=%d\n",
+					rte_errno);
+				return ret;
+			}
+			/* mark the slot free so capture can be enabled again */
+			cbs->cb = NULL;
+		}
+	}
+
+	return 0;
+}
+
+static int
+pdump_regitser_tx_callbacks(uint16_t end_q, uint8_t port, uint16_t queue,
+				struct rte_ring *ring, struct rte_mempool *mp,
+				uint16_t operation)
+{
+
+	uint16_t qid;
+	struct pdump_rxtx_cbs *cbs = NULL;
+
+	qid = (queue == RTE_PDUMP_ALL_QUEUES) ?
0 : queue;
+	/* walk the requested queue, or every queue below end_q */
+	for (; qid < end_q; qid++) {
+		cbs = &tx_cbs[port][qid];
+		if (cbs && operation == ENABLE) {
+			if (cbs->cb) {
+				RTE_LOG(ERR, PDUMP,
+					"failed to add tx callback for port=%d "
+					"and queue=%d, callback already exists\n",
+					port, qid);
+				return -EEXIST;
+			}
+			cbs->ring = ring;
+			cbs->mp = mp;
+			cbs->cb = rte_eth_add_tx_callback(port, qid, pdump_tx,
+								cbs);
+			if (cbs->cb == NULL) {
+				RTE_LOG(ERR, PDUMP,
+					"failed to add tx callback, errno=%d\n",
+					rte_errno);
+				/*
+				 * Return a negative value like every other
+				 * error path here, so that a check for
+				 * ret < 0 catches this failure as well.
+				 */
+				return -rte_errno;
+			}
+		}
+		if (cbs && operation == DISABLE) {
+			int ret;
+
+			if (cbs->cb == NULL) {
+				RTE_LOG(ERR, PDUMP,
+					"failed to delete non existing tx "
+					"callback for port=%d and queue=%d\n",
+					port, qid);
+				return -EINVAL;
+			}
+			ret = rte_eth_remove_tx_callback(port, qid, cbs->cb);
+			if (ret < 0) {
+				RTE_LOG(ERR, PDUMP,
+					"failed to remove tx callback, errno=%d\n",
+					rte_errno);
+				return ret;
+			}
+			/* mark the slot free so capture can be enabled again */
+			cbs->cb = NULL;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Apply one client request: resolve the device name to a port and add or
+ * remove the rx/tx capture callbacks selected by the request flags.
+ * Returns 0 on success, a negative value on failure.
+ */
+static int
+set_pdump_rxtx_cbs(struct pdump_request *p)
+{
+	/* queue counts are only filled in for RTE_PDUMP_ALL_QUEUES requests */
+	uint16_t nb_rx_q = 0, nb_tx_q = 0, end_q, queue;
+	uint8_t port;
+	int ret = 0;
+	uint32_t flags;
+	uint16_t operation;
+	struct rte_ring *ring;
+	struct rte_mempool *mp;
+
+	flags = p->flags;
+	operation = p->op;
+	if (operation == ENABLE) {
+		ret = rte_eth_dev_get_port_by_name(p->data.en_v1.device,
+				&port);
+		if (ret < 0) {
+			RTE_LOG(ERR, PDUMP,
+				"failed to get portid for device id=%s\n",
+				p->data.en_v1.device);
+			return -EINVAL;
+		}
+		queue = p->data.en_v1.queue;
+		ring = p->data.en_v1.ring;
+		mp = p->data.en_v1.mp;
+	} else {
+		ret = rte_eth_dev_get_port_by_name(p->data.dis_v1.device,
+				&port);
+		if (ret < 0) {
+			RTE_LOG(ERR, PDUMP,
+				"failed to get portid for device id=%s\n",
+				p->data.dis_v1.device);
+			return -EINVAL;
+		}
+		queue = p->data.dis_v1.queue;
+		ring = p->data.dis_v1.ring;
+		mp = p->data.dis_v1.mp;
+	}
+
+	/* validation if packet capture is for all queues */
+	if (queue == RTE_PDUMP_ALL_QUEUES) {
+		struct rte_eth_dev_info dev_info;
+
+		rte_eth_dev_info_get(port, &dev_info);
+		
nb_rx_q = dev_info.nb_rx_queues;
+		nb_tx_q = dev_info.nb_tx_queues;
+		if (nb_rx_q == 0 && flags & RTE_PDUMP_FLAG_RX) {
+			RTE_LOG(ERR, PDUMP,
+				"number of rx queues cannot be 0\n");
+			return -EINVAL;
+		}
+		if (nb_tx_q == 0 && flags & RTE_PDUMP_FLAG_TX) {
+			RTE_LOG(ERR, PDUMP,
+				"number of tx queues cannot be 0\n");
+			return -EINVAL;
+		}
+		if ((nb_tx_q == 0 || nb_rx_q == 0) &&
+			flags == RTE_PDUMP_FLAG_RXTX) {
+			RTE_LOG(ERR, PDUMP,
+				"both tx&rx queues must be non zero\n");
+			return -EINVAL;
+		}
+	}
+
+	/* register RX callback */
+	if (flags & RTE_PDUMP_FLAG_RX) {
+		/* one queue: [queue, queue + 1); all queues: [0, nb_rx_q) */
+		end_q = (queue == RTE_PDUMP_ALL_QUEUES) ? nb_rx_q : queue + 1;
+		ret = pdump_regitser_rx_callbacks(end_q, port, queue, ring, mp,
+							operation);
+		if (ret < 0)
+			return ret;
+	}
+
+	/* register TX callback */
+	if (flags & RTE_PDUMP_FLAG_TX) {
+		/* one queue: [queue, queue + 1); all queues: [0, nb_tx_q) */
+		end_q = (queue == RTE_PDUMP_ALL_QUEUES) ? nb_tx_q : queue + 1;
+		ret = pdump_regitser_tx_callbacks(end_q, port, queue, ring, mp,
+							operation);
+		if (ret < 0)
+			return ret;
+	}
+
+	return ret;
+}
+
+/* get socket path (/var/run if root, $HOME otherwise) */
+static int
+pdump_get_socket_path(char *buffer, int bufsz, enum rte_pdump_socktype type)
+{
+	char dpdk_dir[PATH_MAX] = {0};
+	char dir[PATH_MAX] = {0};
+	char *dir_home = NULL;
+
+	/* a directory configured for this socket type takes precedence */
+	if (type == RTE_PDUMP_SOCKET_SERVER && server_socket_dir[0] != 0)
+		snprintf(dir, sizeof(dir), "%s", server_socket_dir);
+	else if (type == RTE_PDUMP_SOCKET_CLIENT && client_socket_dir[0] != 0)
+		snprintf(dir, sizeof(dir), "%s", client_socket_dir);
+	else {
+		if (getuid() != 0) {
+			dir_home = getenv(SOCKET_PATH_HOME);
+			if (!dir_home) {
+				RTE_LOG(ERR, PDUMP,
+					"Failed to get environment variable"
+					" value for %s, %s:%d\n",
+					SOCKET_PATH_HOME, __func__, __LINE__);
+				return -1;
+			}
+			snprintf(dpdk_dir, sizeof(dpdk_dir), "%s%s",
+					dir_home, DPDK_DIR);
+		} else
+			snprintf(dpdk_dir, sizeof(dpdk_dir), "%s%s",
+					SOCKET_PATH_VAR_RUN, DPDK_DIR);
+
+		/*
+		 * The mode must be octal: decimal 700 is 01274, which
+		 * leaves the owner without read or search permission.
+		 * The result is ignored on purpose, the directory may
+		 * already exist.
+		 */
+		mkdir(dpdk_dir, 0700);
+		snprintf(dir, sizeof(dir), "%s%s",
+				dpdk_dir, SOCKET_DIR);
+	}
+
+	
mkdir(dir, 0700); /* octal owner-only mode; decimal 700 would be 01274 */
+	if (type == RTE_PDUMP_SOCKET_SERVER)
+		snprintf(buffer, bufsz, SERVER_SOCKET, dir);
+	else
+		/* client sockets are made unique per process and thread */
+		snprintf(buffer, bufsz, CLIENT_SOCKET, dir, getpid(),
+				rte_sys_gettid());
+
+	return 0;
+}
+
+/*
+ * Create the datagram socket on which requests are received and bind it to
+ * the server socket path. On success the descriptor is kept in
+ * pdump_socket_fd and 0 is returned; -1 is returned on failure.
+ */
+static int
+pdump_create_server_socket(void)
+{
+	int ret, socket_fd;
+	struct sockaddr_un addr;
+	socklen_t addr_len;
+
+	ret = pdump_get_socket_path(addr.sun_path, sizeof(addr.sun_path),
+				RTE_PDUMP_SOCKET_SERVER);
+	if (ret != 0) {
+		RTE_LOG(ERR, PDUMP,
+			"Failed to get server socket path: %s:%d\n",
+			__func__, __LINE__);
+		return -1;
+	}
+	addr.sun_family = AF_UNIX;
+
+	/* remove if file already exists */
+	unlink(addr.sun_path);
+
+	/* set up a server socket */
+	socket_fd = socket(AF_UNIX, SOCK_DGRAM, 0);
+	if (socket_fd < 0) {
+		RTE_LOG(ERR, PDUMP,
+			"Failed to create server socket: %s, %s:%d\n",
+			strerror(errno), __func__, __LINE__);
+		return -1;
+	}
+
+	addr_len = sizeof(struct sockaddr_un);
+	ret = bind(socket_fd, (struct sockaddr *) &addr, addr_len);
+	if (ret) {
+		RTE_LOG(ERR, PDUMP,
+			"Failed to bind to server socket: %s, %s:%d\n",
+			strerror(errno), __func__, __LINE__);
+		close(socket_fd);
+		return -1;
+	}
+
+	/* save the socket in local configuration */
+	pdump_socket_fd = socket_fd;
+
+	return 0;
+}
+
+/*
+ * Request loop: receive one request, apply it and send the result back to
+ * the address the request came from.
+ */
+static __attribute__((noreturn)) void *
+pdump_thread_main(__rte_unused void *arg)
+{
+	struct sockaddr_un cli_addr;
+	socklen_t cli_len;
+	struct pdump_request cli_req;
+	struct pdump_response resp;
+	int n;
+	int ret = 0;
+
+	/* host thread, never break out */
+	for (;;) {
+		/* recv client requests */
+		cli_len = sizeof(cli_addr);
+		n = recvfrom(pdump_socket_fd, &cli_req,
+				sizeof(struct pdump_request), 0,
+				(struct sockaddr *)&cli_addr, &cli_len);
+		if (n < 0) {
+			RTE_LOG(ERR, PDUMP,
+				"failed to recv from client:%s, %s:%d\n",
+				strerror(errno), __func__, __LINE__);
+			continue;
+		}
+
+		ret = set_pdump_rxtx_cbs(&cli_req);
+
+		/* echo version and operation, report the outcome */
+		resp.ver = cli_req.ver;
+		resp.res_op = cli_req.op;
+		resp.err_value = ret;
+		n = sendto(pdump_socket_fd, &resp,
+				
sizeof(struct pdump_response),
+				0, (struct sockaddr *)&cli_addr, cli_len);
+		if (n < 0) {
+			RTE_LOG(ERR, PDUMP,
+				"failed to send to client:%s, %s:%d\n",
+				strerror(errno), __func__, __LINE__);
+		}
+	}
+}
+
+/*
+ * Set the server socket directory, create the server socket and start the
+ * thread that serves capture requests. Returns 0 on success, -1 on failure.
+ */
+int
+rte_pdump_init(const char *path)
+{
+	int ret = 0;
+	char thread_name[RTE_MAX_THREAD_NAME_LEN];
+
+	ret = rte_pdump_set_socket_dir(path, RTE_PDUMP_SOCKET_SERVER);
+	if (ret != 0)
+		return -1;
+
+	ret = pdump_create_server_socket();
+	if (ret != 0) {
+		RTE_LOG(ERR, PDUMP,
+			"Failed to create server socket:%s:%d\n",
+			__func__, __LINE__);
+		return -1;
+	}
+
+	/* create the host thread to wait/handle pdump requests */
+	ret = pthread_create(&pdump_thread, NULL, pdump_thread_main, NULL);
+	if (ret != 0) {
+		/* pthread_create() returns the error number, errno is not set */
+		RTE_LOG(ERR, PDUMP,
+			"Failed to create the pdump thread:%s, %s:%d\n",
+			strerror(ret), __func__, __LINE__);
+		/* nobody will serve the socket: do not leak the descriptor */
+		close(pdump_socket_fd);
+		return -1;
+	}
+	/* Set thread_name for aid in debugging. */
+	snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "pdump-thread");
+	ret = rte_thread_setname(pdump_thread, thread_name);
+	if (ret != 0) {
+		/* naming is best effort, the failure is only logged */
+		RTE_LOG(DEBUG, PDUMP,
+			"Failed to set thread name for pdump handling\n");
+	}
+
+	return 0;
+}
+
+/*
+ * Stop the request thread, close the server socket and remove its socket
+ * file. Returns 0 on success, -1 as soon as one of these steps fails.
+ */
+int
+rte_pdump_uninit(void)
+{
+	int ret;
+
+	ret = pthread_cancel(pdump_thread);
+	if (ret != 0) {
+		/* pthread_cancel() returns the error number, errno is not set */
+		RTE_LOG(ERR, PDUMP,
+			"Failed to cancel the pdump thread:%s, %s:%d\n",
+			strerror(ret), __func__, __LINE__);
+		return -1;
+	}
+
+	ret = close(pdump_socket_fd);
+	if (ret != 0) {
+		RTE_LOG(ERR, PDUMP,
+			"Failed to close server socket: %s, %s:%d\n",
+			strerror(errno), __func__, __LINE__);
+		return -1;
+	}
+
+	struct sockaddr_un addr;
+
+	ret = pdump_get_socket_path(addr.sun_path, sizeof(addr.sun_path),
+				RTE_PDUMP_SOCKET_SERVER);
+	if (ret != 0) {
+		RTE_LOG(ERR, PDUMP,
+			"Failed to get server socket path: %s:%d\n",
+			__func__, __LINE__);
+		return -1;
+	}
+	ret = unlink(addr.sun_path);
+	if (ret != 0) {
+		RTE_LOG(ERR, PDUMP,
+			"Failed to remove server socket addr: %s, %s:%d\n",
+			strerror(errno), __func__, __LINE__);
+		return -1;
+	}
+
+	return 0;
+}
+ +static int +pdump_create_client_socket(struct pdump_request *p) +{ + int ret, socket_fd; + int pid; + int n; + struct pdump_response server_resp; + struct sockaddr_un addr, serv_addr, from; + socklen_t addr_len, serv_len; + + pid = getpid(); + + socket_fd = socket(AF_UNIX, SOCK_DGRAM, 0); + if (socket_fd < 0) { + RTE_LOG(ERR, PDUMP, + "client socket(): %s:pid(%d):tid(%u), %s:%d\n", + strerror(errno), pid, rte_sys_gettid(), + __func__, __LINE__); + ret = errno; + return ret; + } + + ret = pdump_get_socket_path(addr.sun_path, sizeof(addr.sun_path), + RTE_PDUMP_SOCKET_CLIENT); + if (ret != 0) { + RTE_LOG(ERR, PDUMP, + "Failed to get client socket path: %s:%d\n", + __func__, __LINE__); + return -1; + } + addr.sun_family = AF_UNIX; + addr_len = sizeof(struct sockaddr_un); + + do { + ret = bind(socket_fd, (struct sockaddr *) &addr, addr_len); + if (ret) { + RTE_LOG(ERR, PDUMP, + "client bind(): %s, %s:%d\n", + strerror(errno), __func__, __LINE__); + ret = errno; + break; + } + + serv_len = sizeof(struct sockaddr_un); + memset(&serv_addr, 0, sizeof(serv_addr)); + ret = pdump_get_socket_path(serv_addr.sun_path, + sizeof(serv_addr.sun_path), + RTE_PDUMP_SOCKET_SERVER); + if (ret != 0) { + RTE_LOG(ERR, PDUMP, + "Failed to get server socket path: %s:%d\n", + __func__, __LINE__); + break; + } + serv_addr.sun_family = AF_UNIX; + + n = sendto(socket_fd, p, sizeof(struct pdump_request), 0, + (struct sockaddr *)&serv_addr, serv_len); + if (n < 0) { + RTE_LOG(ERR, PDUMP, + "failed to send to server:%s, %s:%d\n", + strerror(errno), __func__, __LINE__); + ret = errno; + break; + } + + n = recvfrom(socket_fd, &server_resp, + sizeof(struct pdump_response), 0, + (struct sockaddr *)&from, &serv_len); + if (n < 0) { + RTE_LOG(ERR, PDUMP, + "failed to recv from server:%s, %s:%d\n", + strerror(errno), __func__, __LINE__); + ret = errno; + break; + } + ret = server_resp.err_value; + } while (0); + + close(socket_fd); + unlink(addr.sun_path); + return ret; +} + +static int 
+pdump_validate_ring_mp(struct rte_ring *ring, struct rte_mempool *mp) +{ + if (ring == NULL || mp == NULL) { + RTE_LOG(ERR, PDUMP, "NULL ring or mempool are passed %s:%d\n", + __func__, __LINE__); + rte_errno = EINVAL; + return -1; + } + if (mp->flags & MEMPOOL_F_SP_PUT || mp->flags & MEMPOOL_F_SC_GET) { + RTE_LOG(ERR, PDUMP, "mempool with either SP or SC settings" + " is not valid for pdump, should have MP and MC settings\n"); + rte_errno = EINVAL; + return -1; + } + if (ring->prod.sp_enqueue || ring->cons.sc_dequeue) { + RTE_LOG(ERR, PDUMP, "ring with either SP or SC settings" + " is not valid for pdump, should have MP and MC settings\n"); + rte_errno = EINVAL; + return -1; + } + + return 0; +} + +static int +pdump_validate_flags(uint32_t flags) +{ + if (flags != RTE_PDUMP_FLAG_RX && flags != RTE_PDUMP_FLAG_TX && + flags != RTE_PDUMP_FLAG_RXTX) { + RTE_LOG(ERR, PDUMP, + "invalid flags, should be either rx/tx/rxtx\n"); + rte_errno = EINVAL; + return -1; + } + + return 0; +} + +static int +pdump_validate_port(uint8_t port, char *name) +{ + int ret = 0; + + if (port >= RTE_MAX_ETHPORTS) { + RTE_LOG(ERR, PDUMP, "Invalid port id %u, %s:%d\n", port, + __func__, __LINE__); + rte_errno = EINVAL; + return -1; + } + + ret = rte_eth_dev_get_name_by_port(port, name); + if (ret < 0) { + RTE_LOG(ERR, PDUMP, + "port id to name mapping failed for port id=%u, %s:%d\n", + port, __func__, __LINE__); + rte_errno = EINVAL; + return -1; + } + + return 0; +} + +static int +pdump_prepare_client_request(char *device, uint16_t queue, + uint32_t flags, + uint16_t operation, + struct rte_ring *ring, + struct rte_mempool *mp, + void *filter) +{ + int ret; + struct pdump_request req = {.ver = 1,}; + + req.flags = flags; + req.op = operation; + if ((operation & ENABLE) != 0) { + snprintf(req.data.en_v1.device, sizeof(req.data.en_v1.device), + "%s", device); + req.data.en_v1.queue = queue; + req.data.en_v1.ring = ring; + req.data.en_v1.mp = mp; + req.data.en_v1.filter = filter; + } else { + 
snprintf(req.data.dis_v1.device, sizeof(req.data.dis_v1.device), + "%s", device); + req.data.dis_v1.queue = queue; + req.data.dis_v1.ring = NULL; + req.data.dis_v1.mp = NULL; + req.data.dis_v1.filter = NULL; + } + + ret = pdump_create_client_socket(&req); + if (ret < 0) { + RTE_LOG(ERR, PDUMP, + "client request for pdump enable/disable failed\n"); + rte_errno = ret; + return -1; + } + + return 0; +} + +int +rte_pdump_enable(uint8_t port, uint16_t queue, uint32_t flags, + struct rte_ring *ring, + struct rte_mempool *mp, + void *filter) +{ + + int ret = 0; + char name[DEVICE_ID_SIZE]; + + ret = pdump_validate_port(port, name); + if (ret < 0) + return ret; + ret = pdump_validate_ring_mp(ring, mp); + if (ret < 0) + return ret; + ret = pdump_validate_flags(flags); + if (ret < 0) + return ret; + + ret = pdump_prepare_client_request(name, queue, flags, + ENABLE, ring, mp, filter); + + return ret; +} + +int +rte_pdump_enable_by_deviceid(char *device_id, uint16_t queue, + uint32_t flags, + struct rte_ring *ring, + struct rte_mempool *mp, + void *filter) +{ + int ret = 0; + char domBDF[DEVICE_ID_SIZE]; + + ret = pdump_validate_ring_mp(ring, mp); + if (ret < 0) + return ret; + ret = pdump_validate_flags(flags); + if (ret < 0) + return ret; + + if (pdump_get_dombdf(device_id, domBDF, sizeof(domBDF)) > 0) + ret = pdump_prepare_client_request(domBDF, queue, flags, + ENABLE, ring, mp, filter); + else + ret = pdump_prepare_client_request(device_id, queue, flags, + ENABLE, ring, mp, filter); + + return ret; +} + +int +rte_pdump_disable(uint8_t port, uint16_t queue, uint32_t flags) +{ + int ret = 0; + char name[DEVICE_ID_SIZE]; + + ret = pdump_validate_port(port, name); + if (ret < 0) + return ret; + ret = pdump_validate_flags(flags); + if (ret < 0) + return ret; + + ret = pdump_prepare_client_request(name, queue, flags, + DISABLE, NULL, NULL, NULL); + + return ret; +} + +int +rte_pdump_disable_by_deviceid(char *device_id, uint16_t queue, + uint32_t flags) +{ + int ret = 0; + char 
domBDF[DEVICE_ID_SIZE]; + + ret = pdump_validate_flags(flags); + if (ret < 0) + return ret; + + if (pdump_get_dombdf(device_id, domBDF, sizeof(domBDF)) > 0) + ret = pdump_prepare_client_request(domBDF, queue, flags, + DISABLE, NULL, NULL, NULL); + else + ret = pdump_prepare_client_request(device_id, queue, flags, + DISABLE, NULL, NULL, NULL); + + return ret; +} + +int +rte_pdump_set_socket_dir(const char *path, enum rte_pdump_socktype type) +{ + int ret, count; + + if (path != NULL) { + if (type == RTE_PDUMP_SOCKET_SERVER) { + count = sizeof(server_socket_dir); + ret = snprintf(server_socket_dir, count, "%s", path); + } else { + count = sizeof(client_socket_dir); + ret = snprintf(client_socket_dir, count, "%s", path); + } + + if (ret < 0 || ret >= count) { + RTE_LOG(ERR, PDUMP, + "Invalid socket path:%s:%d\n", + __func__, __LINE__); + if (type == RTE_PDUMP_SOCKET_SERVER) + server_socket_dir[0] = 0; + else + client_socket_dir[0] = 0; + return -EINVAL; + } + } + + return 0; +} diff --git a/lib/librte_pdump/rte_pdump.h b/lib/librte_pdump/rte_pdump.h new file mode 100644 index 00000000..b5f4e2f3 --- /dev/null +++ b/lib/librte_pdump/rte_pdump.h @@ -0,0 +1,216 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _RTE_PDUMP_H_ +#define _RTE_PDUMP_H_ + +/** + * @file + * RTE pdump + * + * packet dump library to provide packet capturing support on dpdk. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#define RTE_PDUMP_ALL_QUEUES UINT16_MAX + +enum { + RTE_PDUMP_FLAG_RX = 1, /* receive direction */ + RTE_PDUMP_FLAG_TX = 2, /* transmit direction */ + /* both receive and transmit directions */ + RTE_PDUMP_FLAG_RXTX = (RTE_PDUMP_FLAG_RX|RTE_PDUMP_FLAG_TX) +}; + +enum rte_pdump_socktype { + RTE_PDUMP_SOCKET_SERVER = 1, + RTE_PDUMP_SOCKET_CLIENT = 2 +}; + +/** + * Initialize packet capturing handling + * + * Creates pthread and server socket for handling clients + * requests to enable/disable rxtx callbacks. + * + * @param path + * directory path for server socket. + * + * @return + * 0 on success, -1 on error + */ +int +rte_pdump_init(const char *path); + +/** + * Un initialize packet capturing handling + * + * Cancels pthread, close server socket, removes server socket address. 
+ * + * @return + * 0 on success, -1 on error + */ +int +rte_pdump_uninit(void); + +/** + * Enables packet capturing on given port and queue. + * + * @param port + * port on which packet capturing should be enabled. + * @param queue + * queue of a given port on which packet capturing should be enabled. + * users should pass on value UINT16_MAX to enable packet capturing on all + * queues of a given port. + * @param flags + * flags specifies RTE_PDUMP_FLAG_RX/RTE_PDUMP_FLAG_TX/RTE_PDUMP_FLAG_RXTX + * on which packet capturing should be enabled for a given port and queue. + * @param ring + * ring on which captured packets will be enqueued for user. + * @param mp + * mempool on to which original packets will be mirrored or duplicated. + * @param filter + * place holder for packet filtering. + * + * @return + * 0 on success, -1 on error, rte_errno is set accordingly. + */ + +int +rte_pdump_enable(uint8_t port, uint16_t queue, uint32_t flags, + struct rte_ring *ring, + struct rte_mempool *mp, + void *filter); + +/** + * Disables packet capturing on given port and queue. + * + * @param port + * port on which packet capturing should be disabled. + * @param queue + * queue of a given port on which packet capturing should be disabled. + * users should pass on value UINT16_MAX to disable packet capturing on all + * queues of a given port. + * @param flags + * flags specifies RTE_PDUMP_FLAG_RX/RTE_PDUMP_FLAG_TX/RTE_PDUMP_FLAG_RXTX + * on which packet capturing should be disabled for a given port and queue. + * + * @return + * 0 on success, -1 on error, rte_errno is set accordingly. + */ + +int +rte_pdump_disable(uint8_t port, uint16_t queue, uint32_t flags); + +/** + * Enables packet capturing on given device id and queue. + * device_id can be name or pci address of device. + * + * @param device_id + * device id on which packet capturing should be enabled. + * @param queue + * queue of a given device id on which packet capturing should be enabled. 
+ * users should pass on value UINT16_MAX to enable packet capturing on all + * queues of a given device id. + * @param flags + * flags specifies RTE_PDUMP_FLAG_RX/RTE_PDUMP_FLAG_TX/RTE_PDUMP_FLAG_RXTX + * on which packet capturing should be enabled for a given device id and queue. + * @param ring + * ring on which captured packets will be enqueued for user. + * @param mp + * mempool on to which original packets will be mirrored or duplicated. + * @param filter + * place holder for packet filtering. + * + * @return + * 0 on success, -1 on error, rte_errno is set accordingly. + */ + +int +rte_pdump_enable_by_deviceid(char *device_id, uint16_t queue, + uint32_t flags, + struct rte_ring *ring, + struct rte_mempool *mp, + void *filter); + +/** + * Disables packet capturing on given device_id and queue. + * device_id can be name or pci address of device. + * + * @param device_id + * pci address or name of the device on which packet capturing + * should be disabled. + * @param queue + * queue of a given device on which packet capturing should be disabled. + * users should pass on value UINT16_MAX to disable packet capturing on all + * queues of a given device id. + * @param flags + * flags specifies RTE_PDUMP_FLAG_RX/RTE_PDUMP_FLAG_TX/RTE_PDUMP_FLAG_RXTX + * on which packet capturing should be disabled for a given device id and queue. + * + * @return + * 0 on success, -1 on error, rte_errno is set accordingly. + */ +int +rte_pdump_disable_by_deviceid(char *device_id, uint16_t queue, + uint32_t flags); + +/** + * Allows applications to set server and client socket paths. + * If the specified path is NULL, the default path will be selected, i.e. + * "/var/run/" for root user and "$HOME" for non root user. + * Clients also need to call this API to set their server path if the + * server path is different from default path. + * This API is not thread-safe. + * + * @param path + * directory path for server or client socket. 
+ * @param type + * specifies RTE_PDUMP_SOCKET_SERVER if socket path is for server. + * (or) + * specifies RTE_PDUMP_SOCKET_CLIENT if socket path is for client. + * + * @return + * 0 on success, -EINVAL on error + * + */ +int +rte_pdump_set_socket_dir(const char *path, enum rte_pdump_socktype type); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_PDUMP_H_ */ diff --git a/lib/librte_pdump/rte_pdump_version.map b/lib/librte_pdump/rte_pdump_version.map new file mode 100644 index 00000000..edec99a4 --- /dev/null +++ b/lib/librte_pdump/rte_pdump_version.map @@ -0,0 +1,13 @@ +DPDK_16.07 { + global: + + rte_pdump_disable; + rte_pdump_disable_by_deviceid; + rte_pdump_enable; + rte_pdump_enable_by_deviceid; + rte_pdump_init; + rte_pdump_set_socket_dir; + rte_pdump_uninit; + + local: *; +}; diff --git a/lib/librte_pipeline/Makefile b/lib/librte_pipeline/Makefile index 822fd41c..05d64ff8 100644 --- a/lib/librte_pipeline/Makefile +++ b/lib/librte_pipeline/Makefile @@ -52,7 +52,10 @@ SRCS-$(CONFIG_RTE_LIBRTE_PIPELINE) := rte_pipeline.c SYMLINK-$(CONFIG_RTE_LIBRTE_PIPELINE)-include += rte_pipeline.h # this lib depends upon: -DEPDIRS-$(CONFIG_RTE_LIBRTE_PIPELINE) := lib/librte_table +DEPDIRS-$(CONFIG_RTE_LIBRTE_PIPELINE) += lib/librte_eal +DEPDIRS-$(CONFIG_RTE_LIBRTE_PIPELINE) += lib/librte_mbuf +DEPDIRS-$(CONFIG_RTE_LIBRTE_PIPELINE) += lib/librte_mempool +DEPDIRS-$(CONFIG_RTE_LIBRTE_PIPELINE) += lib/librte_table DEPDIRS-$(CONFIG_RTE_LIBRTE_PIPELINE) += lib/librte_port include $(RTE_SDK)/mk/rte.lib.mk diff --git a/lib/librte_port/Makefile b/lib/librte_port/Makefile index 2c0ccbe5..3d84a0e4 100644 --- a/lib/librte_port/Makefile +++ b/lib/librte_port/Makefile @@ -44,7 +44,7 @@ CFLAGS += $(WERROR_FLAGS) EXPORT_MAP := rte_port_version.map -LIBABIVER := 2 +LIBABIVER := 3 # # all source are stored in SRCS-y @@ -56,6 +56,9 @@ SRCS-$(CONFIG_RTE_LIBRTE_PORT) += rte_port_frag.c SRCS-$(CONFIG_RTE_LIBRTE_PORT) += rte_port_ras.c endif SRCS-$(CONFIG_RTE_LIBRTE_PORT) += rte_port_sched.c +ifeq 
($(CONFIG_RTE_LIBRTE_KNI),y) +SRCS-$(CONFIG_RTE_LIBRTE_PORT) += rte_port_kni.c +endif SRCS-$(CONFIG_RTE_LIBRTE_PORT) += rte_port_source_sink.c # install includes @@ -67,6 +70,9 @@ SYMLINK-$(CONFIG_RTE_LIBRTE_PORT)-include += rte_port_frag.h SYMLINK-$(CONFIG_RTE_LIBRTE_PORT)-include += rte_port_ras.h endif SYMLINK-$(CONFIG_RTE_LIBRTE_PORT)-include += rte_port_sched.h +ifeq ($(CONFIG_RTE_LIBRTE_KNI),y) +SYMLINK-$(CONFIG_RTE_LIBRTE_PORT)-include += rte_port_kni.h +endif SYMLINK-$(CONFIG_RTE_LIBRTE_PORT)-include += rte_port_source_sink.h # this lib depends upon: @@ -75,5 +81,9 @@ DEPDIRS-$(CONFIG_RTE_LIBRTE_PORT) += lib/librte_mbuf DEPDIRS-$(CONFIG_RTE_LIBRTE_PORT) += lib/librte_mempool DEPDIRS-$(CONFIG_RTE_LIBRTE_PORT) += lib/librte_ether DEPDIRS-$(CONFIG_RTE_LIBRTE_PORT) += lib/librte_ip_frag +DEPDIRS-$(CONFIG_RTE_LIBRTE_PORT) += lib/librte_sched +ifeq ($(CONFIG_RTE_LIBRTE_KNI),y) +DEPDIRS-$(CONFIG_RTE_LIBRTE_PORT) += lib/librte_kni +endif include $(RTE_SDK)/mk/rte.lib.mk diff --git a/lib/librte_port/rte_port_kni.c b/lib/librte_port/rte_port_kni.c new file mode 100644 index 00000000..08f4ac2a --- /dev/null +++ b/lib/librte_port/rte_port_kni.c @@ -0,0 +1,545 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2016 Ethan Zhuang <zhuangwj@gmail.com>. + * Copyright(c) 2016 Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include <string.h> + +#include <rte_common.h> +#include <rte_malloc.h> +#include <rte_kni.h> + +#include "rte_port_kni.h" + +/* + * Port KNI Reader + */ +#ifdef RTE_PORT_STATS_COLLECT + +#define RTE_PORT_KNI_READER_STATS_PKTS_IN_ADD(port, val) \ + port->stats.n_pkts_in += val +#define RTE_PORT_KNI_READER_STATS_PKTS_DROP_ADD(port, val) \ + port->stats.n_pkts_drop += val + +#else + +#define RTE_PORT_KNI_READER_STATS_PKTS_IN_ADD(port, val) +#define RTE_PORT_KNI_READER_STATS_PKTS_DROP_ADD(port, val) + +#endif + +struct rte_port_kni_reader { + struct rte_port_in_stats stats; + + struct rte_kni *kni; +}; + +static void * +rte_port_kni_reader_create(void *params, int socket_id) +{ + struct rte_port_kni_reader_params *conf = + (struct rte_port_kni_reader_params *) params; + struct rte_port_kni_reader *port; + + /* Check input parameters */ + if (conf == NULL) { + RTE_LOG(ERR, PORT, "%s: params is NULL\n", __func__); + return NULL; + } + + /* Memory allocation */ + port = rte_zmalloc_socket("PORT", sizeof(*port), + RTE_CACHE_LINE_SIZE, socket_id); + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Failed to allocate port\n", __func__); + return NULL; + } + + /* Initialization */ + port->kni = conf->kni; + + return port; +} + +static int +rte_port_kni_reader_rx(void *port, struct rte_mbuf **pkts, uint32_t n_pkts) +{ + struct rte_port_kni_reader *p = + (struct rte_port_kni_reader *) port; + uint16_t rx_pkt_cnt; + + rx_pkt_cnt = rte_kni_rx_burst(p->kni, pkts, n_pkts); + RTE_PORT_KNI_READER_STATS_PKTS_IN_ADD(p, rx_pkt_cnt); + return rx_pkt_cnt; +} + +static int +rte_port_kni_reader_free(void *port) +{ + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: port is NULL\n", __func__); + return -EINVAL; + } + + rte_free(port); + + return 0; +} + +static int rte_port_kni_reader_stats_read(void *port, + struct rte_port_in_stats *stats, int clear) +{ + struct rte_port_kni_reader *p = + (struct rte_port_kni_reader *) port; + + if (stats != NULL) + memcpy(stats, &p->stats, 
sizeof(p->stats)); + + if (clear) + memset(&p->stats, 0, sizeof(p->stats)); + + return 0; +} + +/* + * Port KNI Writer + */ +#ifdef RTE_PORT_STATS_COLLECT + +#define RTE_PORT_KNI_WRITER_STATS_PKTS_IN_ADD(port, val) \ + port->stats.n_pkts_in += val +#define RTE_PORT_KNI_WRITER_STATS_PKTS_DROP_ADD(port, val) \ + port->stats.n_pkts_drop += val + +#else + +#define RTE_PORT_KNI_WRITER_STATS_PKTS_IN_ADD(port, val) +#define RTE_PORT_KNI_WRITER_STATS_PKTS_DROP_ADD(port, val) + +#endif + +struct rte_port_kni_writer { + struct rte_port_out_stats stats; + + struct rte_mbuf *tx_buf[2 * RTE_PORT_IN_BURST_SIZE_MAX]; + uint32_t tx_burst_sz; + uint32_t tx_buf_count; + uint64_t bsz_mask; + struct rte_kni *kni; +}; + +static void * +rte_port_kni_writer_create(void *params, int socket_id) +{ + struct rte_port_kni_writer_params *conf = + (struct rte_port_kni_writer_params *) params; + struct rte_port_kni_writer *port; + + /* Check input parameters */ + if ((conf == NULL) || + (conf->tx_burst_sz == 0) || + (conf->tx_burst_sz > RTE_PORT_IN_BURST_SIZE_MAX) || + (!rte_is_power_of_2(conf->tx_burst_sz))) { + RTE_LOG(ERR, PORT, "%s: Invalid input parameters\n", __func__); + return NULL; + } + + /* Memory allocation */ + port = rte_zmalloc_socket("PORT", sizeof(*port), + RTE_CACHE_LINE_SIZE, socket_id); + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Failed to allocate port\n", __func__); + return NULL; + } + + /* Initialization */ + port->kni = conf->kni; + port->tx_burst_sz = conf->tx_burst_sz; + port->tx_buf_count = 0; + port->bsz_mask = 1LLU << (conf->tx_burst_sz - 1); + + return port; +} + +static inline void +send_burst(struct rte_port_kni_writer *p) +{ + uint32_t nb_tx; + + nb_tx = rte_kni_tx_burst(p->kni, p->tx_buf, p->tx_buf_count); + + RTE_PORT_KNI_WRITER_STATS_PKTS_DROP_ADD(p, p->tx_buf_count - nb_tx); + for (; nb_tx < p->tx_buf_count; nb_tx++) + rte_pktmbuf_free(p->tx_buf[nb_tx]); + + p->tx_buf_count = 0; +} + +static int +rte_port_kni_writer_tx(void *port, struct rte_mbuf *pkt) 
+{ + struct rte_port_kni_writer *p = + (struct rte_port_kni_writer *) port; + + p->tx_buf[p->tx_buf_count++] = pkt; + RTE_PORT_KNI_WRITER_STATS_PKTS_IN_ADD(p, 1); + if (p->tx_buf_count >= p->tx_burst_sz) + send_burst(p); + + return 0; +} + +static int +rte_port_kni_writer_tx_bulk(void *port, + struct rte_mbuf **pkts, + uint64_t pkts_mask) +{ + struct rte_port_kni_writer *p = + (struct rte_port_kni_writer *) port; + uint64_t bsz_mask = p->bsz_mask; + uint32_t tx_buf_count = p->tx_buf_count; + uint64_t expr = (pkts_mask & (pkts_mask + 1)) | + ((pkts_mask & bsz_mask) ^ bsz_mask); + + if (expr == 0) { + uint64_t n_pkts = __builtin_popcountll(pkts_mask); + uint32_t n_pkts_ok; + + if (tx_buf_count) + send_burst(p); + + RTE_PORT_KNI_WRITER_STATS_PKTS_IN_ADD(p, n_pkts); + n_pkts_ok = rte_kni_tx_burst(p->kni, pkts, n_pkts); + + RTE_PORT_KNI_WRITER_STATS_PKTS_DROP_ADD(p, n_pkts - n_pkts_ok); + for (; n_pkts_ok < n_pkts; n_pkts_ok++) { + struct rte_mbuf *pkt = pkts[n_pkts_ok]; + + rte_pktmbuf_free(pkt); + } + } else { + for (; pkts_mask;) { + uint32_t pkt_index = __builtin_ctzll(pkts_mask); + uint64_t pkt_mask = 1LLU << pkt_index; + struct rte_mbuf *pkt = pkts[pkt_index]; + + p->tx_buf[tx_buf_count++] = pkt; + RTE_PORT_KNI_WRITER_STATS_PKTS_IN_ADD(p, 1); + pkts_mask &= ~pkt_mask; + } + + p->tx_buf_count = tx_buf_count; + if (tx_buf_count >= p->tx_burst_sz) + send_burst(p); + } + + return 0; +} + +static int +rte_port_kni_writer_flush(void *port) +{ + struct rte_port_kni_writer *p = + (struct rte_port_kni_writer *) port; + + if (p->tx_buf_count > 0) + send_burst(p); + + return 0; +} + +static int +rte_port_kni_writer_free(void *port) +{ + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Port is NULL\n", __func__); + return -EINVAL; + } + + rte_port_kni_writer_flush(port); + rte_free(port); + + return 0; +} + +static int rte_port_kni_writer_stats_read(void *port, + struct rte_port_out_stats *stats, int clear) +{ + struct rte_port_kni_writer *p = + (struct rte_port_kni_writer *) 
port; + + if (stats != NULL) + memcpy(stats, &p->stats, sizeof(p->stats)); + + if (clear) + memset(&p->stats, 0, sizeof(p->stats)); + + return 0; +} + +/* + * Port KNI Writer Nodrop + */ +#ifdef RTE_PORT_STATS_COLLECT + +#define RTE_PORT_KNI_WRITER_NODROP_STATS_PKTS_IN_ADD(port, val) \ + port->stats.n_pkts_in += val +#define RTE_PORT_KNI_WRITER_NODROP_STATS_PKTS_DROP_ADD(port, val) \ + port->stats.n_pkts_drop += val + +#else + +#define RTE_PORT_KNI_WRITER_NODROP_STATS_PKTS_IN_ADD(port, val) +#define RTE_PORT_KNI_WRITER_NODROP_STATS_PKTS_DROP_ADD(port, val) + +#endif + +struct rte_port_kni_writer_nodrop { + struct rte_port_out_stats stats; + + struct rte_mbuf *tx_buf[2 * RTE_PORT_IN_BURST_SIZE_MAX]; + uint32_t tx_burst_sz; + uint32_t tx_buf_count; + uint64_t bsz_mask; + uint64_t n_retries; + struct rte_kni *kni; +}; + +static void * +rte_port_kni_writer_nodrop_create(void *params, int socket_id) +{ + struct rte_port_kni_writer_nodrop_params *conf = + (struct rte_port_kni_writer_nodrop_params *) params; + struct rte_port_kni_writer_nodrop *port; + + /* Check input parameters */ + if ((conf == NULL) || + (conf->tx_burst_sz == 0) || + (conf->tx_burst_sz > RTE_PORT_IN_BURST_SIZE_MAX) || + (!rte_is_power_of_2(conf->tx_burst_sz))) { + RTE_LOG(ERR, PORT, "%s: Invalid input parameters\n", __func__); + return NULL; + } + + /* Memory allocation */ + port = rte_zmalloc_socket("PORT", sizeof(*port), + RTE_CACHE_LINE_SIZE, socket_id); + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Failed to allocate port\n", __func__); + return NULL; + } + + /* Initialization */ + port->kni = conf->kni; + port->tx_burst_sz = conf->tx_burst_sz; + port->tx_buf_count = 0; + port->bsz_mask = 1LLU << (conf->tx_burst_sz - 1); + + /* + * When n_retries is 0 it means that we should wait for every packet to + * send no matter how many retries should it take. To limit number of + * branches in fast path, we use UINT64_MAX instead of branching. + */ + port->n_retries = (conf->n_retries == 0) ? 
UINT64_MAX : conf->n_retries; + + return port; +} + +/* + * Transmit the buffered packets to the KNI interface, retrying up to + * p->n_retries further times; whatever is still unsent afterwards is counted + * as dropped and freed. The buffer is empty on return. + */ +static inline void +send_burst_nodrop(struct rte_port_kni_writer_nodrop *p) +{ + uint32_t nb_tx = 0, i; + + nb_tx = rte_kni_tx_burst(p->kni, p->tx_buf, p->tx_buf_count); + + /* We sent all the packets in a first try */ + if (nb_tx >= p->tx_buf_count) { + p->tx_buf_count = 0; + return; + } + + for (i = 0; i < p->n_retries; i++) { + nb_tx += rte_kni_tx_burst(p->kni, + p->tx_buf + nb_tx, + p->tx_buf_count - nb_tx); + + /* We sent all the packets in more than one try */ + if (nb_tx >= p->tx_buf_count) { + p->tx_buf_count = 0; + return; + } + } + + /* We didn't send the packets in maximum allowed attempts */ + RTE_PORT_KNI_WRITER_NODROP_STATS_PKTS_DROP_ADD(p, p->tx_buf_count - nb_tx); + for ( ; nb_tx < p->tx_buf_count; nb_tx++) + rte_pktmbuf_free(p->tx_buf[nb_tx]); + + p->tx_buf_count = 0; +} + +/* + * Queue a single packet on the nodrop writer and send the buffer once it + * holds at least tx_burst_sz packets. Always returns 0. + */ +static int +rte_port_kni_writer_nodrop_tx(void *port, struct rte_mbuf *pkt) +{ + struct rte_port_kni_writer_nodrop *p = + (struct rte_port_kni_writer_nodrop *) port; + + p->tx_buf[p->tx_buf_count++] = pkt; + /* use the nodrop port's own stats macro, like the rest of this port type */ + RTE_PORT_KNI_WRITER_NODROP_STATS_PKTS_IN_ADD(p, 1); + if (p->tx_buf_count >= p->tx_burst_sz) + send_burst_nodrop(p); + + return 0; +} + +static int +rte_port_kni_writer_nodrop_tx_bulk(void *port, + struct rte_mbuf **pkts, + uint64_t pkts_mask) +{ + struct rte_port_kni_writer_nodrop *p = + (struct rte_port_kni_writer_nodrop *) port; + + uint64_t bsz_mask = p->bsz_mask; + uint32_t tx_buf_count = p->tx_buf_count; + uint64_t expr = (pkts_mask & (pkts_mask + 1)) | + ((pkts_mask & bsz_mask) ^ bsz_mask); + + if (expr == 0) { + uint64_t n_pkts = __builtin_popcountll(pkts_mask); + uint32_t n_pkts_ok; + + if (tx_buf_count) + send_burst_nodrop(p); + + RTE_PORT_KNI_WRITER_NODROP_STATS_PKTS_IN_ADD(p, n_pkts); + n_pkts_ok = rte_kni_tx_burst(p->kni, pkts, n_pkts); + + if (n_pkts_ok >= n_pkts) + return 0; + + /* + * If we didn't manage to send all packets in single burst, move + * remaining packets to the buffer and call send burst. 
+ */ + for (; n_pkts_ok < n_pkts; n_pkts_ok++) { + struct rte_mbuf *pkt = pkts[n_pkts_ok]; + p->tx_buf[p->tx_buf_count++] = pkt; + } + send_burst_nodrop(p); + } else { + for ( ; pkts_mask; ) { + uint32_t pkt_index = __builtin_ctzll(pkts_mask); + uint64_t pkt_mask = 1LLU << pkt_index; + struct rte_mbuf *pkt = pkts[pkt_index]; + + p->tx_buf[tx_buf_count++] = pkt; + RTE_PORT_KNI_WRITER_NODROP_STATS_PKTS_IN_ADD(p, 1); + pkts_mask &= ~pkt_mask; + } + + p->tx_buf_count = tx_buf_count; + if (tx_buf_count >= p->tx_burst_sz) + send_burst_nodrop(p); + } + + return 0; +} + +static int +rte_port_kni_writer_nodrop_flush(void *port) +{ + struct rte_port_kni_writer_nodrop *p = + (struct rte_port_kni_writer_nodrop *) port; + + if (p->tx_buf_count > 0) + send_burst_nodrop(p); + + return 0; +} + +static int +rte_port_kni_writer_nodrop_free(void *port) +{ + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Port is NULL\n", __func__); + return -EINVAL; + } + + rte_port_kni_writer_nodrop_flush(port); + rte_free(port); + + return 0; +} + +static int rte_port_kni_writer_nodrop_stats_read(void *port, + struct rte_port_out_stats *stats, int clear) +{ + struct rte_port_kni_writer_nodrop *p = + (struct rte_port_kni_writer_nodrop *) port; + + if (stats != NULL) + memcpy(stats, &p->stats, sizeof(p->stats)); + + if (clear) + memset(&p->stats, 0, sizeof(p->stats)); + + return 0; +} + + +/* + * Summary of port operations + */ +struct rte_port_in_ops rte_port_kni_reader_ops = { + .f_create = rte_port_kni_reader_create, + .f_free = rte_port_kni_reader_free, + .f_rx = rte_port_kni_reader_rx, + .f_stats = rte_port_kni_reader_stats_read, +}; + +struct rte_port_out_ops rte_port_kni_writer_ops = { + .f_create = rte_port_kni_writer_create, + .f_free = rte_port_kni_writer_free, + .f_tx = rte_port_kni_writer_tx, + .f_tx_bulk = rte_port_kni_writer_tx_bulk, + .f_flush = rte_port_kni_writer_flush, + .f_stats = rte_port_kni_writer_stats_read, +}; + +struct rte_port_out_ops rte_port_kni_writer_nodrop_ops = { + 
.f_create = rte_port_kni_writer_nodrop_create, + .f_free = rte_port_kni_writer_nodrop_free, + .f_tx = rte_port_kni_writer_nodrop_tx, + .f_tx_bulk = rte_port_kni_writer_nodrop_tx_bulk, + .f_flush = rte_port_kni_writer_nodrop_flush, + .f_stats = rte_port_kni_writer_nodrop_stats_read, +}; diff --git a/lib/librte_vhost/virtio-net.h b/lib/librte_port/rte_port_kni.h index 75fb57e5..4b60689c 100644 --- a/lib/librte_vhost/virtio-net.h +++ b/lib/librte_port/rte_port_kni.h @@ -1,7 +1,8 @@ /*- * BSD LICENSE * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * Copyright(c) 2016 Ethan Zhuang <zhuangwj@gmail.com>. + * Copyright(c) 2016 Intel Corporation. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -31,13 +32,64 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef _VIRTIO_NET_H -#define _VIRTIO_NET_H +#ifndef __INCLUDE_RTE_PORT_KNI_H__ +#define __INCLUDE_RTE_PORT_KNI_H__ -#include "vhost-net.h" -#include "rte_virtio_net.h" +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file + * RTE Port KNI Interface + * + * kni_reader: input port built on top of pre-initialized KNI interface + * kni_writer: output port built on top of pre-initialized KNI interface + * + ***/ + +#include <stdint.h> + +#include <rte_kni.h> + +#include "rte_port.h" + +/** kni_reader port parameters */ +struct rte_port_kni_reader_params { + /** KNI interface reference */ + struct rte_kni *kni; +}; -struct virtio_net_device_ops const *notify_ops; -struct virtio_net *get_device(struct vhost_device_ctx ctx); +/** kni_reader port operations */ +extern struct rte_port_in_ops rte_port_kni_reader_ops; + + +/** kni_writer port parameters */ +struct rte_port_kni_writer_params { + /** KNI interface reference */ + struct rte_kni *kni; + /** Burst size to KNI interface. 
*/ + uint32_t tx_burst_sz; +}; + +/** kni_writer port operations */ +extern struct rte_port_out_ops rte_port_kni_writer_ops; + +/** kni_writer_nodrop port parameters */ +struct rte_port_kni_writer_nodrop_params { + /** KNI interface reference */ + struct rte_kni *kni; + /** Burst size to KNI interface. */ + uint32_t tx_burst_sz; + /** Maximum number of retries, 0 for no limit */ + uint32_t n_retries; +}; + +/** kni_writer_nodrop port operations */ +extern struct rte_port_out_ops rte_port_kni_writer_nodrop_ops; + +#ifdef __cplusplus +} +#endif #endif diff --git a/lib/librte_port/rte_port_source_sink.c b/lib/librte_port/rte_port_source_sink.c index 056c9756..4cad7109 100644 --- a/lib/librte_port/rte_port_source_sink.c +++ b/lib/librte_port/rte_port_source_sink.c @@ -38,17 +38,11 @@ #include <rte_malloc.h> #include <rte_memcpy.h> -#ifdef RTE_NEXT_ABI - #ifdef RTE_PORT_PCAP #include <rte_ether.h> #include <pcap.h> #endif -#else -#undef RTE_PORT_PCAP -#endif - #include "rte_port_source_sink.h" /* @@ -81,8 +75,6 @@ struct rte_port_source { uint32_t pkt_index; }; -#ifdef RTE_NEXT_ABI - #ifdef RTE_PORT_PCAP static int @@ -232,8 +224,6 @@ error_exit: #endif /* RTE_PORT_PCAP */ -#endif /* RTE_NEXT_ABI */ - static void * rte_port_source_create(void *params, int socket_id) { @@ -258,8 +248,6 @@ rte_port_source_create(void *params, int socket_id) /* Initialization */ port->mempool = (struct rte_mempool *) p->mempool; -#ifdef RTE_NEXT_ABI - if (p->file_name) { int status = PCAP_SOURCE_LOAD(port, p->file_name, p->n_bytes_per_pkt, socket_id); @@ -270,8 +258,6 @@ rte_port_source_create(void *params, int socket_id) } } -#endif - return port; } diff --git a/lib/librte_port/rte_port_source_sink.h b/lib/librte_port/rte_port_source_sink.h index 917abe4f..4db8a8a8 100644 --- a/lib/librte_port/rte_port_source_sink.h +++ b/lib/librte_port/rte_port_source_sink.h @@ -53,7 +53,6 @@ extern "C" { struct rte_port_source_params { /** Pre-initialized buffer pool */ struct rte_mempool *mempool; 
-#ifdef RTE_NEXT_ABI /** The full path of the pcap file to read packets from */ char *file_name; @@ -62,8 +61,6 @@ struct rte_port_source_params { * if it is bigger than packet size, the generated packets * will contain the whole packet */ uint32_t n_bytes_per_pkt; - -#endif }; /** source port operations */ diff --git a/lib/librte_port/rte_port_version.map b/lib/librte_port/rte_port_version.map index 7a0b34d0..048c20d7 100644 --- a/lib/librte_port/rte_port_version.map +++ b/lib/librte_port/rte_port_version.map @@ -5,10 +5,8 @@ DPDK_2.0 { rte_port_ethdev_writer_ops; rte_port_ring_reader_ipv4_frag_ops; rte_port_ring_reader_ops; - rte_port_ring_reader_ops; rte_port_ring_writer_ipv4_ras_ops; rte_port_ring_writer_ops; - rte_port_ring_writer_ops; rte_port_sched_reader_ops; rte_port_sched_writer_ops; rte_port_sink_ops; @@ -35,3 +33,12 @@ DPDK_2.2 { rte_port_ring_multi_writer_nodrop_ops; } DPDK_2.1; + +DPDK_16.07 { + global: + + rte_port_kni_reader_ops; + rte_port_kni_writer_ops; + rte_port_kni_writer_nodrop_ops; + +} DPDK_2.2; diff --git a/lib/librte_power/guest_channel.c b/lib/librte_power/guest_channel.c index d6b6d0aa..85c92fab 100644 --- a/lib/librte_power/guest_channel.c +++ b/lib/librte_power/guest_channel.c @@ -103,8 +103,10 @@ guest_channel_host_connect(const char *path, unsigned lcore_id) global_fds[lcore_id] = fd; ret = guest_channel_send_msg(&pkt, lcore_id); if (ret != 0) { - RTE_LOG(ERR, GUEST_CHANNEL, "Error on channel '%s' communications " - "test: %s\n", fd_path, strerror(ret)); + RTE_LOG(ERR, GUEST_CHANNEL, + "Error on channel '%s' communications test: %s\n", + fd_path, ret > 0 ? 
strerror(ret) : + "channel not connected"); goto error; } RTE_LOG(INFO, GUEST_CHANNEL, "Channel '%s' is now connected\n", fd_path); diff --git a/lib/librte_power/rte_power_kvm_vm.c b/lib/librte_power/rte_power_kvm_vm.c index 7bb2774c..a1badf34 100644 --- a/lib/librte_power/rte_power_kvm_vm.c +++ b/lib/librte_power/rte_power_kvm_vm.c @@ -106,7 +106,8 @@ send_msg(unsigned lcore_id, uint32_t scale_direction) ret = guest_channel_send_msg(&pkt[lcore_id], lcore_id); if (ret == 0) return 1; - RTE_LOG(DEBUG, POWER, "Error sending message: %s\n", strerror(ret)); + RTE_LOG(DEBUG, POWER, "Error sending message: %s\n", + ret > 0 ? strerror(ret) : "channel not connected"); return -1; } diff --git a/lib/librte_reorder/Makefile b/lib/librte_reorder/Makefile index 0c01de16..0d111aad 100644 --- a/lib/librte_reorder/Makefile +++ b/lib/librte_reorder/Makefile @@ -49,6 +49,7 @@ SYMLINK-$(CONFIG_RTE_LIBRTE_REORDER)-include := rte_reorder.h # this lib depends upon: DEPDIRS-$(CONFIG_RTE_LIBRTE_REORDER) += lib/librte_mbuf +DEPDIRS-$(CONFIG_RTE_LIBRTE_REORDER) += lib/librte_mempool DEPDIRS-$(CONFIG_RTE_LIBRTE_REORDER) += lib/librte_eal include $(RTE_SDK)/mk/rte.lib.mk diff --git a/lib/librte_ring/rte_ring.c b/lib/librte_ring/rte_ring.c index d80faf3b..ca0a1082 100644 --- a/lib/librte_ring/rte_ring.c +++ b/lib/librte_ring/rte_ring.c @@ -122,6 +122,8 @@ int rte_ring_init(struct rte_ring *r, const char *name, unsigned count, unsigned flags) { + int ret; + /* compilation-time checks */ RTE_BUILD_BUG_ON((sizeof(struct rte_ring) & RTE_CACHE_LINE_MASK) != 0); @@ -140,7 +142,9 @@ rte_ring_init(struct rte_ring *r, const char *name, unsigned count, /* init the ring structure */ memset(r, 0, sizeof(*r)); - snprintf(r->name, sizeof(r->name), "%s", name); + ret = snprintf(r->name, sizeof(r->name), "%s", name); + if (ret < 0 || ret >= (int)sizeof(r->name)) + return -ENAMETOOLONG; r->flags = flags; r->prod.watermark = count; r->prod.sp_enqueue = !!(flags & RING_F_SP_ENQ); @@ -165,6 +169,7 @@ 
rte_ring_create(const char *name, unsigned count, int socket_id, ssize_t ring_size; int mz_flags = 0; struct rte_ring_list* ring_list = NULL; + int ret; ring_list = RTE_TAILQ_CAST(rte_ring_tailq.head, rte_ring_list); @@ -174,6 +179,13 @@ rte_ring_create(const char *name, unsigned count, int socket_id, return NULL; } + ret = snprintf(mz_name, sizeof(mz_name), "%s%s", + RTE_RING_MZ_PREFIX, name); + if (ret < 0 || ret >= (int)sizeof(mz_name)) { + rte_errno = ENAMETOOLONG; + return NULL; + } + te = rte_zmalloc("RING_TAILQ_ENTRY", sizeof(*te), 0); if (te == NULL) { RTE_LOG(ERR, RING, "Cannot reserve memory for tailq\n"); @@ -181,8 +193,6 @@ rte_ring_create(const char *name, unsigned count, int socket_id, return NULL; } - snprintf(mz_name, sizeof(mz_name), "%s%s", RTE_RING_MZ_PREFIX, name); - rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); /* reserve a memory zone for this ring. If we can't get rte_config or diff --git a/lib/librte_sched/Makefile b/lib/librte_sched/Makefile index a782cd1a..44cb780f 100644 --- a/lib/librte_sched/Makefile +++ b/lib/librte_sched/Makefile @@ -59,6 +59,7 @@ SYMLINK-$(CONFIG_RTE_LIBRTE_SCHED)-include := rte_sched.h rte_bitmap.h rte_sched SYMLINK-$(CONFIG_RTE_LIBRTE_SCHED)-include += rte_reciprocal.h # this lib depends upon: +DEPDIRS-$(CONFIG_RTE_LIBRTE_SCHED) += lib/librte_eal DEPDIRS-$(CONFIG_RTE_LIBRTE_SCHED) += lib/librte_mempool lib/librte_mbuf DEPDIRS-$(CONFIG_RTE_LIBRTE_SCHED) += lib/librte_net lib/librte_timer diff --git a/lib/librte_sched/rte_red.c b/lib/librte_sched/rte_red.c index fdf40576..ade57d1f 100644 --- a/lib/librte_sched/rte_red.c +++ b/lib/librte_sched/rte_red.c @@ -77,7 +77,7 @@ __rte_red_init_tables(void) scale = 1024.0; - RTE_RED_ASSERT(RTE_RED_WQ_LOG2_NUM == RTE_DIM(rte_red_log2_1_minus_Wq)); + RTE_ASSERT(RTE_RED_WQ_LOG2_NUM == RTE_DIM(rte_red_log2_1_minus_Wq)); for (i = RTE_RED_WQ_LOG2_MIN; i <= RTE_RED_WQ_LOG2_MAX; i++) { double n = (double)i; diff --git a/lib/librte_sched/rte_red.h b/lib/librte_sched/rte_red.h index 
7f9ac901..ca122275 100644 --- a/lib/librte_sched/rte_red.h +++ b/lib/librte_sched/rte_red.h @@ -63,19 +63,6 @@ extern "C" { #define RTE_RED_INT16_NBITS (sizeof(uint16_t) * CHAR_BIT) #define RTE_RED_WQ_LOG2_NUM (RTE_RED_WQ_LOG2_MAX - RTE_RED_WQ_LOG2_MIN + 1) -#ifdef RTE_RED_DEBUG - -#define RTE_RED_ASSERT(exp) \ -if (!(exp)) { \ - rte_panic("line%d\tassert \"" #exp "\" failed\n", __LINE__); \ -} - -#else - -#define RTE_RED_ASSERT(exp) do { } while(0) - -#endif /* RTE_RED_DEBUG */ - /** * Externs * @@ -246,8 +233,8 @@ rte_red_enqueue_empty(const struct rte_red_config *red_cfg, { uint64_t time_diff = 0, m = 0; - RTE_RED_ASSERT(red_cfg != NULL); - RTE_RED_ASSERT(red != NULL); + RTE_ASSERT(red_cfg != NULL); + RTE_ASSERT(red != NULL); red->count ++; @@ -361,8 +348,8 @@ rte_red_enqueue_nonempty(const struct rte_red_config *red_cfg, struct rte_red *red, const unsigned q) { - RTE_RED_ASSERT(red_cfg != NULL); - RTE_RED_ASSERT(red != NULL); + RTE_ASSERT(red_cfg != NULL); + RTE_ASSERT(red != NULL); /** * EWMA filter (Sally Floyd and Van Jacobson): @@ -424,8 +411,8 @@ rte_red_enqueue(const struct rte_red_config *red_cfg, const unsigned q, const uint64_t time) { - RTE_RED_ASSERT(red_cfg != NULL); - RTE_RED_ASSERT(red != NULL); + RTE_ASSERT(red_cfg != NULL); + RTE_ASSERT(red != NULL); if (q != 0) { return rte_red_enqueue_nonempty(red_cfg, red, q); diff --git a/lib/librte_sched/rte_sched.c b/lib/librte_sched/rte_sched.c index 1609ea87..86964234 100644 --- a/lib/librte_sched/rte_sched.c +++ b/lib/librte_sched/rte_sched.c @@ -1084,10 +1084,17 @@ rte_sched_port_update_subport_stats(struct rte_sched_port *port, uint32_t qindex s->stats.n_bytes_tc[tc_index] += pkt_len; } +#ifdef RTE_SCHED_RED static inline void rte_sched_port_update_subport_stats_on_drop(struct rte_sched_port *port, - uint32_t qindex, - struct rte_mbuf *pkt, uint32_t red) + uint32_t qindex, + struct rte_mbuf *pkt, uint32_t red) +#else +static inline void +rte_sched_port_update_subport_stats_on_drop(struct 
rte_sched_port *port, + uint32_t qindex, + struct rte_mbuf *pkt, __rte_unused uint32_t red) +#endif { struct rte_sched_subport *s = port->subport + (qindex / rte_sched_port_queues_per_subport(port)); uint32_t tc_index = (qindex >> 2) & 0x3; @@ -1110,10 +1117,17 @@ rte_sched_port_update_queue_stats(struct rte_sched_port *port, uint32_t qindex, qe->stats.n_bytes += pkt_len; } +#ifdef RTE_SCHED_RED static inline void rte_sched_port_update_queue_stats_on_drop(struct rte_sched_port *port, - uint32_t qindex, - struct rte_mbuf *pkt, uint32_t red) + uint32_t qindex, + struct rte_mbuf *pkt, uint32_t red) +#else +static inline void +rte_sched_port_update_queue_stats_on_drop(struct rte_sched_port *port, + uint32_t qindex, + struct rte_mbuf *pkt, __rte_unused uint32_t red) +#endif { struct rte_sched_queue_extra *qe = port->queue_extra + qindex; uint32_t pkt_len = pkt->pkt_len; diff --git a/lib/librte_table/rte_table_acl.c b/lib/librte_table/rte_table_acl.c index c1eb8488..8f1f8ceb 100644 --- a/lib/librte_table/rte_table_acl.c +++ b/lib/librte_table/rte_table_acl.c @@ -236,8 +236,6 @@ rte_table_acl_build(struct rte_table_acl *acl, struct rte_acl_ctx **acl_ctx) return -1; } - rte_acl_dump(ctx); - *acl_ctx = ctx; return 0; } diff --git a/lib/librte_table/rte_table_lpm.c b/lib/librte_table/rte_table_lpm.c index cdeb0f5a..598b79f5 100644 --- a/lib/librte_table/rte_table_lpm.c +++ b/lib/librte_table/rte_table_lpm.c @@ -44,7 +44,9 @@ #include "rte_table_lpm.h" -#define RTE_TABLE_LPM_MAX_NEXT_HOPS 256 +#ifndef RTE_TABLE_LPM_MAX_NEXT_HOPS +#define RTE_TABLE_LPM_MAX_NEXT_HOPS 65536 +#endif #ifdef RTE_TABLE_STATS_COLLECT @@ -74,7 +76,7 @@ struct rte_table_lpm { /* Next Hop Table (NHT) */ uint32_t nht_users[RTE_TABLE_LPM_MAX_NEXT_HOPS]; - uint32_t nht[0] __rte_cache_aligned; + uint8_t nht[0] __rte_cache_aligned; }; static void * @@ -188,7 +190,7 @@ nht_find_existing(struct rte_table_lpm *lpm, void *entry, uint32_t *pos) uint32_t i; for (i = 0; i < RTE_TABLE_LPM_MAX_NEXT_HOPS; i++) { - 
uint32_t *nht_entry = &lpm->nht[i * lpm->entry_size]; + uint8_t *nht_entry = &lpm->nht[i * lpm->entry_size]; if ((lpm->nht_users[i] > 0) && (memcmp(nht_entry, entry, lpm->entry_unique_size) == 0)) { @@ -242,7 +244,7 @@ rte_table_lpm_entry_add( /* Find existing or free NHT entry */ if (nht_find_existing(lpm, entry, &nht_pos) == 0) { - uint32_t *nht_entry; + uint8_t *nht_entry; if (nht_find_free(lpm, &nht_pos) == 0) { RTE_LOG(ERR, TABLE, "%s: NHT full\n", __func__); diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile index e33ff53e..538adb0b 100644 --- a/lib/librte_vhost/Makefile +++ b/lib/librte_vhost/Makefile @@ -36,7 +36,7 @@ LIB = librte_vhost.a EXPORT_MAP := rte_vhost_version.map -LIBABIVER := 2 +LIBABIVER := 3 CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -D_FILE_OFFSET_BITS=64 ifeq ($(CONFIG_RTE_LIBRTE_VHOST_USER),y) @@ -66,6 +66,7 @@ SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_virtio_net.h DEPDIRS-$(CONFIG_RTE_LIBRTE_VHOST) += lib/librte_eal DEPDIRS-$(CONFIG_RTE_LIBRTE_VHOST) += lib/librte_ether DEPDIRS-$(CONFIG_RTE_LIBRTE_VHOST) += lib/librte_mbuf +DEPDIRS-$(CONFIG_RTE_LIBRTE_VHOST) += lib/librte_mempool DEPDIRS-$(CONFIG_RTE_LIBRTE_VHOST) += lib/librte_net include $(RTE_SDK)/mk/rte.lib.mk diff --git a/lib/librte_vhost/rte_vhost_version.map b/lib/librte_vhost/rte_vhost_version.map index 3d8709e5..5ceaa8a5 100644 --- a/lib/librte_vhost/rte_vhost_version.map +++ b/lib/librte_vhost/rte_vhost_version.map @@ -20,3 +20,13 @@ DPDK_2.1 { rte_vhost_driver_unregister; } DPDK_2.0; + +DPDK_16.07 { + global: + + rte_vhost_avail_entries; + rte_vhost_get_ifname; + rte_vhost_get_numa_node; + rte_vhost_get_queue_num; + +} DPDK_2.1; diff --git a/lib/librte_vhost/rte_virtio_net.h b/lib/librte_vhost/rte_virtio_net.h index 600b20b4..9caa6221 100644 --- a/lib/librte_vhost/rte_virtio_net.h +++ b/lib/librte_vhost/rte_virtio_net.h @@ -51,125 +51,12 @@ #include <rte_mempool.h> #include <rte_ether.h> -struct rte_mbuf; - -#define VHOST_MEMORY_MAX_NREGIONS 8 - -/* Used 
to indicate that the device is running on a data core */ -#define VIRTIO_DEV_RUNNING 1 - -/* Backend value set by guest. */ -#define VIRTIO_DEV_STOPPED -1 - +#define RTE_VHOST_USER_CLIENT (1ULL << 0) +#define RTE_VHOST_USER_NO_RECONNECT (1ULL << 1) /* Enum for virtqueue management. */ enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM}; -#define BUF_VECTOR_MAX 256 - -/** - * Structure contains buffer address, length and descriptor index - * from vring to do scatter RX. - */ -struct buf_vector { - uint64_t buf_addr; - uint32_t buf_len; - uint32_t desc_idx; -}; - -/** - * Structure contains variables relevant to RX/TX virtqueues. - */ -struct vhost_virtqueue { - struct vring_desc *desc; /**< Virtqueue descriptor ring. */ - struct vring_avail *avail; /**< Virtqueue available ring. */ - struct vring_used *used; /**< Virtqueue used ring. */ - uint32_t size; /**< Size of descriptor ring. */ - uint32_t backend; /**< Backend value to determine if device should started/stopped. */ - uint16_t vhost_hlen; /**< Vhost header length (varies depending on RX merge buffers. */ - volatile uint16_t last_used_idx; /**< Last index used on the available ring */ - volatile uint16_t last_used_idx_res; /**< Used for multiple devices reserving buffers. */ -#define VIRTIO_INVALID_EVENTFD (-1) -#define VIRTIO_UNINITIALIZED_EVENTFD (-2) - int callfd; /**< Used to notify the guest (trigger interrupt). */ - int kickfd; /**< Currently unused as polling mode is enabled. */ - int enabled; - uint64_t log_guest_addr; /**< Physical address of used ring, for logging */ - uint64_t reserved[15]; /**< Reserve some spaces for future extension. */ - struct buf_vector buf_vec[BUF_VECTOR_MAX]; /**< for scatter RX. */ -} __rte_cache_aligned; - -/* Old kernels have no such macro defined */ -#ifndef VIRTIO_NET_F_GUEST_ANNOUNCE - #define VIRTIO_NET_F_GUEST_ANNOUNCE 21 -#endif - - -/* - * Make an extra wrapper for VIRTIO_NET_F_MQ and - * VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX as they are - * introduced since kernel v3.8. 
This makes our - * code buildable for older kernel. - */ -#ifdef VIRTIO_NET_F_MQ - #define VHOST_MAX_QUEUE_PAIRS VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX - #define VHOST_SUPPORTS_MQ (1ULL << VIRTIO_NET_F_MQ) -#else - #define VHOST_MAX_QUEUE_PAIRS 1 - #define VHOST_SUPPORTS_MQ 0 -#endif - -/* - * Define virtio 1.0 for older kernels - */ -#ifndef VIRTIO_F_VERSION_1 - #define VIRTIO_F_VERSION_1 32 -#endif - -/** - * Device structure contains all configuration information relating to the device. - */ -struct virtio_net { - struct virtio_memory *mem; /**< QEMU memory and memory region information. */ - uint64_t features; /**< Negotiated feature set. */ - uint64_t protocol_features; /**< Negotiated protocol feature set. */ - uint64_t device_fh; /**< device identifier. */ - uint32_t flags; /**< Device flags. Only used to check if device is running on data core. */ -#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ) - char ifname[IF_NAME_SZ]; /**< Name of the tap device or socket path. */ - uint32_t virt_qp_nb; /**< number of queue pair we have allocated */ - void *priv; /**< private context */ - uint64_t log_size; /**< Size of log area */ - uint64_t log_base; /**< Where dirty pages are logged */ - struct ether_addr mac; /**< MAC address */ - rte_atomic16_t broadcast_rarp; /**< A flag to tell if we need broadcast rarp packet */ - uint64_t reserved[61]; /**< Reserve some spaces for future extension. */ - struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2]; /**< Contains all virtqueue information. */ -} __rte_cache_aligned; - -/** - * Information relating to memory regions including offsets to addresses in QEMUs memory file. - */ -struct virtio_memory_regions { - uint64_t guest_phys_address; /**< Base guest physical address of region. */ - uint64_t guest_phys_address_end; /**< End guest physical address of region. */ - uint64_t memory_size; /**< Size of region. */ - uint64_t userspace_address; /**< Base userspace address of region. 
*/ - uint64_t address_offset; /**< Offset of region for address translation. */ -}; - - -/** - * Memory structure includes region and mapping information. - */ -struct virtio_memory { - uint64_t base_address; /**< Base QEMU userspace address of the memory file. */ - uint64_t mapped_address; /**< Mapped address of memory file base in our applications memory space. */ - uint64_t mapped_size; /**< Total size of memory file. */ - uint32_t nregions; /**< Number of memory regions. */ - struct virtio_memory_regions regions[0]; /**< Memory region information. */ -}; - /** * Device and vring operations. * @@ -178,45 +65,13 @@ struct virtio_memory { * */ struct virtio_net_device_ops { - int (*new_device)(struct virtio_net *); /**< Add device. */ - void (*destroy_device)(volatile struct virtio_net *); /**< Remove device. */ - - int (*vring_state_changed)(struct virtio_net *dev, uint16_t queue_id, int enable); /**< triggered when a vring is enabled or disabled */ -}; - -static inline uint16_t __attribute__((always_inline)) -rte_vring_available_entries(struct virtio_net *dev, uint16_t queue_id) -{ - struct vhost_virtqueue *vq = dev->virtqueue[queue_id]; - - if (!vq->enabled) - return 0; + int (*new_device)(int vid); /**< Add device. */ + void (*destroy_device)(int vid); /**< Remove device. */ - return *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx_res; -} - -/** - * Function to convert guest physical addresses to vhost virtual addresses. - * This is used to convert guest virtio buffer addresses. 
- */ -static inline uint64_t __attribute__((always_inline)) -gpa_to_vva(struct virtio_net *dev, uint64_t guest_pa) -{ - struct virtio_memory_regions *region; - uint32_t regionidx; - uint64_t vhost_va = 0; - - for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { - region = &dev->mem->regions[regionidx]; - if ((guest_pa >= region->guest_phys_address) && - (guest_pa <= region->guest_phys_address_end)) { - vhost_va = region->address_offset + guest_pa; - break; - } - } - return vhost_va; -} + int (*vring_state_changed)(int vid, uint16_t queue_id, int enable); /**< triggered when a vring is enabled or disabled */ + void *reserved[5]; /**< Reserved for future extension */ +}; /** * Disable features in feature_mask. Returns 0 on success. @@ -231,13 +86,16 @@ int rte_vhost_feature_enable(uint64_t feature_mask); /* Returns currently supported vhost features */ uint64_t rte_vhost_feature_get(void); -int rte_vhost_enable_guest_notification(struct virtio_net *dev, uint16_t queue_id, int enable); +int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable); -/* Register vhost driver. dev_name could be different for multiple instance support. */ -int rte_vhost_driver_register(const char *dev_name); +/** + * Register vhost driver. path could be different for multiple + * instance support. + */ +int rte_vhost_driver_register(const char *path, uint64_t flags); /* Unregister vhost driver. This is only meaningful to vhost user. */ -int rte_vhost_driver_unregister(const char *dev_name); +int rte_vhost_driver_unregister(const char *path); /* Register callbacks. */ int rte_vhost_driver_callback_register(struct virtio_net_device_ops const * const); @@ -245,12 +103,65 @@ int rte_vhost_driver_callback_register(struct virtio_net_device_ops const * cons int rte_vhost_driver_session_start(void); /** + * Get the numa node from which the virtio net device's memory + * is allocated. 
+ * + * @param vid + * virtio-net device ID + * + * @return + * The numa node, -1 on failure + */ +int rte_vhost_get_numa_node(int vid); + +/** + * Get the number of queues the device supports. + * + * @param vid + * virtio-net device ID + * + * @return + * The number of queues, 0 on failure + */ +uint32_t rte_vhost_get_queue_num(int vid); + +/** + * Get the virtio net device's ifname. For vhost-cuse, ifname is the + * path of the char device. For vhost-user, ifname is the vhost-user + * socket file path. + * + * @param vid + * virtio-net device ID + * @param buf + * The buffer to stored the queried ifname + * @param len + * The length of buf + * + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_get_ifname(int vid, char *buf, size_t len); + +/** + * Get how many avail entries are left in the queue + * + * @param vid + * virtio-net device ID + * @param queue_id + * virtio queue index + * + * @return + * num of avail entires left + */ +uint16_t rte_vhost_avail_entries(int vid, uint16_t queue_id); + +/** * This function adds buffers to the virtio devices RX virtqueue. Buffers can * be received from the physical port or from another virtual device. A packet * count is returned to indicate the number of packets that were succesfully * added to the RX queue. - * @param dev - * virtio-net device + * @param vid + * virtio-net device ID * @param queue_id * virtio queue index in mq case * @param pkts @@ -260,14 +171,14 @@ int rte_vhost_driver_session_start(void); * @return * num of packets enqueued */ -uint16_t rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, +uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id, struct rte_mbuf **pkts, uint16_t count); /** * This function gets guest buffers from the virtio device TX virtqueue, * construct host mbufs, copies guest buffer content to host mbufs and * store them in pkts to be processed. 
- * @param dev + * @param vid * virtio-net device * @param queue_id * virtio queue index in mq case @@ -280,7 +191,7 @@ uint16_t rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, * @return * num of packets dequeued */ -uint16_t rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, +uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count); #endif /* _VIRTIO_NET_H_ */ diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h index f193a1f6..38593a29 100644 --- a/lib/librte_vhost/vhost-net.h +++ b/lib/librte_vhost/vhost-net.h @@ -43,6 +43,128 @@ #include "rte_virtio_net.h" +/* Used to indicate that the device is running on a data core */ +#define VIRTIO_DEV_RUNNING 1 + +/* Backend value set by guest. */ +#define VIRTIO_DEV_STOPPED -1 + +#define BUF_VECTOR_MAX 256 + +/** + * Structure contains buffer address, length and descriptor index + * from vring to do scatter RX. + */ +struct buf_vector { + uint64_t buf_addr; + uint32_t buf_len; + uint32_t desc_idx; +}; + +/** + * Structure contains variables relevant to RX/TX virtqueues. 
+ */ +struct vhost_virtqueue { + struct vring_desc *desc; + struct vring_avail *avail; + struct vring_used *used; + uint32_t size; + + /* Last index used on the available ring */ + volatile uint16_t last_used_idx; +#define VIRTIO_INVALID_EVENTFD (-1) +#define VIRTIO_UNINITIALIZED_EVENTFD (-2) + + /* Backend value to determine if device should started/stopped */ + int backend; + /* Used to notify the guest (trigger interrupt) */ + int callfd; + /* Currently unused as polling mode is enabled */ + int kickfd; + int enabled; + + /* Physical address of used ring, for logging */ + uint64_t log_guest_addr; +} __rte_cache_aligned; + +/* Old kernels have no such macro defined */ +#ifndef VIRTIO_NET_F_GUEST_ANNOUNCE + #define VIRTIO_NET_F_GUEST_ANNOUNCE 21 +#endif + + +/* + * Make an extra wrapper for VIRTIO_NET_F_MQ and + * VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX as they are + * introduced since kernel v3.8. This makes our + * code buildable for older kernel. + */ +#ifdef VIRTIO_NET_F_MQ + #define VHOST_MAX_QUEUE_PAIRS VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX + #define VHOST_SUPPORTS_MQ (1ULL << VIRTIO_NET_F_MQ) +#else + #define VHOST_MAX_QUEUE_PAIRS 1 + #define VHOST_SUPPORTS_MQ 0 +#endif + +/* + * Define virtio 1.0 for older kernels + */ +#ifndef VIRTIO_F_VERSION_1 + #define VIRTIO_F_VERSION_1 32 +#endif + +/** + * Device structure contains all configuration information relating + * to the device. + */ +struct virtio_net { + /* Frontend (QEMU) memory and memory region information */ + struct virtio_memory *mem; + uint64_t features; + uint64_t protocol_features; + int vid; + uint32_t flags; + uint16_t vhost_hlen; + /* to tell if we need broadcast rarp packet */ + rte_atomic16_t broadcast_rarp; + uint32_t virt_qp_nb; + struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2]; +#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? 
PATH_MAX : IFNAMSIZ) + char ifname[IF_NAME_SZ]; + uint64_t log_size; + uint64_t log_base; + uint64_t log_addr; + struct ether_addr mac; + +} __rte_cache_aligned; + +/** + * Information relating to memory regions including offsets to + * addresses in QEMUs memory file. + */ +struct virtio_memory_regions { + uint64_t guest_phys_address; + uint64_t guest_phys_address_end; + uint64_t memory_size; + uint64_t userspace_address; + uint64_t address_offset; +}; + + +/** + * Memory structure includes region and mapping information. + */ +struct virtio_memory { + /* Base QEMU userspace address of the memory file. */ + uint64_t base_address; + uint64_t mapped_address; + uint64_t mapped_size; + uint32_t nregions; + struct virtio_memory_regions regions[0]; +}; + + /* Macros for printing using RTE_LOG */ #define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1 #define RTE_LOGTYPE_VHOST_DATA RTE_LOGTYPE_USER1 @@ -57,9 +179,9 @@ char packet[VHOST_MAX_PRINT_BUFF]; \ \ if ((header)) \ - snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%" PRIu64 ") Header size %d: ", (device->device_fh), (size)); \ + snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Header size %d: ", (device->vid), (size)); \ else \ - snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%" PRIu64 ") Packet size %d: ", (device->device_fh), (size)); \ + snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Packet size %d: ", (device->vid), (size)); \ for (index = 0; index < (size); index++) { \ snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), \ "%02hhx ", pkt_addr[index]); \ @@ -74,37 +196,51 @@ #define PRINT_PACKET(device, addr, size, header) do {} while (0) #endif - -/* - * Structure used to identify device context. +/** + * Function to convert guest physical addresses to vhost virtual addresses. + * This is used to convert guest virtio buffer addresses. */ -struct vhost_device_ctx { - pid_t pid; /* PID of process calling the IOCTL. 
*/ - uint64_t fh; /* Populated with fi->fh to track the device index. */ -}; - -int vhost_new_device(struct vhost_device_ctx); -void vhost_destroy_device(struct vhost_device_ctx); - -void vhost_set_ifname(struct vhost_device_ctx, - const char *if_name, unsigned int if_len); - -int vhost_get_features(struct vhost_device_ctx, uint64_t *); -int vhost_set_features(struct vhost_device_ctx, uint64_t *); - -int vhost_set_vring_num(struct vhost_device_ctx, struct vhost_vring_state *); -int vhost_set_vring_addr(struct vhost_device_ctx, struct vhost_vring_addr *); -int vhost_set_vring_base(struct vhost_device_ctx, struct vhost_vring_state *); -int vhost_get_vring_base(struct vhost_device_ctx, - uint32_t, struct vhost_vring_state *); - -int vhost_set_vring_kick(struct vhost_device_ctx, struct vhost_vring_file *); -int vhost_set_vring_call(struct vhost_device_ctx, struct vhost_vring_file *); - -int vhost_set_backend(struct vhost_device_ctx, struct vhost_vring_file *); - -int vhost_set_owner(struct vhost_device_ctx); -int vhost_reset_owner(struct vhost_device_ctx); +static inline uint64_t __attribute__((always_inline)) +gpa_to_vva(struct virtio_net *dev, uint64_t guest_pa) +{ + struct virtio_memory_regions *region; + uint32_t regionidx; + uint64_t vhost_va = 0; + + for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { + region = &dev->mem->regions[regionidx]; + if ((guest_pa >= region->guest_phys_address) && + (guest_pa <= region->guest_phys_address_end)) { + vhost_va = region->address_offset + guest_pa; + break; + } + } + return vhost_va; +} + +struct virtio_net_device_ops const *notify_ops; +struct virtio_net *get_device(int vid); + +int vhost_new_device(void); +void vhost_destroy_device(int); + +void vhost_set_ifname(int, const char *if_name, unsigned int if_len); + +int vhost_get_features(int, uint64_t *); +int vhost_set_features(int, uint64_t *); + +int vhost_set_vring_num(int, struct vhost_vring_state *); +int vhost_set_vring_addr(int, struct vhost_vring_addr 
*); +int vhost_set_vring_base(int, struct vhost_vring_state *); +int vhost_get_vring_base(int, uint32_t, struct vhost_vring_state *); + +int vhost_set_vring_kick(int, struct vhost_vring_file *); +int vhost_set_vring_call(int, struct vhost_vring_file *); + +int vhost_set_backend(int, struct vhost_vring_file *); + +int vhost_set_owner(int); +int vhost_reset_owner(int); /* * Backend-specific cleanup. Defined by vhost-cuse and vhost-user. diff --git a/lib/librte_vhost/vhost_cuse/vhost-net-cdev.c b/lib/librte_vhost/vhost_cuse/vhost-net-cdev.c index c613e68e..5d150116 100644 --- a/lib/librte_vhost/vhost_cuse/vhost-net-cdev.c +++ b/lib/librte_vhost/vhost_cuse/vhost-net-cdev.c @@ -60,17 +60,18 @@ static const char default_cdev[] = "vhost-net"; static struct fuse_session *session; /* - * Returns vhost_device_ctx from given fuse_req_t. The index is populated later - * when the device is added to the device linked list. + * Returns vhost_cuse_device_ctx from given fuse_req_t. The + * index is populated later when the device is added to the + * device linked list. 
*/ -static struct vhost_device_ctx +static struct vhost_cuse_device_ctx fuse_req_to_vhost_ctx(fuse_req_t req, struct fuse_file_info *fi) { - struct vhost_device_ctx ctx; + struct vhost_cuse_device_ctx ctx; struct fuse_ctx const *const req_ctx = fuse_req_ctx(req); ctx.pid = req_ctx->pid; - ctx.fh = fi->fh; + ctx.vid = (int)fi->fh; return ctx; } @@ -82,19 +83,18 @@ fuse_req_to_vhost_ctx(fuse_req_t req, struct fuse_file_info *fi) static void vhost_net_open(fuse_req_t req, struct fuse_file_info *fi) { - struct vhost_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi); - int err = 0; + int vid = 0; - err = vhost_new_device(ctx); - if (err == -1) { + vid = vhost_new_device(); + if (vid == -1) { fuse_reply_err(req, EPERM); return; } - fi->fh = err; + fi->fh = vid; RTE_LOG(INFO, VHOST_CONFIG, - "(%"PRIu64") Device configuration started\n", fi->fh); + "(%d) device configuration started\n", vid); fuse_reply_open(req, fi); } @@ -105,19 +105,19 @@ static void vhost_net_release(fuse_req_t req, struct fuse_file_info *fi) { int err = 0; - struct vhost_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi); + struct vhost_cuse_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi); - vhost_destroy_device(ctx); - RTE_LOG(INFO, VHOST_CONFIG, "(%"PRIu64") Device released\n", ctx.fh); + vhost_destroy_device(ctx.vid); + RTE_LOG(INFO, VHOST_CONFIG, "(%d) device released\n", ctx.vid); fuse_reply_err(req, err); } /* * Boilerplate code for CUSE IOCTL - * Implicit arguments: ctx, req, result. + * Implicit arguments: vid, req, result. */ #define VHOST_IOCTL(func) do { \ - result = (func)(ctx); \ + result = (func)(vid); \ fuse_reply_ioctl(req, result, NULL, 0); \ } while (0) @@ -134,41 +134,41 @@ vhost_net_release(fuse_req_t req, struct fuse_file_info *fi) /* * Boilerplate code for CUSE Read IOCTL - * Implicit arguments: ctx, req, result, in_bufsz, in_buf. + * Implicit arguments: vid, req, result, in_bufsz, in_buf. 
*/ #define VHOST_IOCTL_R(type, var, func) do { \ if (!in_bufsz) { \ VHOST_IOCTL_RETRY(sizeof(type), 0);\ } else { \ (var) = *(const type*)in_buf; \ - result = func(ctx, &(var)); \ + result = func(vid, &(var)); \ fuse_reply_ioctl(req, result, NULL, 0);\ } \ } while (0) /* * Boilerplate code for CUSE Write IOCTL - * Implicit arguments: ctx, req, result, out_bufsz. + * Implicit arguments: vid, req, result, out_bufsz. */ #define VHOST_IOCTL_W(type, var, func) do { \ if (!out_bufsz) { \ VHOST_IOCTL_RETRY(0, sizeof(type));\ } else { \ - result = (func)(ctx, &(var));\ + result = (func)(vid, &(var));\ fuse_reply_ioctl(req, result, &(var), sizeof(type));\ } \ } while (0) /* * Boilerplate code for CUSE Read/Write IOCTL - * Implicit arguments: ctx, req, result, in_bufsz, in_buf. + * Implicit arguments: vid, req, result, in_bufsz, in_buf. */ #define VHOST_IOCTL_RW(type1, var1, type2, var2, func) do { \ if (!in_bufsz) { \ VHOST_IOCTL_RETRY(sizeof(type1), sizeof(type2));\ } else { \ (var1) = *(const type1*) (in_buf); \ - result = (func)(ctx, (var1), &(var2)); \ + result = (func)(vid, (var1), &(var2)); \ fuse_reply_ioctl(req, result, &(var2), sizeof(type2));\ } \ } while (0) @@ -183,18 +183,19 @@ vhost_net_ioctl(fuse_req_t req, int cmd, void *arg, struct fuse_file_info *fi, __rte_unused unsigned flags, const void *in_buf, size_t in_bufsz, size_t out_bufsz) { - struct vhost_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi); + struct vhost_cuse_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi); struct vhost_vring_file file; struct vhost_vring_state state; struct vhost_vring_addr addr; uint64_t features; uint32_t index; int result = 0; + int vid = ctx.vid; switch (cmd) { case VHOST_NET_SET_BACKEND: LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_NET_SET_BACKEND\n", ctx.fh); + "(%d) IOCTL: VHOST_NET_SET_BACKEND\n", ctx.vid); if (!in_buf) { VHOST_IOCTL_RETRY(sizeof(file), 0); break; @@ -206,32 +207,32 @@ vhost_net_ioctl(fuse_req_t req, int cmd, void *arg, case VHOST_GET_FEATURES: 
LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_GET_FEATURES\n", ctx.fh); + "(%d) IOCTL: VHOST_GET_FEATURES\n", vid); VHOST_IOCTL_W(uint64_t, features, vhost_get_features); break; case VHOST_SET_FEATURES: LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_SET_FEATURES\n", ctx.fh); + "(%d) IOCTL: VHOST_SET_FEATURES\n", vid); VHOST_IOCTL_R(uint64_t, features, vhost_set_features); break; case VHOST_RESET_OWNER: LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_RESET_OWNER\n", ctx.fh); + "(%d) IOCTL: VHOST_RESET_OWNER\n", vid); VHOST_IOCTL(vhost_reset_owner); break; case VHOST_SET_OWNER: LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_SET_OWNER\n", ctx.fh); + "(%d) IOCTL: VHOST_SET_OWNER\n", vid); VHOST_IOCTL(vhost_set_owner); break; case VHOST_SET_MEM_TABLE: /*TODO fix race condition.*/ LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_SET_MEM_TABLE\n", ctx.fh); + "(%d) IOCTL: VHOST_SET_MEM_TABLE\n", vid); static struct vhost_memory mem_temp; switch (in_bufsz) { @@ -264,28 +265,28 @@ vhost_net_ioctl(fuse_req_t req, int cmd, void *arg, case VHOST_SET_VRING_NUM: LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_SET_VRING_NUM\n", ctx.fh); + "(%d) IOCTL: VHOST_SET_VRING_NUM\n", vid); VHOST_IOCTL_R(struct vhost_vring_state, state, vhost_set_vring_num); break; case VHOST_SET_VRING_BASE: LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_SET_VRING_BASE\n", ctx.fh); + "(%d) IOCTL: VHOST_SET_VRING_BASE\n", vid); VHOST_IOCTL_R(struct vhost_vring_state, state, vhost_set_vring_base); break; case VHOST_GET_VRING_BASE: LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_GET_VRING_BASE\n", ctx.fh); + "(%d) IOCTL: VHOST_GET_VRING_BASE\n", vid); VHOST_IOCTL_RW(uint32_t, index, struct vhost_vring_state, state, vhost_get_vring_base); break; case VHOST_SET_VRING_ADDR: LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_SET_VRING_ADDR\n", ctx.fh); + "(%d) IOCTL: VHOST_SET_VRING_ADDR\n", vid); VHOST_IOCTL_R(struct vhost_vring_addr, addr, vhost_set_vring_addr); break; @@ 
-294,12 +295,10 @@ vhost_net_ioctl(fuse_req_t req, int cmd, void *arg, case VHOST_SET_VRING_CALL: if (cmd == VHOST_SET_VRING_KICK) LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_SET_VRING_KICK\n", - ctx.fh); + "(%d) IOCTL: VHOST_SET_VRING_KICK\n", vid); else LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_SET_VRING_CALL\n", - ctx.fh); + "(%d) IOCTL: VHOST_SET_VRING_CALL\n", vid); if (!in_buf) VHOST_IOCTL_RETRY(sizeof(struct vhost_vring_file), 0); else { @@ -315,10 +314,10 @@ vhost_net_ioctl(fuse_req_t req, int cmd, void *arg, } file.fd = fd; if (cmd == VHOST_SET_VRING_KICK) { - result = vhost_set_vring_kick(ctx, &file); + result = vhost_set_vring_kick(vid, &file); fuse_reply_ioctl(req, result, NULL, 0); } else { - result = vhost_set_vring_call(ctx, &file); + result = vhost_set_vring_call(vid, &file); fuse_reply_ioctl(req, result, NULL, 0); } } @@ -326,17 +325,17 @@ vhost_net_ioctl(fuse_req_t req, int cmd, void *arg, default: RTE_LOG(ERR, VHOST_CONFIG, - "(%"PRIu64") IOCTL: DOESN NOT EXIST\n", ctx.fh); + "(%d) IOCTL: DOESN NOT EXIST\n", vid); result = -1; fuse_reply_ioctl(req, result, NULL, 0); } if (result < 0) LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: FAIL\n", ctx.fh); + "(%d) IOCTL: FAIL\n", vid); else LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: SUCCESS\n", ctx.fh); + "(%d) IOCTL: SUCCESS\n", vid); } /* @@ -353,7 +352,7 @@ static const struct cuse_lowlevel_ops vhost_net_ops = { * vhost_net_device_ops are also passed when the device is registered in app. 
*/ int -rte_vhost_driver_register(const char *dev_name) +rte_vhost_driver_register(const char *dev_name, uint64_t flags) { struct cuse_info cuse_info; char device_name[PATH_MAX] = ""; @@ -365,6 +364,12 @@ rte_vhost_driver_register(const char *dev_name) char fuse_opt_nomulti[] = FUSE_OPT_NOMULTI; char *fuse_argv[] = {fuse_opt_dummy, fuse_opt_fore, fuse_opt_nomulti}; + if (flags) { + RTE_LOG(ERR, VHOST_CONFIG, + "vhost-cuse does not support any flags so far\n"); + return -1; + } + if (access(cuse_device_name, R_OK | W_OK) < 0) { RTE_LOG(ERR, VHOST_CONFIG, "char device %s can't be accessed, maybe not exist\n", diff --git a/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c b/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c index a68a8bd4..552be7d4 100644 --- a/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c +++ b/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c @@ -54,7 +54,6 @@ #include "rte_virtio_net.h" #include "vhost-net.h" #include "virtio-net-cdev.h" -#include "virtio-net.h" #include "eventfd_copy.h" /* Line size for reading maps file. */ @@ -263,7 +262,7 @@ host_memory_map(pid_t pid, uint64_t addr, } int -cuse_set_mem_table(struct vhost_device_ctx ctx, +cuse_set_mem_table(struct vhost_cuse_device_ctx ctx, const struct vhost_memory *mem_regions_addr, uint32_t nregions) { uint64_t size = offsetof(struct vhost_memory, regions); @@ -274,7 +273,7 @@ cuse_set_mem_table(struct vhost_device_ctx ctx, uint64_t base_address = 0, mapped_address, mapped_size; struct virtio_net *dev; - dev = get_device(ctx); + dev = get_device(ctx.vid); if (dev == NULL) return -1; @@ -289,8 +288,8 @@ cuse_set_mem_table(struct vhost_device_ctx ctx, sizeof(struct virtio_memory_regions) * nregions); if (dev->mem == NULL) { RTE_LOG(ERR, VHOST_CONFIG, - "(%"PRIu64") Failed to allocate memory for dev->mem\n", - dev->device_fh); + "(%d) failed to allocate memory for dev->mem\n", + dev->vid); return -1; } @@ -379,7 +378,7 @@ cuse_set_mem_table(struct vhost_device_ctx ctx, * save it in the device structure. 
*/ static int -get_ifname(struct vhost_device_ctx ctx, struct virtio_net *dev, int tap_fd, int pid) +get_ifname(int vid, int tap_fd, int pid) { int fd_tap; struct ifreq ifr; @@ -393,33 +392,32 @@ get_ifname(struct vhost_device_ctx ctx, struct virtio_net *dev, int tap_fd, int ret = ioctl(fd_tap, TUNGETIFF, &ifr); if (close(fd_tap) < 0) - RTE_LOG(ERR, VHOST_CONFIG, - "(%"PRIu64") fd close failed\n", - dev->device_fh); + RTE_LOG(ERR, VHOST_CONFIG, "(%d) fd close failed\n", vid); if (ret >= 0) { ifr_size = strnlen(ifr.ifr_name, sizeof(ifr.ifr_name)); - vhost_set_ifname(ctx, ifr.ifr_name, ifr_size); + vhost_set_ifname(vid, ifr.ifr_name, ifr_size); } else RTE_LOG(ERR, VHOST_CONFIG, - "(%"PRIu64") TUNGETIFF ioctl failed\n", - dev->device_fh); + "(%d) TUNGETIFF ioctl failed\n", vid); return 0; } -int cuse_set_backend(struct vhost_device_ctx ctx, struct vhost_vring_file *file) +int +cuse_set_backend(struct vhost_cuse_device_ctx ctx, + struct vhost_vring_file *file) { struct virtio_net *dev; - dev = get_device(ctx); + dev = get_device(ctx.vid); if (dev == NULL) return -1; if (!(dev->flags & VIRTIO_DEV_RUNNING) && file->fd != VIRTIO_DEV_STOPPED) - get_ifname(ctx, dev, file->fd, ctx.pid); + get_ifname(ctx.vid, file->fd, ctx.pid); - return vhost_set_backend(ctx, file); + return vhost_set_backend(ctx.vid, file); } void diff --git a/lib/librte_vhost/vhost_cuse/virtio-net-cdev.h b/lib/librte_vhost/vhost_cuse/virtio-net-cdev.h index eb6b0bab..3f67154b 100644 --- a/lib/librte_vhost/vhost_cuse/virtio-net-cdev.h +++ b/lib/librte_vhost/vhost_cuse/virtio-net-cdev.h @@ -38,11 +38,19 @@ #include "vhost-net.h" +/* + * Structure used to identify device context. + */ +struct vhost_cuse_device_ctx { + pid_t pid; /* PID of process calling the IOCTL. 
*/ + int vid; /* Virtio-net device ID */ +}; + int -cuse_set_mem_table(struct vhost_device_ctx ctx, +cuse_set_mem_table(struct vhost_cuse_device_ctx ctx, const struct vhost_memory *mem_regions_addr, uint32_t nregions); int -cuse_set_backend(struct vhost_device_ctx ctx, struct vhost_vring_file *); +cuse_set_backend(struct vhost_cuse_device_ctx ctx, struct vhost_vring_file *); #endif diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c index 750821a4..15ca9562 100644 --- a/lib/librte_vhost/vhost_rxtx.c +++ b/lib/librte_vhost/vhost_rxtx.c @@ -126,10 +126,10 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr) } static inline void -copy_virtio_net_hdr(struct vhost_virtqueue *vq, uint64_t desc_addr, +copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr, struct virtio_net_hdr_mrg_rxbuf hdr) { - if (vq->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf)) + if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf)) *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr; else *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr; @@ -137,7 +137,7 @@ copy_virtio_net_hdr(struct vhost_virtqueue *vq, uint64_t desc_addr, static inline int __attribute__((always_inline)) copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, - struct rte_mbuf *m, uint16_t desc_idx, uint32_t *copied) + struct rte_mbuf *m, uint16_t desc_idx) { uint32_t desc_avail, desc_offset; uint32_t mbuf_avail, mbuf_offset; @@ -147,21 +147,20 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0}; desc = &vq->desc[desc_idx]; - if (unlikely(desc->len < vq->vhost_hlen)) + if (unlikely(desc->len < dev->vhost_hlen)) return -1; desc_addr = gpa_to_vva(dev, desc->addr); rte_prefetch0((void *)(uintptr_t)desc_addr); virtio_enqueue_offload(m, &virtio_hdr.hdr); - copy_virtio_net_hdr(vq, desc_addr, virtio_hdr); - vhost_log_write(dev, desc->addr, 
vq->vhost_hlen); - PRINT_PACKET(dev, (uintptr_t)desc_addr, vq->vhost_hlen, 0); + copy_virtio_net_hdr(dev, desc_addr, virtio_hdr); + vhost_log_write(dev, desc->addr, dev->vhost_hlen); + PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0); - desc_offset = vq->vhost_hlen; - desc_avail = desc->len - vq->vhost_hlen; + desc_offset = dev->vhost_hlen; + desc_avail = desc->len - dev->vhost_hlen; - *copied = rte_pktmbuf_pkt_len(m); mbuf_avail = rte_pktmbuf_data_len(m); mbuf_offset = 0; while (mbuf_avail != 0 || m->next != NULL) { @@ -205,49 +204,6 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, return 0; } -/* - * As many data cores may want to access available buffers - * they need to be reserved. - */ -static inline uint32_t -reserve_avail_buf(struct vhost_virtqueue *vq, uint32_t count, - uint16_t *start, uint16_t *end) -{ - uint16_t res_start_idx; - uint16_t res_end_idx; - uint16_t avail_idx; - uint16_t free_entries; - int success; - - count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST); - -again: - res_start_idx = vq->last_used_idx_res; - avail_idx = *((volatile uint16_t *)&vq->avail->idx); - - free_entries = avail_idx - res_start_idx; - count = RTE_MIN(count, free_entries); - if (count == 0) - return 0; - - res_end_idx = res_start_idx + count; - - /* - * update vq->last_used_idx_res atomically; try again if failed. - * - * TODO: Allow to disable cmpset if no concurrency in application. - */ - success = rte_atomic16_cmpset(&vq->last_used_idx_res, - res_start_idx, res_end_idx); - if (unlikely(!success)) - goto again; - - *start = res_start_idx; - *end = res_end_idx; - - return count; -} - /** * This function adds buffers to the virtio devices RX virtqueue. Buffers can * be received from the physical port or from another virtio device. 
A packet @@ -260,15 +216,15 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, struct rte_mbuf **pkts, uint32_t count) { struct vhost_virtqueue *vq; - uint16_t res_start_idx, res_end_idx; + uint16_t avail_idx, free_entries, start_idx; uint16_t desc_indexes[MAX_PKT_BURST]; + uint16_t used_idx; uint32_t i; - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh); + LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) { - RTE_LOG(ERR, VHOST_DATA, - "%s (%"PRIu64"): virtqueue idx:%d invalid.\n", - __func__, dev->device_fh, queue_id); + RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n", + dev->vid, __func__, queue_id); return 0; } @@ -276,38 +232,43 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, if (unlikely(vq->enabled == 0)) return 0; - count = reserve_avail_buf(vq, count, &res_start_idx, &res_end_idx); + avail_idx = *((volatile uint16_t *)&vq->avail->idx); + start_idx = vq->last_used_idx; + free_entries = avail_idx - start_idx; + count = RTE_MIN(count, free_entries); + count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST); if (count == 0) return 0; - LOG_DEBUG(VHOST_DATA, - "(%"PRIu64") res_start_idx %d| res_end_idx Index %d\n", - dev->device_fh, res_start_idx, res_end_idx); + LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n", + dev->vid, start_idx, start_idx + count); /* Retrieve all of the desc indexes first to avoid caching issues. 
*/ - rte_prefetch0(&vq->avail->ring[res_start_idx & (vq->size - 1)]); + rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]); for (i = 0; i < count; i++) { - desc_indexes[i] = vq->avail->ring[(res_start_idx + i) & - (vq->size - 1)]; + used_idx = (start_idx + i) & (vq->size - 1); + desc_indexes[i] = vq->avail->ring[used_idx]; + vq->used->ring[used_idx].id = desc_indexes[i]; + vq->used->ring[used_idx].len = pkts[i]->pkt_len + + dev->vhost_hlen; + vhost_log_used_vring(dev, vq, + offsetof(struct vring_used, ring[used_idx]), + sizeof(vq->used->ring[used_idx])); } rte_prefetch0(&vq->desc[desc_indexes[0]]); for (i = 0; i < count; i++) { uint16_t desc_idx = desc_indexes[i]; - uint16_t used_idx = (res_start_idx + i) & (vq->size - 1); - uint32_t copied; int err; - err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx, &copied); - - vq->used->ring[used_idx].id = desc_idx; - if (unlikely(err)) - vq->used->ring[used_idx].len = vq->vhost_hlen; - else - vq->used->ring[used_idx].len = copied + vq->vhost_hlen; - vhost_log_used_vring(dev, vq, - offsetof(struct vring_used, ring[used_idx]), - sizeof(vq->used->ring[used_idx])); + err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx); + if (unlikely(err)) { + used_idx = (start_idx + i) & (vq->size - 1); + vq->used->ring[used_idx].len = dev->vhost_hlen; + vhost_log_used_vring(dev, vq, + offsetof(struct vring_used, ring[used_idx]), + sizeof(vq->used->ring[used_idx])); + } if (i + 1 < count) rte_prefetch0(&vq->desc[desc_indexes[i+1]]); @@ -315,12 +276,8 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, rte_smp_wmb(); - /* Wait until it's our turn to add our buffer to the used ring. 
*/ - while (unlikely(vq->last_used_idx != res_start_idx)) - rte_pause(); - *(volatile uint16_t *)&vq->used->idx += count; - vq->last_used_idx = res_end_idx; + vq->last_used_idx += count; vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx), sizeof(vq->used->idx)); @@ -337,7 +294,8 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, static inline int fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx, - uint32_t *allocated, uint32_t *vec_idx) + uint32_t *allocated, uint32_t *vec_idx, + struct buf_vector *buf_vec) { uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)]; uint32_t vec_id = *vec_idx; @@ -348,9 +306,9 @@ fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx, return -1; len += vq->desc[idx].len; - vq->buf_vec[vec_id].buf_addr = vq->desc[idx].addr; - vq->buf_vec[vec_id].buf_len = vq->desc[idx].len; - vq->buf_vec[vec_id].desc_idx = idx; + buf_vec[vec_id].buf_addr = vq->desc[idx].addr; + buf_vec[vec_id].buf_len = vq->desc[idx].len; + buf_vec[vec_id].desc_idx = idx; vec_id++; if ((vq->desc[idx].flags & VRING_DESC_F_NEXT) == 0) @@ -366,39 +324,30 @@ fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx, } /* - * As many data cores may want to access available buffers concurrently, - * they need to be reserved. 
- * * Returns -1 on fail, 0 on success */ static inline int reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size, - uint16_t *start, uint16_t *end) + uint16_t *end, struct buf_vector *buf_vec) { - uint16_t res_start_idx; - uint16_t res_cur_idx; + uint16_t cur_idx; uint16_t avail_idx; - uint32_t allocated; - uint32_t vec_idx; - uint16_t tries; + uint32_t allocated = 0; + uint32_t vec_idx = 0; + uint16_t tries = 0; -again: - res_start_idx = vq->last_used_idx_res; - res_cur_idx = res_start_idx; + cur_idx = vq->last_used_idx; - allocated = 0; - vec_idx = 0; - tries = 0; while (1) { avail_idx = *((volatile uint16_t *)&vq->avail->idx); - if (unlikely(res_cur_idx == avail_idx)) + if (unlikely(cur_idx == avail_idx)) return -1; - if (unlikely(fill_vec_buf(vq, res_cur_idx, &allocated, - &vec_idx) < 0)) + if (unlikely(fill_vec_buf(vq, cur_idx, &allocated, + &vec_idx, buf_vec) < 0)) return -1; - res_cur_idx++; + cur_idx++; tries++; if (allocated >= size) @@ -413,27 +362,19 @@ again: return -1; } - /* - * update vq->last_used_idx_res atomically. - * retry again if failed. 
- */ - if (rte_atomic16_cmpset(&vq->last_used_idx_res, - res_start_idx, res_cur_idx) == 0) - goto again; - - *start = res_start_idx; - *end = res_cur_idx; + *end = cur_idx; return 0; } static inline uint32_t __attribute__((always_inline)) copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq, - uint16_t res_start_idx, uint16_t res_end_idx, - struct rte_mbuf *m) + uint16_t end_idx, struct rte_mbuf *m, + struct buf_vector *buf_vec) { struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0}; uint32_t vec_idx = 0; - uint16_t cur_idx = res_start_idx; + uint16_t start_idx = vq->last_used_idx; + uint16_t cur_idx = start_idx; uint64_t desc_addr; uint32_t mbuf_offset, mbuf_avail; uint32_t desc_offset, desc_avail; @@ -443,34 +384,33 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq, if (unlikely(m == NULL)) return 0; - LOG_DEBUG(VHOST_DATA, - "(%"PRIu64") Current Index %d| End Index %d\n", - dev->device_fh, cur_idx, res_end_idx); + LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", + dev->vid, cur_idx, end_idx); - if (vq->buf_vec[vec_idx].buf_len < vq->vhost_hlen) + if (buf_vec[vec_idx].buf_len < dev->vhost_hlen) return -1; - desc_addr = gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr); + desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr); rte_prefetch0((void *)(uintptr_t)desc_addr); - virtio_hdr.num_buffers = res_end_idx - res_start_idx; - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n", - dev->device_fh, virtio_hdr.num_buffers); + virtio_hdr.num_buffers = end_idx - start_idx; + LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n", + dev->vid, virtio_hdr.num_buffers); virtio_enqueue_offload(m, &virtio_hdr.hdr); - copy_virtio_net_hdr(vq, desc_addr, virtio_hdr); - vhost_log_write(dev, vq->buf_vec[vec_idx].buf_addr, vq->vhost_hlen); - PRINT_PACKET(dev, (uintptr_t)desc_addr, vq->vhost_hlen, 0); + copy_virtio_net_hdr(dev, desc_addr, virtio_hdr); + vhost_log_write(dev, 
buf_vec[vec_idx].buf_addr, dev->vhost_hlen); + PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0); - desc_avail = vq->buf_vec[vec_idx].buf_len - vq->vhost_hlen; - desc_offset = vq->vhost_hlen; + desc_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen; + desc_offset = dev->vhost_hlen; mbuf_avail = rte_pktmbuf_data_len(m); mbuf_offset = 0; while (mbuf_avail != 0 || m->next != NULL) { /* done with current desc buf, get the next one */ if (desc_avail == 0) { - desc_idx = vq->buf_vec[vec_idx].desc_idx; + desc_idx = buf_vec[vec_idx].desc_idx; if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) { /* Update used ring with desc information */ @@ -484,12 +424,12 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq, } vec_idx++; - desc_addr = gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr); + desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr); /* Prefetch buffer address. */ rte_prefetch0((void *)(uintptr_t)desc_addr); desc_offset = 0; - desc_avail = vq->buf_vec[vec_idx].buf_len; + desc_avail = buf_vec[vec_idx].buf_len; } /* done with current mbuf, get the next one */ @@ -504,7 +444,7 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq, rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)), rte_pktmbuf_mtod_offset(m, void *, mbuf_offset), cpy_len); - vhost_log_write(dev, vq->buf_vec[vec_idx].buf_addr + desc_offset, + vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset, cpy_len); PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset), cpy_len, 0); @@ -516,13 +456,13 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq, } used_idx = cur_idx & (vq->size - 1); - vq->used->ring[used_idx].id = vq->buf_vec[vec_idx].desc_idx; + vq->used->ring[used_idx].id = buf_vec[vec_idx].desc_idx; vq->used->ring[used_idx].len = desc_offset; vhost_log_used_vring(dev, vq, offsetof(struct vring_used, ring[used_idx]), sizeof(vq->used->ring[used_idx])); - return res_end_idx - res_start_idx; 
+ return end_idx - start_idx; } static inline uint32_t __attribute__((always_inline)) @@ -531,14 +471,13 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id, { struct vhost_virtqueue *vq; uint32_t pkt_idx = 0, nr_used = 0; - uint16_t start, end; + uint16_t end; + struct buf_vector buf_vec[BUF_VECTOR_MAX]; - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_rx()\n", - dev->device_fh); + LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) { - RTE_LOG(ERR, VHOST_DATA, - "%s (%"PRIu64"): virtqueue idx:%d invalid.\n", - __func__, dev->device_fh, queue_id); + RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n", + dev->vid, __func__, queue_id); return 0; } @@ -551,27 +490,20 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id, return 0; for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { - uint32_t pkt_len = pkts[pkt_idx]->pkt_len + vq->vhost_hlen; + uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen; if (unlikely(reserve_avail_buf_mergeable(vq, pkt_len, - &start, &end) < 0)) { + &end, buf_vec) < 0)) { LOG_DEBUG(VHOST_DATA, - "(%" PRIu64 ") Failed to get enough desc from vring\n", - dev->device_fh); + "(%d) failed to get enough desc from vring\n", + dev->vid); break; } - nr_used = copy_mbuf_to_desc_mergeable(dev, vq, start, end, - pkts[pkt_idx]); + nr_used = copy_mbuf_to_desc_mergeable(dev, vq, end, + pkts[pkt_idx], buf_vec); rte_smp_wmb(); - /* - * Wait until it's our turn to add our buffer - * to the used ring. 
- */ - while (unlikely(vq->last_used_idx != start)) - rte_pause(); - *(volatile uint16_t *)&vq->used->idx += nr_used; vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx), sizeof(vq->used->idx)); @@ -592,9 +524,14 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id, } uint16_t -rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, +rte_vhost_enqueue_burst(int vid, uint16_t queue_id, struct rte_mbuf **pkts, uint16_t count) { + struct virtio_net *dev = get_device(vid); + + if (!dev) + return 0; + if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) return virtio_dev_merge_rx(dev, queue_id, pkts, count); else @@ -747,22 +684,53 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, uint32_t nr_desc = 1; desc = &vq->desc[desc_idx]; - if (unlikely(desc->len < vq->vhost_hlen)) + if (unlikely(desc->len < dev->vhost_hlen)) return -1; desc_addr = gpa_to_vva(dev, desc->addr); - rte_prefetch0((void *)(uintptr_t)desc_addr); - - /* Retrieve virtio net header */ hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr); - desc_avail = desc->len - vq->vhost_hlen; - desc_offset = vq->vhost_hlen; + rte_prefetch0(hdr); + + /* + * A virtio driver normally uses at least 2 desc buffers + * for Tx: the first for storing the header, and others + * for storing the data. 
+ */ + if (likely((desc->len == dev->vhost_hlen) && + (desc->flags & VRING_DESC_F_NEXT) != 0)) { + desc = &vq->desc[desc->next]; + + desc_addr = gpa_to_vva(dev, desc->addr); + rte_prefetch0((void *)(uintptr_t)desc_addr); + + desc_offset = 0; + desc_avail = desc->len; + nr_desc += 1; + + PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0); + } else { + desc_avail = desc->len - dev->vhost_hlen; + desc_offset = dev->vhost_hlen; + } mbuf_offset = 0; mbuf_avail = m->buf_len - RTE_PKTMBUF_HEADROOM; - while (desc_avail != 0 || (desc->flags & VRING_DESC_F_NEXT) != 0) { + while (1) { + cpy_len = RTE_MIN(desc_avail, mbuf_avail); + rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, mbuf_offset), + (void *)((uintptr_t)(desc_addr + desc_offset)), + cpy_len); + + mbuf_avail -= cpy_len; + mbuf_offset += cpy_len; + desc_avail -= cpy_len; + desc_offset += cpy_len; + /* This desc reaches to its end, get the next one */ if (desc_avail == 0) { + if ((desc->flags & VRING_DESC_F_NEXT) == 0) + break; + if (unlikely(desc->next >= vq->size || ++nr_desc >= vq->size)) return -1; @@ -798,16 +766,6 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, mbuf_offset = 0; mbuf_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM; } - - cpy_len = RTE_MIN(desc_avail, mbuf_avail); - rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, mbuf_offset), - (void *)((uintptr_t)(desc_addr + desc_offset)), - cpy_len); - - mbuf_avail -= cpy_len; - mbuf_offset += cpy_len; - desc_avail -= cpy_len; - desc_offset += cpy_len; } prev->data_len = mbuf_offset; @@ -820,9 +778,10 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, } uint16_t -rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, +rte_vhost_dequeue_burst(int vid, uint16_t queue_id, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) { + struct virtio_net *dev; struct rte_mbuf *rarp_mbuf = NULL; struct vhost_virtqueue *vq; uint32_t desc_indexes[MAX_PKT_BURST]; @@ -831,10 +790,13 @@ 
rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, uint16_t free_entries; uint16_t avail_idx; + dev = get_device(vid); + if (!dev) + return 0; + if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->virt_qp_nb))) { - RTE_LOG(ERR, VHOST_DATA, - "%s (%"PRIu64"): virtqueue idx:%d invalid.\n", - __func__, dev->device_fh, queue_id); + RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n", + dev->vid, __func__, queue_id); return 0; } @@ -870,35 +832,37 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, if (free_entries == 0) goto out; - LOG_DEBUG(VHOST_DATA, "%s (%"PRIu64")\n", __func__, dev->device_fh); + LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); /* Prefetch available ring to retrieve head indexes. */ used_idx = vq->last_used_idx & (vq->size - 1); rte_prefetch0(&vq->avail->ring[used_idx]); + rte_prefetch0(&vq->used->ring[used_idx]); count = RTE_MIN(count, MAX_PKT_BURST); count = RTE_MIN(count, free_entries); - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") about to dequeue %u buffers\n", - dev->device_fh, count); + LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n", + dev->vid, count); /* Retrieve all of the head indexes first to avoid caching issues. */ for (i = 0; i < count; i++) { - desc_indexes[i] = vq->avail->ring[(vq->last_used_idx + i) & - (vq->size - 1)]; + used_idx = (vq->last_used_idx + i) & (vq->size - 1); + desc_indexes[i] = vq->avail->ring[used_idx]; + + vq->used->ring[used_idx].id = desc_indexes[i]; + vq->used->ring[used_idx].len = 0; + vhost_log_used_vring(dev, vq, + offsetof(struct vring_used, ring[used_idx]), + sizeof(vq->used->ring[used_idx])); } /* Prefetch descriptor index. 
*/ rte_prefetch0(&vq->desc[desc_indexes[0]]); - rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]); - for (i = 0; i < count; i++) { int err; - if (likely(i + 1 < count)) { + if (likely(i + 1 < count)) rte_prefetch0(&vq->desc[desc_indexes[i + 1]]); - rte_prefetch0(&vq->used->ring[(used_idx + 1) & - (vq->size - 1)]); - } pkts[i] = rte_pktmbuf_alloc(mbuf_pool); if (unlikely(pkts[i] == NULL)) { @@ -912,18 +876,12 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, rte_pktmbuf_free(pkts[i]); break; } - - used_idx = vq->last_used_idx++ & (vq->size - 1); - vq->used->ring[used_idx].id = desc_indexes[i]; - vq->used->ring[used_idx].len = 0; - vhost_log_used_vring(dev, vq, - offsetof(struct vring_used, ring[used_idx]), - sizeof(vq->used->ring[used_idx])); } rte_smp_wmb(); rte_smp_rmb(); vq->used->idx += i; + vq->last_used_idx += i; vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx), sizeof(vq->used->idx)); diff --git a/lib/librte_vhost/vhost_user/vhost-net-user.c b/lib/librte_vhost/vhost_user/vhost-net-user.c index df2bd648..94f1b923 100644 --- a/lib/librte_vhost/vhost_user/vhost-net-user.c +++ b/lib/librte_vhost/vhost_user/vhost-net-user.c @@ -33,6 +33,7 @@ #include <stdint.h> #include <stdio.h> +#include <stdbool.h> #include <limits.h> #include <stdlib.h> #include <unistd.h> @@ -40,6 +41,7 @@ #include <sys/types.h> #include <sys/socket.h> #include <sys/un.h> +#include <sys/queue.h> #include <errno.h> #include <pthread.h> @@ -51,32 +53,44 @@ #include "vhost-net.h" #include "virtio-net-user.h" -#define MAX_VIRTIO_BACKLOG 128 - -static void vserver_new_vq_conn(int fd, void *data, int *remove); -static void vserver_message_handler(int fd, void *dat, int *remove); +/* + * Every time rte_vhost_driver_register() is invoked, an associated + * vhost_user_socket struct will be created. 
+ */ +struct vhost_user_socket { + char *path; + int listenfd; + bool is_server; + bool reconnect; +}; -struct connfd_ctx { - struct vhost_server *vserver; - uint32_t fh; +struct vhost_user_connection { + struct vhost_user_socket *vsocket; + int vid; }; -#define MAX_VHOST_SERVER 1024 -struct _vhost_server { - struct vhost_server *server[MAX_VHOST_SERVER]; +#define MAX_VHOST_SOCKET 1024 +struct vhost_user { + struct vhost_user_socket *vsockets[MAX_VHOST_SOCKET]; struct fdset fdset; - int vserver_cnt; - pthread_mutex_t server_mutex; + int vsocket_cnt; + pthread_mutex_t mutex; }; -static struct _vhost_server g_vhost_server = { +#define MAX_VIRTIO_BACKLOG 128 + +static void vhost_user_server_new_connection(int fd, void *data, int *remove); +static void vhost_user_msg_handler(int fd, void *dat, int *remove); +static int vhost_user_create_client(struct vhost_user_socket *vsocket); + +static struct vhost_user vhost_user = { .fdset = { .fd = { [0 ... MAX_FDS - 1] = {-1, NULL, NULL, NULL, 0} }, .fd_mutex = PTHREAD_MUTEX_INITIALIZER, .num = 0 }, - .vserver_cnt = 0, - .server_mutex = PTHREAD_MUTEX_INITIALIZER, + .vsocket_cnt = 0, + .mutex = PTHREAD_MUTEX_INITIALIZER, }; static const char *vhost_message_str[VHOST_USER_MAX] = { @@ -102,48 +116,6 @@ static const char *vhost_message_str[VHOST_USER_MAX] = { [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP", }; -/** - * Create a unix domain socket, bind to path and listen for connection. 
- * @return - * socket fd or -1 on failure - */ -static int -uds_socket(const char *path) -{ - struct sockaddr_un un; - int sockfd; - int ret; - - if (path == NULL) - return -1; - - sockfd = socket(AF_UNIX, SOCK_STREAM, 0); - if (sockfd < 0) - return -1; - RTE_LOG(INFO, VHOST_CONFIG, "socket created, fd:%d\n", sockfd); - - memset(&un, 0, sizeof(un)); - un.sun_family = AF_UNIX; - snprintf(un.sun_path, sizeof(un.sun_path), "%s", path); - ret = bind(sockfd, (struct sockaddr *)&un, sizeof(un)); - if (ret == -1) { - RTE_LOG(ERR, VHOST_CONFIG, "fail to bind fd:%d, remove file:%s and try again.\n", - sockfd, path); - goto err; - } - RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path); - - ret = listen(sockfd, MAX_VIRTIO_BACKLOG); - if (ret == -1) - goto err; - - return sockfd; - -err: - close(sockfd); - return -1; -} - /* return bytes# of read on success or negative val on failure. */ static int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) @@ -278,62 +250,66 @@ send_vhost_message(int sockfd, struct VhostUserMsg *msg) return ret; } -/* call back when there is new virtio connection. 
*/ + static void -vserver_new_vq_conn(int fd, void *dat, __rte_unused int *remove) +vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket) { - struct vhost_server *vserver = (struct vhost_server *)dat; - int conn_fd; - struct connfd_ctx *ctx; - int fh; - struct vhost_device_ctx vdev_ctx = { (pid_t)0, 0 }; - unsigned int size; - - conn_fd = accept(fd, NULL, NULL); - RTE_LOG(INFO, VHOST_CONFIG, - "new virtio connection is %d\n", conn_fd); - if (conn_fd < 0) - return; + int vid; + size_t size; + struct vhost_user_connection *conn; - ctx = calloc(1, sizeof(*ctx)); - if (ctx == NULL) { - close(conn_fd); + conn = malloc(sizeof(*conn)); + if (conn == NULL) { + close(fd); return; } - fh = vhost_new_device(vdev_ctx); - if (fh == -1) { - free(ctx); - close(conn_fd); + vid = vhost_new_device(); + if (vid == -1) { + close(fd); + free(conn); return; } - vdev_ctx.fh = fh; - size = strnlen(vserver->path, PATH_MAX); - vhost_set_ifname(vdev_ctx, vserver->path, - size); + size = strnlen(vsocket->path, PATH_MAX); + vhost_set_ifname(vid, vsocket->path, size); + + RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid); - RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", fh); + conn->vsocket = vsocket; + conn->vid = vid; + fdset_add(&vhost_user.fdset, fd, vhost_user_msg_handler, NULL, conn); +} + +/* call back when there is new vhost-user connection from client */ +static void +vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused) +{ + struct vhost_user_socket *vsocket = dat; - ctx->vserver = vserver; - ctx->fh = fh; - fdset_add(&g_vhost_server.fdset, - conn_fd, vserver_message_handler, NULL, ctx); + fd = accept(fd, NULL, NULL); + if (fd < 0) + return; + + RTE_LOG(INFO, VHOST_CONFIG, "new vhost user connection is %d\n", fd); + vhost_user_add_connection(fd, vsocket); } /* callback when there is message on the connfd */ static void -vserver_message_handler(int connfd, void *dat, int *remove) +vhost_user_msg_handler(int connfd, void 
*dat, int *remove) { - struct vhost_device_ctx ctx; - struct connfd_ctx *cfd_ctx = (struct connfd_ctx *)dat; + int vid; + struct vhost_user_connection *conn = dat; struct VhostUserMsg msg; uint64_t features; int ret; - ctx.fh = cfd_ctx->fh; + vid = conn->vid; ret = read_vhost_message(connfd, &msg); if (ret <= 0 || msg.request >= VHOST_USER_MAX) { + struct vhost_user_socket *vsocket = conn->vsocket; + if (ret < 0) RTE_LOG(ERR, VHOST_CONFIG, "vhost read message failed\n"); @@ -346,8 +322,11 @@ vserver_message_handler(int connfd, void *dat, int *remove) close(connfd); *remove = 1; - free(cfd_ctx); - vhost_destroy_device(ctx); + free(conn); + vhost_destroy_device(vid); + + if (vsocket->reconnect) + vhost_user_create_client(vsocket); return; } @@ -356,14 +335,14 @@ vserver_message_handler(int connfd, void *dat, int *remove) vhost_message_str[msg.request]); switch (msg.request) { case VHOST_USER_GET_FEATURES: - ret = vhost_get_features(ctx, &features); + ret = vhost_get_features(vid, &features); msg.payload.u64 = features; msg.size = sizeof(msg.payload.u64); send_vhost_message(connfd, &msg); break; case VHOST_USER_SET_FEATURES: features = msg.payload.u64; - vhost_set_features(ctx, &features); + vhost_set_features(vid, &features); break; case VHOST_USER_GET_PROTOCOL_FEATURES: @@ -372,22 +351,22 @@ vserver_message_handler(int connfd, void *dat, int *remove) send_vhost_message(connfd, &msg); break; case VHOST_USER_SET_PROTOCOL_FEATURES: - user_set_protocol_features(ctx, msg.payload.u64); + user_set_protocol_features(vid, msg.payload.u64); break; case VHOST_USER_SET_OWNER: - vhost_set_owner(ctx); + vhost_set_owner(vid); break; case VHOST_USER_RESET_OWNER: - vhost_reset_owner(ctx); + vhost_reset_owner(vid); break; case VHOST_USER_SET_MEM_TABLE: - user_set_mem_table(ctx, &msg); + user_set_mem_table(vid, &msg); break; case VHOST_USER_SET_LOG_BASE: - user_set_log_base(ctx, &msg); + user_set_log_base(vid, &msg); /* it needs a reply */ msg.size = sizeof(msg.payload.u64); @@ 
-399,26 +378,26 @@ vserver_message_handler(int connfd, void *dat, int *remove) break; case VHOST_USER_SET_VRING_NUM: - vhost_set_vring_num(ctx, &msg.payload.state); + vhost_set_vring_num(vid, &msg.payload.state); break; case VHOST_USER_SET_VRING_ADDR: - vhost_set_vring_addr(ctx, &msg.payload.addr); + vhost_set_vring_addr(vid, &msg.payload.addr); break; case VHOST_USER_SET_VRING_BASE: - vhost_set_vring_base(ctx, &msg.payload.state); + vhost_set_vring_base(vid, &msg.payload.state); break; case VHOST_USER_GET_VRING_BASE: - ret = user_get_vring_base(ctx, &msg.payload.state); + ret = user_get_vring_base(vid, &msg.payload.state); msg.size = sizeof(msg.payload.state); send_vhost_message(connfd, &msg); break; case VHOST_USER_SET_VRING_KICK: - user_set_vring_kick(ctx, &msg); + user_set_vring_kick(vid, &msg); break; case VHOST_USER_SET_VRING_CALL: - user_set_vring_call(ctx, &msg); + user_set_vring_call(vid, &msg); break; case VHOST_USER_SET_VRING_ERR: @@ -434,10 +413,10 @@ vserver_message_handler(int connfd, void *dat, int *remove) break; case VHOST_USER_SET_VRING_ENABLE: - user_set_vring_enable(ctx, &msg.payload.state); + user_set_vring_enable(vid, &msg.payload.state); break; case VHOST_USER_SEND_RARP: - user_send_rarp(ctx, &msg); + user_send_rarp(vid, &msg); break; default: @@ -446,50 +425,222 @@ vserver_message_handler(int connfd, void *dat, int *remove) } } -/** - * Creates and initialise the vhost server. - */ -int -rte_vhost_driver_register(const char *path) +static int +create_unix_socket(const char *path, struct sockaddr_un *un, bool is_server) { - struct vhost_server *vserver; + int fd; - pthread_mutex_lock(&g_vhost_server.server_mutex); + fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) + return -1; + RTE_LOG(INFO, VHOST_CONFIG, "vhost-user %s: socket created, fd: %d\n", + is_server ? 
"server" : "client", fd); - if (g_vhost_server.vserver_cnt == MAX_VHOST_SERVER) { - RTE_LOG(ERR, VHOST_CONFIG, - "error: the number of servers reaches maximum\n"); - pthread_mutex_unlock(&g_vhost_server.server_mutex); + memset(un, 0, sizeof(*un)); + un->sun_family = AF_UNIX; + strncpy(un->sun_path, path, sizeof(un->sun_path)); + + return fd; +} + +static int +vhost_user_create_server(struct vhost_user_socket *vsocket) +{ + int fd; + int ret; + struct sockaddr_un un; + const char *path = vsocket->path; + + fd = create_unix_socket(path, &un, vsocket->is_server); + if (fd < 0) return -1; + + ret = bind(fd, (struct sockaddr *)&un, sizeof(un)); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to bind to %s: %s; remove it and try again\n", + path, strerror(errno)); + goto err; } + RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path); - vserver = calloc(sizeof(struct vhost_server), 1); - if (vserver == NULL) { - pthread_mutex_unlock(&g_vhost_server.server_mutex); - return -1; + ret = listen(fd, MAX_VIRTIO_BACKLOG); + if (ret < 0) + goto err; + + vsocket->listenfd = fd; + fdset_add(&vhost_user.fdset, fd, vhost_user_server_new_connection, + NULL, vsocket); + + return 0; + +err: + close(fd); + return -1; +} + +struct vhost_user_reconnect { + struct sockaddr_un un; + int fd; + struct vhost_user_socket *vsocket; + + TAILQ_ENTRY(vhost_user_reconnect) next; +}; + +TAILQ_HEAD(vhost_user_reconnect_tailq_list, vhost_user_reconnect); +struct vhost_user_reconnect_list { + struct vhost_user_reconnect_tailq_list head; + pthread_mutex_t mutex; +}; + +static struct vhost_user_reconnect_list reconn_list; +static pthread_t reconn_tid; + +static void * +vhost_user_client_reconnect(void *arg __rte_unused) +{ + struct vhost_user_reconnect *reconn, *next; + + while (1) { + pthread_mutex_lock(&reconn_list.mutex); + + /* + * An equal implementation of TAILQ_FOREACH_SAFE, + * which does not exist on all platforms. 
+ */ + for (reconn = TAILQ_FIRST(&reconn_list.head); + reconn != NULL; reconn = next) { + next = TAILQ_NEXT(reconn, next); + + if (connect(reconn->fd, (struct sockaddr *)&reconn->un, + sizeof(reconn->un)) < 0) + continue; + + RTE_LOG(INFO, VHOST_CONFIG, + "%s: connected\n", reconn->vsocket->path); + vhost_user_add_connection(reconn->fd, reconn->vsocket); + TAILQ_REMOVE(&reconn_list.head, reconn, next); + free(reconn); + } + + pthread_mutex_unlock(&reconn_list.mutex); + sleep(1); } - vserver->listenfd = uds_socket(path); - if (vserver->listenfd < 0) { - free(vserver); - pthread_mutex_unlock(&g_vhost_server.server_mutex); + return NULL; +} + +static int +vhost_user_reconnect_init(void) +{ + int ret; + + pthread_mutex_init(&reconn_list.mutex, NULL); + TAILQ_INIT(&reconn_list.head); + + ret = pthread_create(&reconn_tid, NULL, + vhost_user_client_reconnect, NULL); + if (ret < 0) + RTE_LOG(ERR, VHOST_CONFIG, "failed to create reconnect thread"); + + return ret; +} + +static int +vhost_user_create_client(struct vhost_user_socket *vsocket) +{ + int fd; + int ret; + struct sockaddr_un un; + const char *path = vsocket->path; + struct vhost_user_reconnect *reconn; + + fd = create_unix_socket(path, &un, vsocket->is_server); + if (fd < 0) return -1; + + ret = connect(fd, (struct sockaddr *)&un, sizeof(un)); + if (ret == 0) { + vhost_user_add_connection(fd, vsocket); + return 0; } - vserver->path = strdup(path); + RTE_LOG(ERR, VHOST_CONFIG, + "failed to connect to %s: %s\n", + path, strerror(errno)); - fdset_add(&g_vhost_server.fdset, vserver->listenfd, - vserver_new_vq_conn, NULL, vserver); + if (!vsocket->reconnect) { + close(fd); + return -1; + } - g_vhost_server.server[g_vhost_server.vserver_cnt++] = vserver; - pthread_mutex_unlock(&g_vhost_server.server_mutex); + RTE_LOG(ERR, VHOST_CONFIG, "%s: reconnecting...\n", path); + reconn = malloc(sizeof(*reconn)); + reconn->un = un; + reconn->fd = fd; + reconn->vsocket = vsocket; + pthread_mutex_lock(&reconn_list.mutex); + 
TAILQ_INSERT_TAIL(&reconn_list.head, reconn, next); + pthread_mutex_unlock(&reconn_list.mutex); return 0; } +/* + * Register a new vhost-user socket; here we could act as server + * (the default case), or client (when RTE_VHOST_USER_CLIENT) flag + * is set. + */ +int +rte_vhost_driver_register(const char *path, uint64_t flags) +{ + int ret = -1; + struct vhost_user_socket *vsocket; + + if (!path) + return -1; + + pthread_mutex_lock(&vhost_user.mutex); + + if (vhost_user.vsocket_cnt == MAX_VHOST_SOCKET) { + RTE_LOG(ERR, VHOST_CONFIG, + "error: the number of vhost sockets reaches maximum\n"); + goto out; + } + + vsocket = malloc(sizeof(struct vhost_user_socket)); + if (!vsocket) + goto out; + memset(vsocket, 0, sizeof(struct vhost_user_socket)); + vsocket->path = strdup(path); + + if ((flags & RTE_VHOST_USER_CLIENT) != 0) { + vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT); + if (vsocket->reconnect && reconn_tid == 0) { + if (vhost_user_reconnect_init() < 0) + goto out; + } + ret = vhost_user_create_client(vsocket); + } else { + vsocket->is_server = true; + ret = vhost_user_create_server(vsocket); + } + if (ret < 0) { + free(vsocket->path); + free(vsocket); + goto out; + } + + vhost_user.vsockets[vhost_user.vsocket_cnt++] = vsocket; + +out: + pthread_mutex_unlock(&vhost_user.mutex); + + return ret; +} /** - * Unregister the specified vhost server + * Unregister the specified vhost socket */ int rte_vhost_driver_unregister(const char *path) @@ -497,28 +648,29 @@ rte_vhost_driver_unregister(const char *path) int i; int count; - pthread_mutex_lock(&g_vhost_server.server_mutex); - - for (i = 0; i < g_vhost_server.vserver_cnt; i++) { - if (!strcmp(g_vhost_server.server[i]->path, path)) { - fdset_del(&g_vhost_server.fdset, - g_vhost_server.server[i]->listenfd); + pthread_mutex_lock(&vhost_user.mutex); - close(g_vhost_server.server[i]->listenfd); - free(g_vhost_server.server[i]->path); - free(g_vhost_server.server[i]); + for (i = 0; i < vhost_user.vsocket_cnt; 
i++) { + if (!strcmp(vhost_user.vsockets[i]->path, path)) { + if (vhost_user.vsockets[i]->is_server) { + fdset_del(&vhost_user.fdset, + vhost_user.vsockets[i]->listenfd); + close(vhost_user.vsockets[i]->listenfd); + unlink(path); + } - unlink(path); + free(vhost_user.vsockets[i]->path); + free(vhost_user.vsockets[i]); - count = --g_vhost_server.vserver_cnt; - g_vhost_server.server[i] = g_vhost_server.server[count]; - g_vhost_server.server[count] = NULL; - pthread_mutex_unlock(&g_vhost_server.server_mutex); + count = --vhost_user.vsocket_cnt; + vhost_user.vsockets[i] = vhost_user.vsockets[count]; + vhost_user.vsockets[count] = NULL; + pthread_mutex_unlock(&vhost_user.mutex); return 0; } } - pthread_mutex_unlock(&g_vhost_server.server_mutex); + pthread_mutex_unlock(&vhost_user.mutex); return -1; } @@ -526,6 +678,6 @@ rte_vhost_driver_unregister(const char *path) int rte_vhost_driver_session_start(void) { - fdset_event_dispatch(&g_vhost_server.fdset); + fdset_event_dispatch(&vhost_user.fdset); return 0; } diff --git a/lib/librte_vhost/vhost_user/vhost-net-user.h b/lib/librte_vhost/vhost_user/vhost-net-user.h index e3bb4138..f5332396 100644 --- a/lib/librte_vhost/vhost_user/vhost-net-user.h +++ b/lib/librte_vhost/vhost_user/vhost-net-user.h @@ -38,15 +38,11 @@ #include <linux/vhost.h> #include "rte_virtio_net.h" -#include "fd_man.h" - -struct vhost_server { - char *path; /**< The path the uds is bind to. */ - int listenfd; /**< The listener sockfd. 
*/ -}; /* refer to hw/virtio/vhost-user.c */ +#define VHOST_MEMORY_MAX_NREGIONS 8 + typedef enum VhostUserRequest { VHOST_USER_NONE = 0, VHOST_USER_GET_FEATURES = 1, diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.c b/lib/librte_vhost/vhost_user/virtio-net-user.c index f5248bc4..e7c43479 100644 --- a/lib/librte_vhost/vhost_user/virtio-net-user.c +++ b/lib/librte_vhost/vhost_user/virtio-net-user.c @@ -43,7 +43,6 @@ #include <rte_common.h> #include <rte_log.h> -#include "virtio-net.h" #include "virtio-net-user.h" #include "vhost-net-user.h" #include "vhost-net.h" @@ -64,9 +63,10 @@ static uint64_t get_blk_size(int fd) { struct stat stat; + int ret; - fstat(fd, &stat); - return (uint64_t)stat.st_blksize; + ret = fstat(fd, &stat); + return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize; } static void @@ -96,10 +96,14 @@ vhost_backend_cleanup(struct virtio_net *dev) free(dev->mem); dev->mem = NULL; } + if (dev->log_addr) { + munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); + dev->log_addr = 0; + } } int -user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg) +user_set_mem_table(int vid, struct VhostUserMsg *pmsg) { struct VhostUserMemory memory = pmsg->payload.memory; struct virtio_memory_regions *pregion; @@ -110,13 +114,15 @@ user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg) uint64_t alignment; /* unmap old memory regions one by one*/ - dev = get_device(ctx); + dev = get_device(vid); if (dev == NULL) return -1; /* Remove from the data plane. 
*/ - if (dev->flags & VIRTIO_DEV_RUNNING) - notify_ops->destroy_device(dev); + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + notify_ops->destroy_device(vid); + } if (dev->mem) { free_mem_region(dev); @@ -130,8 +136,8 @@ user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg) sizeof(struct orig_region_map) * memory.nregions); if (dev->mem == NULL) { RTE_LOG(ERR, VHOST_CONFIG, - "(%"PRIu64") Failed to allocate memory for dev->mem\n", - dev->device_fh); + "(%d) failed to allocate memory for dev->mem\n", + dev->vid); return -1; } dev->mem->nregions = memory.nregions; @@ -162,6 +168,11 @@ user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg) * aligned. */ alignment = get_blk_size(pmsg->fds[idx]); + if (alignment == (uint64_t)-1) { + RTE_LOG(ERR, VHOST_CONFIG, + "couldn't get hugepage size through fstat\n"); + goto err_mmap; + } mapped_size = RTE_ALIGN_CEIL(mapped_size, alignment); mapped_address = (uint64_t)(uintptr_t)mmap(NULL, @@ -252,7 +263,7 @@ virtio_is_ready(struct virtio_net *dev) } void -user_set_vring_call(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg) +user_set_vring_call(int vid, struct VhostUserMsg *pmsg) { struct vhost_vring_file file; @@ -263,7 +274,7 @@ user_set_vring_call(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg) file.fd = pmsg->fds[0]; RTE_LOG(INFO, VHOST_CONFIG, "vring call idx:%d file:%d\n", file.index, file.fd); - vhost_set_vring_call(ctx, &file); + vhost_set_vring_call(vid, &file); } @@ -272,10 +283,13 @@ user_set_vring_call(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg) * device is ready for packet processing. 
*/ void -user_set_vring_kick(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg) +user_set_vring_kick(int vid, struct VhostUserMsg *pmsg) { struct vhost_vring_file file; - struct virtio_net *dev = get_device(ctx); + struct virtio_net *dev = get_device(vid); + + if (!dev) + return; file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) @@ -284,30 +298,32 @@ user_set_vring_kick(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg) file.fd = pmsg->fds[0]; RTE_LOG(INFO, VHOST_CONFIG, "vring kick idx:%d file:%d\n", file.index, file.fd); - vhost_set_vring_kick(ctx, &file); + vhost_set_vring_kick(vid, &file); - if (virtio_is_ready(dev) && - !(dev->flags & VIRTIO_DEV_RUNNING)) - notify_ops->new_device(dev); + if (virtio_is_ready(dev) && !(dev->flags & VIRTIO_DEV_RUNNING)) { + if (notify_ops->new_device(vid) == 0) + dev->flags |= VIRTIO_DEV_RUNNING; + } } /* * when virtio is stopped, qemu will send us the GET_VRING_BASE message. */ int -user_get_vring_base(struct vhost_device_ctx ctx, - struct vhost_vring_state *state) +user_get_vring_base(int vid, struct vhost_vring_state *state) { - struct virtio_net *dev = get_device(ctx); + struct virtio_net *dev = get_device(vid); if (dev == NULL) return -1; /* We have to stop the queue (virtio) if it is running. */ - if (dev->flags & VIRTIO_DEV_RUNNING) - notify_ops->destroy_device(dev); + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + notify_ops->destroy_device(vid); + } /* Here we are safe to get the last used index */ - vhost_get_vring_base(ctx, state->index, state); + vhost_get_vring_base(vid, state->index, state); RTE_LOG(INFO, VHOST_CONFIG, "vring base idx:%d file:%d\n", state->index, state->num); @@ -329,19 +345,21 @@ user_get_vring_base(struct vhost_device_ctx ctx, * enable the virtio queue pair. 
*/ int -user_set_vring_enable(struct vhost_device_ctx ctx, - struct vhost_vring_state *state) +user_set_vring_enable(int vid, struct vhost_vring_state *state) { - struct virtio_net *dev = get_device(ctx); + struct virtio_net *dev; int enable = (int)state->num; + dev = get_device(vid); + if (dev == NULL) + return -1; + RTE_LOG(INFO, VHOST_CONFIG, "set queue enable: %d to qp idx: %d\n", enable, state->index); - if (notify_ops->vring_state_changed) { - notify_ops->vring_state_changed(dev, state->index, enable); - } + if (notify_ops->vring_state_changed) + notify_ops->vring_state_changed(vid, state->index, enable); dev->virtqueue[state->index]->enabled = enable; @@ -349,12 +367,11 @@ user_set_vring_enable(struct vhost_device_ctx ctx, } void -user_set_protocol_features(struct vhost_device_ctx ctx, - uint64_t protocol_features) +user_set_protocol_features(int vid, uint64_t protocol_features) { struct virtio_net *dev; - dev = get_device(ctx); + dev = get_device(vid); if (dev == NULL || protocol_features & ~VHOST_USER_PROTOCOL_FEATURES) return; @@ -362,15 +379,14 @@ user_set_protocol_features(struct vhost_device_ctx ctx, } int -user_set_log_base(struct vhost_device_ctx ctx, - struct VhostUserMsg *msg) +user_set_log_base(int vid, struct VhostUserMsg *msg) { struct virtio_net *dev; int fd = msg->fds[0]; uint64_t size, off; void *addr; - dev = get_device(ctx); + dev = get_device(vid); if (!dev) return -1; @@ -397,13 +413,21 @@ user_set_log_base(struct vhost_device_ctx ctx, * fail when offset is not page size aligned. */ addr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + close(fd); if (addr == MAP_FAILED) { RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n"); return -1; } - /* TODO: unmap on stop */ - dev->log_base = (uint64_t)(uintptr_t)addr + off; + /* + * Free previously mapped log memory on occasionally + * multiple VHOST_USER_SET_LOG_BASE. 
+ */ + if (dev->log_addr) { + munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); + } + dev->log_addr = (uint64_t)(uintptr_t)addr; + dev->log_base = dev->log_addr + off; dev->log_size = size; return 0; @@ -418,12 +442,12 @@ user_set_log_base(struct vhost_device_ctx ctx, * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it. */ int -user_send_rarp(struct vhost_device_ctx ctx, struct VhostUserMsg *msg) +user_send_rarp(int vid, struct VhostUserMsg *msg) { struct virtio_net *dev; uint8_t *mac = (uint8_t *)&msg->payload.u64; - dev = get_device(ctx); + dev = get_device(vid); if (!dev) return -1; diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.h b/lib/librte_vhost/vhost_user/virtio-net-user.h index cefec162..e1b967b8 100644 --- a/lib/librte_vhost/vhost_user/virtio-net-user.h +++ b/lib/librte_vhost/vhost_user/virtio-net-user.h @@ -45,20 +45,18 @@ (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\ (1ULL << VHOST_USER_PROTOCOL_F_RARP)) -int user_set_mem_table(struct vhost_device_ctx, struct VhostUserMsg *); +int user_set_mem_table(int, struct VhostUserMsg *); -void user_set_vring_call(struct vhost_device_ctx, struct VhostUserMsg *); +void user_set_vring_call(int, struct VhostUserMsg *); -void user_set_vring_kick(struct vhost_device_ctx, struct VhostUserMsg *); +void user_set_vring_kick(int, struct VhostUserMsg *); -void user_set_protocol_features(struct vhost_device_ctx ctx, - uint64_t protocol_features); -int user_set_log_base(struct vhost_device_ctx ctx, struct VhostUserMsg *); -int user_send_rarp(struct vhost_device_ctx ctx, struct VhostUserMsg *); +void user_set_protocol_features(int vid, uint64_t protocol_features); +int user_set_log_base(int vid, struct VhostUserMsg *); +int user_send_rarp(int vid, struct VhostUserMsg *); -int user_get_vring_base(struct vhost_device_ctx, struct vhost_vring_state *); +int user_get_vring_base(int, struct vhost_vring_state *); -int user_set_vring_enable(struct vhost_device_ctx ctx, - struct vhost_vring_state 
*state); +int user_set_vring_enable(int vid, struct vhost_vring_state *state); #endif diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c index d870ad97..1785695b 100644 --- a/lib/librte_vhost/virtio-net.c +++ b/lib/librte_vhost/virtio-net.c @@ -53,7 +53,6 @@ #include <rte_virtio_net.h> #include "vhost-net.h" -#include "virtio-net.h" #define MAX_VHOST_DEVICE 1024 static struct virtio_net *vhost_devices[MAX_VHOST_DEVICE]; @@ -108,15 +107,14 @@ qva_to_vva(struct virtio_net *dev, uint64_t qemu_va) return vhost_va; } - struct virtio_net * -get_device(struct vhost_device_ctx ctx) +get_device(int vid) { - struct virtio_net *dev = vhost_devices[ctx.fh]; + struct virtio_net *dev = vhost_devices[vid]; if (unlikely(!dev)) { RTE_LOG(ERR, VHOST_CONFIG, - "(%"PRIu64") device not found.\n", ctx.fh); + "(%d) device not found.\n", vid); } return dev; @@ -233,7 +231,7 @@ alloc_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx) /* * Reset some variables in device structure, while keeping few - * others untouched, such as device_fh, ifname, virt_qp_nb: they + * others untouched, such as vid, ifname, virt_qp_nb: they * should be same unless the device is removed. */ static void @@ -255,7 +253,7 @@ reset_device(struct virtio_net *dev) * list. */ int -vhost_new_device(struct vhost_device_ctx ctx) +vhost_new_device(void) { struct virtio_net *dev; int i; @@ -263,8 +261,7 @@ vhost_new_device(struct vhost_device_ctx ctx) dev = rte_zmalloc(NULL, sizeof(struct virtio_net), 0); if (dev == NULL) { RTE_LOG(ERR, VHOST_CONFIG, - "(%"PRIu64") Failed to allocate memory for dev.\n", - ctx.fh); + "Failed to allocate memory for new dev.\n"); return -1; } @@ -279,7 +276,7 @@ vhost_new_device(struct vhost_device_ctx ctx) } vhost_devices[i] = dev; - dev->device_fh = i; + dev->vid = i; return i; } @@ -289,30 +286,31 @@ vhost_new_device(struct vhost_device_ctx ctx) * cleanup the device and remove it from device configuration linked list. 
*/ void -vhost_destroy_device(struct vhost_device_ctx ctx) +vhost_destroy_device(int vid) { - struct virtio_net *dev = get_device(ctx); + struct virtio_net *dev = get_device(vid); if (dev == NULL) return; - if (dev->flags & VIRTIO_DEV_RUNNING) - notify_ops->destroy_device(dev); + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + notify_ops->destroy_device(vid); + } cleanup_device(dev, 1); free_device(dev); - vhost_devices[ctx.fh] = NULL; + vhost_devices[vid] = NULL; } void -vhost_set_ifname(struct vhost_device_ctx ctx, - const char *if_name, unsigned int if_len) +vhost_set_ifname(int vid, const char *if_name, unsigned int if_len) { struct virtio_net *dev; unsigned int len; - dev = get_device(ctx); + dev = get_device(vid); if (dev == NULL) return; @@ -320,6 +318,7 @@ vhost_set_ifname(struct vhost_device_ctx ctx, sizeof(dev->ifname) : if_len; strncpy(dev->ifname, if_name, len); + dev->ifname[sizeof(dev->ifname) - 1] = '\0'; } @@ -329,11 +328,11 @@ vhost_set_ifname(struct vhost_device_ctx ctx, * the device hasn't been initialised. */ int -vhost_set_owner(struct vhost_device_ctx ctx) +vhost_set_owner(int vid) { struct virtio_net *dev; - dev = get_device(ctx); + dev = get_device(vid); if (dev == NULL) return -1; @@ -344,16 +343,18 @@ vhost_set_owner(struct vhost_device_ctx ctx) * Called from CUSE IOCTL: VHOST_RESET_OWNER */ int -vhost_reset_owner(struct vhost_device_ctx ctx) +vhost_reset_owner(int vid) { struct virtio_net *dev; - dev = get_device(ctx); + dev = get_device(vid); if (dev == NULL) return -1; - if (dev->flags & VIRTIO_DEV_RUNNING) - notify_ops->destroy_device(dev); + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + notify_ops->destroy_device(vid); + } cleanup_device(dev, 0); reset_device(dev); @@ -365,11 +366,11 @@ vhost_reset_owner(struct vhost_device_ctx ctx) * The features that we support are requested. 
*/ int -vhost_get_features(struct vhost_device_ctx ctx, uint64_t *pu) +vhost_get_features(int vid, uint64_t *pu) { struct virtio_net *dev; - dev = get_device(ctx); + dev = get_device(vid); if (dev == NULL) return -1; @@ -383,13 +384,11 @@ vhost_get_features(struct vhost_device_ctx ctx, uint64_t *pu) * We receive the negotiated features supported by us and the virtio device. */ int -vhost_set_features(struct vhost_device_ctx ctx, uint64_t *pu) +vhost_set_features(int vid, uint64_t *pu) { struct virtio_net *dev; - uint16_t vhost_hlen; - uint16_t i; - dev = get_device(ctx); + dev = get_device(vid); if (dev == NULL) return -1; if (*pu & ~VHOST_FEATURES) @@ -398,23 +397,16 @@ vhost_set_features(struct vhost_device_ctx ctx, uint64_t *pu) dev->features = *pu; if (dev->features & ((1 << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) { - vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); + dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); } else { - vhost_hlen = sizeof(struct virtio_net_hdr); + dev->vhost_hlen = sizeof(struct virtio_net_hdr); } LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") Mergeable RX buffers %s, virtio 1 %s\n", - dev->device_fh, + "(%d) mergeable RX buffers %s, virtio 1 %s\n", + dev->vid, (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off", (dev->features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off"); - for (i = 0; i < dev->virt_qp_nb; i++) { - uint16_t base_idx = i * VIRTIO_QNUM; - - dev->virtqueue[base_idx + VIRTIO_RXQ]->vhost_hlen = vhost_hlen; - dev->virtqueue[base_idx + VIRTIO_TXQ]->vhost_hlen = vhost_hlen; - } - return 0; } @@ -423,12 +415,11 @@ vhost_set_features(struct vhost_device_ctx ctx, uint64_t *pu) * The virtio device sends us the size of the descriptor ring. 
*/ int -vhost_set_vring_num(struct vhost_device_ctx ctx, - struct vhost_vring_state *state) +vhost_set_vring_num(int vid, struct vhost_vring_state *state) { struct virtio_net *dev; - dev = get_device(ctx); + dev = get_device(vid); if (dev == NULL) return -1; @@ -509,7 +500,7 @@ numa_realloc(struct virtio_net *dev, int index) out: dev->virtqueue[index] = vq; dev->virtqueue[index + 1] = vq + 1; - vhost_devices[dev->device_fh] = dev; + vhost_devices[dev->vid] = dev; return dev; } @@ -527,12 +518,12 @@ numa_realloc(struct virtio_net *dev, int index __rte_unused) * This function then converts these to our address space. */ int -vhost_set_vring_addr(struct vhost_device_ctx ctx, struct vhost_vring_addr *addr) +vhost_set_vring_addr(int vid, struct vhost_vring_addr *addr) { struct virtio_net *dev; struct vhost_virtqueue *vq; - dev = get_device(ctx); + dev = get_device(vid); if ((dev == NULL) || (dev->mem == NULL)) return -1; @@ -544,8 +535,8 @@ vhost_set_vring_addr(struct vhost_device_ctx ctx, struct vhost_vring_addr *addr) addr->desc_user_addr); if (vq->desc == 0) { RTE_LOG(ERR, VHOST_CONFIG, - "(%"PRIu64") Failed to find desc ring address.\n", - dev->device_fh); + "(%d) failed to find desc ring address.\n", + dev->vid); return -1; } @@ -556,8 +547,8 @@ vhost_set_vring_addr(struct vhost_device_ctx ctx, struct vhost_vring_addr *addr) addr->avail_user_addr); if (vq->avail == 0) { RTE_LOG(ERR, VHOST_CONFIG, - "(%"PRIu64") Failed to find avail ring address.\n", - dev->device_fh); + "(%d) failed to find avail ring address.\n", + dev->vid); return -1; } @@ -565,21 +556,29 @@ vhost_set_vring_addr(struct vhost_device_ctx ctx, struct vhost_vring_addr *addr) addr->used_user_addr); if (vq->used == 0) { RTE_LOG(ERR, VHOST_CONFIG, - "(%"PRIu64") Failed to find used ring address.\n", - dev->device_fh); + "(%d) failed to find used ring address.\n", + dev->vid); return -1; } + if (vq->last_used_idx != vq->used->idx) { + RTE_LOG(WARNING, VHOST_CONFIG, + "last_used_idx (%u) and 
vq->used->idx (%u) mismatches; " + "some packets maybe resent for Tx and dropped for Rx\n", + vq->last_used_idx, vq->used->idx); + vq->last_used_idx = vq->used->idx; + } + vq->log_guest_addr = addr->log_guest_addr; - LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") mapped address desc: %p\n", - dev->device_fh, vq->desc); - LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") mapped address avail: %p\n", - dev->device_fh, vq->avail); - LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") mapped address used: %p\n", - dev->device_fh, vq->used); - LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") log_guest_addr: %"PRIx64"\n", - dev->device_fh, vq->log_guest_addr); + LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n", + dev->vid, vq->desc); + LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address avail: %p\n", + dev->vid, vq->avail); + LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address used: %p\n", + dev->vid, vq->used); + LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n", + dev->vid, vq->log_guest_addr); return 0; } @@ -589,18 +588,16 @@ vhost_set_vring_addr(struct vhost_device_ctx ctx, struct vhost_vring_addr *addr) * The virtio device sends us the available ring last used index. */ int -vhost_set_vring_base(struct vhost_device_ctx ctx, - struct vhost_vring_state *state) +vhost_set_vring_base(int vid, struct vhost_vring_state *state) { struct virtio_net *dev; - dev = get_device(ctx); + dev = get_device(vid); if (dev == NULL) return -1; /* State->index refers to the queue index. The txq is 1, rxq is 0. */ dev->virtqueue[state->index]->last_used_idx = state->num; - dev->virtqueue[state->index]->last_used_idx_res = state->num; return 0; } @@ -610,12 +607,12 @@ vhost_set_vring_base(struct vhost_device_ctx ctx, * We send the virtio device our available ring last used index. 
*/ int -vhost_get_vring_base(struct vhost_device_ctx ctx, uint32_t index, +vhost_get_vring_base(int vid, uint32_t index, struct vhost_vring_state *state) { struct virtio_net *dev; - dev = get_device(ctx); + dev = get_device(vid); if (dev == NULL) return -1; @@ -633,13 +630,13 @@ vhost_get_vring_base(struct vhost_device_ctx ctx, uint32_t index, * copied into our process space. */ int -vhost_set_vring_call(struct vhost_device_ctx ctx, struct vhost_vring_file *file) +vhost_set_vring_call(int vid, struct vhost_vring_file *file) { struct virtio_net *dev; struct vhost_virtqueue *vq; uint32_t cur_qp_idx = file->index / VIRTIO_QNUM; - dev = get_device(ctx); + dev = get_device(vid); if (dev == NULL) return -1; @@ -670,12 +667,12 @@ vhost_set_vring_call(struct vhost_device_ctx ctx, struct vhost_vring_file *file) * This fd gets copied into our process space. */ int -vhost_set_vring_kick(struct vhost_device_ctx ctx, struct vhost_vring_file *file) +vhost_set_vring_kick(int vid, struct vhost_vring_file *file) { struct virtio_net *dev; struct vhost_virtqueue *vq; - dev = get_device(ctx); + dev = get_device(vid); if (dev == NULL) return -1; @@ -700,11 +697,11 @@ vhost_set_vring_kick(struct vhost_device_ctx ctx, struct vhost_vring_file *file) * The device will still exist in the device configuration linked list. */ int -vhost_set_backend(struct vhost_device_ctx ctx, struct vhost_vring_file *file) +vhost_set_backend(int vid, struct vhost_vring_file *file) { struct virtio_net *dev; - dev = get_device(ctx); + dev = get_device(vid); if (dev == NULL) return -1; @@ -716,20 +713,98 @@ vhost_set_backend(struct vhost_device_ctx ctx, struct vhost_vring_file *file) * we add the device. 
*/ if (!(dev->flags & VIRTIO_DEV_RUNNING)) { - if (((int)dev->virtqueue[VIRTIO_TXQ]->backend != VIRTIO_DEV_STOPPED) && - ((int)dev->virtqueue[VIRTIO_RXQ]->backend != VIRTIO_DEV_STOPPED)) { - return notify_ops->new_device(dev); + if (dev->virtqueue[VIRTIO_TXQ]->backend != VIRTIO_DEV_STOPPED && + dev->virtqueue[VIRTIO_RXQ]->backend != VIRTIO_DEV_STOPPED) { + if (notify_ops->new_device(vid) < 0) + return -1; + dev->flags |= VIRTIO_DEV_RUNNING; } - /* Otherwise we remove it. */ - } else - if (file->fd == VIRTIO_DEV_STOPPED) - notify_ops->destroy_device(dev); + } else if (file->fd == VIRTIO_DEV_STOPPED) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + notify_ops->destroy_device(vid); + } + + return 0; +} + +int +rte_vhost_get_numa_node(int vid) +{ +#ifdef RTE_LIBRTE_VHOST_NUMA + struct virtio_net *dev = get_device(vid); + int numa_node; + int ret; + + if (dev == NULL) + return -1; + + ret = get_mempolicy(&numa_node, NULL, 0, dev, + MPOL_F_NODE | MPOL_F_ADDR); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to query numa node: %d\n", vid, ret); + return -1; + } + + return numa_node; +#else + RTE_SET_USED(vid); + return -1; +#endif +} + +uint32_t +rte_vhost_get_queue_num(int vid) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return 0; + + return dev->virt_qp_nb; +} + +int +rte_vhost_get_ifname(int vid, char *buf, size_t len) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return -1; + + len = RTE_MIN(len, sizeof(dev->ifname)); + + strncpy(buf, dev->ifname, len); + buf[len - 1] = '\0'; + return 0; } -int rte_vhost_enable_guest_notification(struct virtio_net *dev, - uint16_t queue_id, int enable) +uint16_t +rte_vhost_avail_entries(int vid, uint16_t queue_id) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (!dev) + return 0; + + vq = dev->virtqueue[queue_id]; + if (!vq->enabled) + return 0; + + return *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx; +} + +int 
+rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable) { + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return -1; + if (enable) { RTE_LOG(ERR, VHOST_CONFIG, "guest notification isn't supported.\n"); |