Diffstat (limited to 'lib')
178 files changed, 11110 insertions, 4419 deletions
diff --git a/lib/Makefile b/lib/Makefile index d82462ba..afa604e2 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -4,7 +4,10 @@ include $(RTE_SDK)/mk/rte.vars.mk DIRS-y += librte_compat +DIRS-$(CONFIG_RTE_LIBRTE_KVARGS) += librte_kvargs +DEPDIRS-librte_kvargs := librte_compat DIRS-$(CONFIG_RTE_LIBRTE_EAL) += librte_eal +DEPDIRS-librte_eal := librte_kvargs DIRS-$(CONFIG_RTE_LIBRTE_PCI) += librte_pci DEPDIRS-librte_pci := librte_eal DIRS-$(CONFIG_RTE_LIBRTE_RING) += librte_ring @@ -76,8 +79,6 @@ DEPDIRS-librte_flow_classify := librte_net librte_table librte_acl DIRS-$(CONFIG_RTE_LIBRTE_SCHED) += librte_sched DEPDIRS-librte_sched := librte_eal librte_mempool librte_mbuf librte_net DEPDIRS-librte_sched += librte_timer -DIRS-$(CONFIG_RTE_LIBRTE_KVARGS) += librte_kvargs -DEPDIRS-librte_kvargs := librte_eal DIRS-$(CONFIG_RTE_LIBRTE_DISTRIBUTOR) += librte_distributor DEPDIRS-librte_distributor := librte_eal librte_mbuf librte_ethdev DIRS-$(CONFIG_RTE_LIBRTE_PORT) += librte_port diff --git a/lib/librte_bbdev/rte_bbdev.c b/lib/librte_bbdev/rte_bbdev.c index 28434e08..c4cc18d9 100644 --- a/lib/librte_bbdev/rte_bbdev.c +++ b/lib/librte_bbdev/rte_bbdev.c @@ -1125,9 +1125,7 @@ rte_bbdev_op_type_str(enum rte_bbdev_op_type op_type) return NULL; } -RTE_INIT(rte_bbdev_init_log); -static void -rte_bbdev_init_log(void) +RTE_INIT(rte_bbdev_init_log) { bbdev_logtype = rte_log_register("lib.bbdev"); if (bbdev_logtype >= 0) diff --git a/lib/librte_bitratestats/rte_bitrate.c b/lib/librte_bitratestats/rte_bitrate.c index 964e3c39..c4b28f62 100644 --- a/lib/librte_bitratestats/rte_bitrate.c +++ b/lib/librte_bitratestats/rte_bitrate.c @@ -47,6 +47,9 @@ rte_stats_bitrate_reg(struct rte_stats_bitrates *bitrate_data) }; int return_value; + if (bitrate_data == NULL) + return -EINVAL; + return_value = rte_metrics_reg_names(&names[0], ARRAY_SIZE(names)); if (return_value >= 0) bitrate_data->id_stats_set = return_value; @@ -65,6 +68,9 @@ rte_stats_bitrate_calc(struct rte_stats_bitrates *bitrate_data, const int64_t alpha_percent = 20; uint64_t values[6]; + if (bitrate_data == NULL) + return -EINVAL; + ret_code = rte_eth_stats_get(port_id, ð_stats); if (ret_code != 0) return ret_code; diff --git a/lib/librte_bpf/bpf.c b/lib/librte_bpf/bpf.c index dc6d1099..f590c8c3 100644 --- a/lib/librte_bpf/bpf.c +++ b/lib/librte_bpf/bpf.c @@ -53,10 +53,7 @@ bpf_jit(struct rte_bpf *bpf) return rc; } -RTE_INIT(rte_bpf_init_log); - -static void -rte_bpf_init_log(void) +RTE_INIT(rte_bpf_init_log) { rte_bpf_logtype = rte_log_register("lib.bpf"); if (rte_bpf_logtype >= 0) diff --git a/lib/librte_bpf/bpf_def.h b/lib/librte_bpf/bpf_def.h index 6b69de34..c10f3aec 100644 --- a/lib/librte_bpf/bpf_def.h +++ b/lib/librte_bpf/bpf_def.h @@ -131,6 +131,11 @@ struct ebpf_insn { int32_t imm; }; +/* + * eBPF allows functions with R1-R5 as arguments. 
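The new EBPF_FUNC_MAX_ARGS macro defined just below evaluates to 5: eBPF numbers its registers consecutively, so EBPF_REG_6 - EBPF_REG_1 counts exactly the five argument registers R1-R5. A minimal standalone sketch of that arithmetic; the enum values restate the standard eBPF register numbering and are not quoted from bpf_def.h:

/* standard eBPF register numbering: R0..R10 map to 0..10 */
enum {
	EBPF_REG_0 = 0,	/* return value register */
	EBPF_REG_1,	/* argument 1 */
	EBPF_REG_2,	/* argument 2 */
	EBPF_REG_3,	/* argument 3 */
	EBPF_REG_4,	/* argument 4 */
	EBPF_REG_5,	/* argument 5, the last argument register */
	EBPF_REG_6,	/* first callee-saved register */
};

/* R1-R5 are the only argument registers, hence at most five arguments */
_Static_assert(EBPF_REG_6 - EBPF_REG_1 == 5,
	"EBPF_FUNC_MAX_ARGS must cover exactly R1-R5");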
+ */ +#define EBPF_FUNC_MAX_ARGS (EBPF_REG_6 - EBPF_REG_1) + #ifdef __cplusplus } #endif diff --git a/lib/librte_bpf/bpf_exec.c b/lib/librte_bpf/bpf_exec.c index e373b1f3..6a79139c 100644 --- a/lib/librte_bpf/bpf_exec.c +++ b/lib/librte_bpf/bpf_exec.c @@ -402,7 +402,7 @@ bpf_exec(const struct rte_bpf *bpf, uint64_t reg[EBPF_REG_NUM]) break; /* call instructions */ case (BPF_JMP | EBPF_CALL): - reg[EBPF_REG_0] = bpf->prm.xsym[ins->imm].func( + reg[EBPF_REG_0] = bpf->prm.xsym[ins->imm].func.val( reg[EBPF_REG_1], reg[EBPF_REG_2], reg[EBPF_REG_3], reg[EBPF_REG_4], reg[EBPF_REG_5]); diff --git a/lib/librte_bpf/bpf_impl.h b/lib/librte_bpf/bpf_impl.h index 5d7e65c3..b577e2cb 100644 --- a/lib/librte_bpf/bpf_impl.h +++ b/lib/librte_bpf/bpf_impl.h @@ -34,6 +34,20 @@ extern int rte_bpf_logtype; #define RTE_BPF_LOG(lvl, fmt, args...) \ rte_log(RTE_LOG_## lvl, rte_bpf_logtype, fmt, ##args) +static inline size_t +bpf_size(uint32_t bpf_op_sz) +{ + if (bpf_op_sz == BPF_B) + return sizeof(uint8_t); + else if (bpf_op_sz == BPF_H) + return sizeof(uint16_t); + else if (bpf_op_sz == BPF_W) + return sizeof(uint32_t); + else if (bpf_op_sz == EBPF_DW) + return sizeof(uint64_t); + return 0; +} + #ifdef __cplusplus } #endif diff --git a/lib/librte_bpf/bpf_jit_x86.c b/lib/librte_bpf/bpf_jit_x86.c index 111e028d..68ea389f 100644 --- a/lib/librte_bpf/bpf_jit_x86.c +++ b/lib/librte_bpf/bpf_jit_x86.c @@ -113,20 +113,6 @@ union bpf_jit_imm { uint8_t u8[4]; }; -static size_t -bpf_size(uint32_t bpf_op_sz) -{ - if (bpf_op_sz == BPF_B) - return sizeof(uint8_t); - else if (bpf_op_sz == BPF_H) - return sizeof(uint16_t); - else if (bpf_op_sz == BPF_W) - return sizeof(uint32_t); - else if (bpf_op_sz == EBPF_DW) - return sizeof(uint64_t); - return 0; -} - /* * In many cases for imm8 we can produce shorter code. */ @@ -1294,7 +1280,8 @@ emit(struct bpf_jit_state *st, const struct rte_bpf *bpf) break; /* call instructions */ case (BPF_JMP | EBPF_CALL): - emit_call(st, (uintptr_t)bpf->prm.xsym[ins->imm].func); + emit_call(st, + (uintptr_t)bpf->prm.xsym[ins->imm].func.val); break; /* return instruction */ case (BPF_JMP | EBPF_EXIT): diff --git a/lib/librte_bpf/bpf_load.c b/lib/librte_bpf/bpf_load.c index d1c9abd7..2b84fe72 100644 --- a/lib/librte_bpf/bpf_load.c +++ b/lib/librte_bpf/bpf_load.c @@ -51,17 +51,64 @@ bpf_load(const struct rte_bpf_prm *prm) return bpf; } +/* + * Check that user provided external symbol. 
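With xsym->func now a structure, an external function is registered together with descriptions of its arguments and return value, which is exactly what bpf_check_xsym() below validates. A hedged sketch of such a registration, using the rte_bpf_xsym layout from the rte_bpf.h hunk later in this diff; the strlen wrapper and its buffer size are hypothetical:

#include <stdint.h>
#include <string.h>
#include <rte_bpf.h>

/* hypothetical helper the eBPF program is allowed to call */
static uint64_t
xsym_strlen(uint64_t p, uint64_t b, uint64_t c, uint64_t d, uint64_t e)
{
	(void)b; (void)c; (void)d; (void)e;
	return strlen((const char *)(uintptr_t)p);
}

static const struct rte_bpf_xsym xsym_tbl[] = {
	{
		.name = "strlen",
		.type = RTE_BPF_XTYPE_FUNC,
		.func = {
			.val = xsym_strlen,
			.nb_args = 1,
			.args = {
				[0] = {
					.type = RTE_BPF_ARG_PTR,
					.size = 64, /* max readable bytes */
				},
			},
			.ret = {
				.type = RTE_BPF_ARG_RAW,
				.size = sizeof(uint64_t),
			},
		},
	},
};

Such a table would be passed through rte_bpf_prm.xsym/nb_xsym, and rte_bpf_load() below now rejects entries whose argument or return descriptions are left RTE_BPF_ARG_UNDEF.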
+ */ +static int +bpf_check_xsym(const struct rte_bpf_xsym *xsym) +{ + uint32_t i; + + if (xsym->name == NULL) + return -EINVAL; + + if (xsym->type == RTE_BPF_XTYPE_VAR) { + if (xsym->var.desc.type == RTE_BPF_ARG_UNDEF) + return -EINVAL; + } else if (xsym->type == RTE_BPF_XTYPE_FUNC) { + + if (xsym->func.nb_args > EBPF_FUNC_MAX_ARGS) + return -EINVAL; + + /* check function arguments */ + for (i = 0; i != xsym->func.nb_args; i++) { + if (xsym->func.args[i].type == RTE_BPF_ARG_UNDEF) + return -EINVAL; + } + + /* check return value info */ + if (xsym->func.ret.type != RTE_BPF_ARG_UNDEF && + xsym->func.ret.size == 0) + return -EINVAL; + } else + return -EINVAL; + + return 0; +} + __rte_experimental struct rte_bpf * rte_bpf_load(const struct rte_bpf_prm *prm) { struct rte_bpf *bpf; int32_t rc; + uint32_t i; - if (prm == NULL || prm->ins == NULL) { + if (prm == NULL || prm->ins == NULL || + (prm->nb_xsym != 0 && prm->xsym == NULL)) { rte_errno = EINVAL; return NULL; } + rc = 0; + for (i = 0; i != prm->nb_xsym && rc == 0; i++) + rc = bpf_check_xsym(prm->xsym + i); + + if (rc != 0) { + rte_errno = -rc; + RTE_BPF_LOG(ERR, "%s: %d-th xsym is invalid\n", __func__, i); + return NULL; + } + bpf = bpf_load(prm); if (bpf == NULL) { rte_errno = ENOMEM; diff --git a/lib/librte_bpf/bpf_load_elf.c b/lib/librte_bpf/bpf_load_elf.c index 6ab03d86..96d3630f 100644 --- a/lib/librte_bpf/bpf_load_elf.c +++ b/lib/librte_bpf/bpf_load_elf.c @@ -81,9 +81,9 @@ resolve_xsym(const char *sn, size_t ofs, struct ebpf_insn *ins, size_t ins_sz, ins[idx].imm = fidx; /* for variable we need to store its absolute address */ else { - ins[idx].imm = (uintptr_t)prm->xsym[fidx].var; + ins[idx].imm = (uintptr_t)prm->xsym[fidx].var.val; ins[idx + 1].imm = - (uint64_t)(uintptr_t)prm->xsym[fidx].var >> 32; + (uint64_t)(uintptr_t)prm->xsym[fidx].var.val >> 32; } return 0; diff --git a/lib/librte_bpf/bpf_validate.c b/lib/librte_bpf/bpf_validate.c index b7081c85..83983efc 100644 --- a/lib/librte_bpf/bpf_validate.c +++ b/lib/librte_bpf/bpf_validate.c @@ -11,9 +11,28 @@ #include <rte_common.h> #include <rte_eal.h> +#include <rte_byteorder.h> #include "bpf_impl.h" +struct bpf_reg_val { + struct rte_bpf_arg v; + uint64_t mask; + struct { + int64_t min; + int64_t max; + } s; + struct { + uint64_t min; + uint64_t max; + } u; +}; + +struct bpf_eval_state { + struct bpf_reg_val rv[EBPF_REG_NUM]; + struct bpf_reg_val sv[MAX_BPF_STACK_SIZE / sizeof(uint64_t)]; +}; + /* possible instruction node colour */ enum { WHITE, @@ -31,14 +50,6 @@ enum { MAX_EDGE_TYPE }; -struct bpf_reg_state { - uint64_t val; -}; - -struct bpf_eval_state { - struct bpf_reg_state rs[EBPF_REG_NUM]; -}; - #define MAX_EDGES 2 struct inst_node { @@ -54,12 +65,13 @@ struct inst_node { struct bpf_verifier { const struct rte_bpf_prm *prm; struct inst_node *in; - int32_t stack_sz; + uint64_t stack_sz; uint32_t nb_nodes; uint32_t nb_jcc_nodes; uint32_t node_colour[MAX_NODE_COLOUR]; uint32_t edge_type[MAX_EDGE_TYPE]; struct bpf_eval_state *evst; + struct inst_node *evin; struct { uint32_t num; uint32_t cur; @@ -101,40 +113,823 @@ check_alu_bele(const struct ebpf_insn *ins) } static const char * -eval_stack(struct bpf_verifier *bvf, const struct ebpf_insn *ins) +eval_exit(struct bpf_verifier *bvf, const struct ebpf_insn *ins) +{ + RTE_SET_USED(ins); + if (bvf->evst->rv[EBPF_REG_0].v.type == RTE_BPF_ARG_UNDEF) + return "undefined return value"; + return NULL; +} + +/* setup max possible with this mask bounds */ +static void +eval_umax_bound(struct bpf_reg_val *rv, uint64_t mask) +{ + 
rv->u.max = mask; + rv->u.min = 0; +} + +static void +eval_smax_bound(struct bpf_reg_val *rv, uint64_t mask) +{ + rv->s.max = mask >> 1; + rv->s.min = rv->s.max ^ UINT64_MAX; +} + +static void +eval_max_bound(struct bpf_reg_val *rv, uint64_t mask) +{ + eval_umax_bound(rv, mask); + eval_smax_bound(rv, mask); +} + +static void +eval_fill_max_bound(struct bpf_reg_val *rv, uint64_t mask) +{ + eval_max_bound(rv, mask); + rv->v.type = RTE_BPF_ARG_RAW; + rv->mask = mask; +} + +static void +eval_fill_imm64(struct bpf_reg_val *rv, uint64_t mask, uint64_t val) +{ + rv->mask = mask; + rv->s.min = val; + rv->s.max = val; + rv->u.min = val; + rv->u.max = val; +} + +static void +eval_fill_imm(struct bpf_reg_val *rv, uint64_t mask, int32_t imm) +{ + uint64_t v; + + v = (uint64_t)imm & mask; + + rv->v.type = RTE_BPF_ARG_RAW; + eval_fill_imm64(rv, mask, v); +} + +static const char * +eval_ld_imm64(struct bpf_verifier *bvf, const struct ebpf_insn *ins) { - int32_t ofs; + uint32_t i; + uint64_t val; + struct bpf_reg_val *rd; + + val = (uint32_t)ins[0].imm | (uint64_t)(uint32_t)ins[1].imm << 32; - ofs = ins->off; + rd = bvf->evst->rv + ins->dst_reg; + rd->v.type = RTE_BPF_ARG_RAW; + eval_fill_imm64(rd, UINT64_MAX, val); - if (ofs >= 0 || ofs < -MAX_BPF_STACK_SIZE) - return "stack boundary violation"; + for (i = 0; i != bvf->prm->nb_xsym; i++) { + + /* load of external variable */ + if (bvf->prm->xsym[i].type == RTE_BPF_XTYPE_VAR && + (uintptr_t)bvf->prm->xsym[i].var.val == val) { + rd->v = bvf->prm->xsym[i].var.desc; + eval_fill_imm64(rd, UINT64_MAX, 0); + break; + } + } - ofs = -ofs; - bvf->stack_sz = RTE_MAX(bvf->stack_sz, ofs); return NULL; } +static void +eval_apply_mask(struct bpf_reg_val *rv, uint64_t mask) +{ + struct bpf_reg_val rt; + + rt.u.min = rv->u.min & mask; + rt.u.max = rv->u.max & mask; + if (rt.u.min != rv->u.min || rt.u.max != rv->u.max) { + rv->u.max = RTE_MAX(rt.u.max, mask); + rv->u.min = 0; + } + + eval_smax_bound(&rt, mask); + rv->s.max = RTE_MIN(rt.s.max, rv->s.max); + rv->s.min = RTE_MAX(rt.s.min, rv->s.min); + + rv->mask = mask; +} + +static void +eval_add(struct bpf_reg_val *rd, const struct bpf_reg_val *rs, uint64_t msk) +{ + struct bpf_reg_val rv; + + rv.u.min = (rd->u.min + rs->u.min) & msk; + rv.u.max = (rd->u.min + rs->u.max) & msk; + rv.s.min = (rd->s.min + rs->s.min) & msk; + rv.s.max = (rd->s.max + rs->s.max) & msk; + + /* + * if at least one of the operands is not constant, + * then check for overflow + */ + if ((rd->u.min != rd->u.max || rs->u.min != rs->u.max) && + (rv.u.min < rd->u.min || rv.u.max < rd->u.max)) + eval_umax_bound(&rv, msk); + + if ((rd->s.min != rd->s.max || rs->s.min != rs->s.max) && + (((rs->s.min < 0 && rv.s.min > rd->s.min) || + rv.s.min < rd->s.min) || + ((rs->s.max < 0 && rv.s.max > rd->s.max) || + rv.s.max < rd->s.max))) + eval_smax_bound(&rv, msk); + + rd->s = rv.s; + rd->u = rv.u; +} + +static void +eval_sub(struct bpf_reg_val *rd, const struct bpf_reg_val *rs, uint64_t msk) +{ + struct bpf_reg_val rv; + + rv.u.min = (rd->u.min - rs->u.min) & msk; + rv.u.max = (rd->u.min - rs->u.max) & msk; + rv.s.min = (rd->s.min - rs->s.min) & msk; + rv.s.max = (rd->s.max - rs->s.max) & msk; + + /* + * if at least one of the operands is not constant, + * then check for overflow + */ + if ((rd->u.min != rd->u.max || rs->u.min != rs->u.max) && + (rv.u.min > rd->u.min || rv.u.max > rd->u.max)) + eval_umax_bound(&rv, msk); + + if ((rd->s.min != rd->s.max || rs->s.min != rs->s.max) && + (((rs->s.min < 0 && rv.s.min < rd->s.min) || + rv.s.min > rd->s.min) || + 
((rs->s.max < 0 && rv.s.max < rd->s.max) || + rv.s.max > rd->s.max))) + eval_smax_bound(&rv, msk); + + rd->s = rv.s; + rd->u = rv.u; +} + +static void +eval_lsh(struct bpf_reg_val *rd, const struct bpf_reg_val *rs, size_t opsz, + uint64_t msk) +{ + /* check if shift value is less then max result bits */ + if (rs->u.max >= opsz) { + eval_max_bound(rd, msk); + return; + } + + /* check for overflow */ + if (rd->u.max > RTE_LEN2MASK(opsz - rs->u.max, uint64_t)) + eval_umax_bound(rd, msk); + else { + rd->u.max <<= rs->u.max; + rd->u.min <<= rs->u.min; + } + + /* check that dreg values are and would remain always positive */ + if ((uint64_t)rd->s.min >> (opsz - 1) != 0 || rd->s.max >= + RTE_LEN2MASK(opsz - rs->u.max - 1, int64_t)) + eval_smax_bound(rd, msk); + else { + rd->s.max <<= rs->u.max; + rd->s.min <<= rs->u.min; + } +} + +static void +eval_rsh(struct bpf_reg_val *rd, const struct bpf_reg_val *rs, size_t opsz, + uint64_t msk) +{ + /* check if shift value is less then max result bits */ + if (rs->u.max >= opsz) { + eval_max_bound(rd, msk); + return; + } + + rd->u.max >>= rs->u.min; + rd->u.min >>= rs->u.max; + + /* check that dreg values are always positive */ + if ((uint64_t)rd->s.min >> (opsz - 1) != 0) + eval_smax_bound(rd, msk); + else { + rd->s.max >>= rs->u.min; + rd->s.min >>= rs->u.max; + } +} + +static void +eval_arsh(struct bpf_reg_val *rd, const struct bpf_reg_val *rs, size_t opsz, + uint64_t msk) +{ + uint32_t shv; + + /* check if shift value is less then max result bits */ + if (rs->u.max >= opsz) { + eval_max_bound(rd, msk); + return; + } + + rd->u.max = (int64_t)rd->u.max >> rs->u.min; + rd->u.min = (int64_t)rd->u.min >> rs->u.max; + + /* if we have 32-bit values - extend them to 64-bit */ + if (opsz == sizeof(uint32_t) * CHAR_BIT) { + rd->s.min <<= opsz; + rd->s.max <<= opsz; + shv = opsz; + } else + shv = 0; + + if (rd->s.min < 0) + rd->s.min = (rd->s.min >> (rs->u.min + shv)) & msk; + else + rd->s.min = (rd->s.min >> (rs->u.max + shv)) & msk; + + if (rd->s.max < 0) + rd->s.max = (rd->s.max >> (rs->u.max + shv)) & msk; + else + rd->s.max = (rd->s.max >> (rs->u.min + shv)) & msk; +} + +static uint64_t +eval_umax_bits(uint64_t v, size_t opsz) +{ + if (v == 0) + return 0; + + v = __builtin_clzll(v); + return RTE_LEN2MASK(opsz - v, uint64_t); +} + +/* estimate max possible value for (v1 & v2) */ +static uint64_t +eval_uand_max(uint64_t v1, uint64_t v2, size_t opsz) +{ + v1 = eval_umax_bits(v1, opsz); + v2 = eval_umax_bits(v2, opsz); + return (v1 & v2); +} + +/* estimate max possible value for (v1 | v2) */ +static uint64_t +eval_uor_max(uint64_t v1, uint64_t v2, size_t opsz) +{ + v1 = eval_umax_bits(v1, opsz); + v2 = eval_umax_bits(v2, opsz); + return (v1 | v2); +} + +static void +eval_and(struct bpf_reg_val *rd, const struct bpf_reg_val *rs, size_t opsz, + uint64_t msk) +{ + /* both operands are constants */ + if (rd->u.min == rd->u.max && rs->u.min == rs->u.max) { + rd->u.min &= rs->u.min; + rd->u.max &= rs->u.max; + } else { + rd->u.max = eval_uand_max(rd->u.max, rs->u.max, opsz); + rd->u.min &= rs->u.min; + } + + /* both operands are constants */ + if (rd->s.min == rd->s.max && rs->s.min == rs->s.max) { + rd->s.min &= rs->s.min; + rd->s.max &= rs->s.max; + /* at least one of operand is non-negative */ + } else if (rd->s.min >= 0 || rs->s.min >= 0) { + rd->s.max = eval_uand_max(rd->s.max & (msk >> 1), + rs->s.max & (msk >> 1), opsz); + rd->s.min &= rs->s.min; + } else + eval_smax_bound(rd, msk); +} + +static void +eval_or(struct bpf_reg_val *rd, const struct bpf_reg_val *rs, 
size_t opsz, + uint64_t msk) +{ + /* both operands are constants */ + if (rd->u.min == rd->u.max && rs->u.min == rs->u.max) { + rd->u.min |= rs->u.min; + rd->u.max |= rs->u.max; + } else { + rd->u.max = eval_uor_max(rd->u.max, rs->u.max, opsz); + rd->u.min |= rs->u.min; + } + + /* both operands are constants */ + if (rd->s.min == rd->s.max && rs->s.min == rs->s.max) { + rd->s.min |= rs->s.min; + rd->s.max |= rs->s.max; + + /* both operands are non-negative */ + } else if (rd->s.min >= 0 || rs->s.min >= 0) { + rd->s.max = eval_uor_max(rd->s.max, rs->s.max, opsz); + rd->s.min |= rs->s.min; + } else + eval_smax_bound(rd, msk); +} + +static void +eval_xor(struct bpf_reg_val *rd, const struct bpf_reg_val *rs, size_t opsz, + uint64_t msk) +{ + /* both operands are constants */ + if (rd->u.min == rd->u.max && rs->u.min == rs->u.max) { + rd->u.min ^= rs->u.min; + rd->u.max ^= rs->u.max; + } else { + rd->u.max = eval_uor_max(rd->u.max, rs->u.max, opsz); + rd->u.min = 0; + } + + /* both operands are constants */ + if (rd->s.min == rd->s.max && rs->s.min == rs->s.max) { + rd->s.min ^= rs->s.min; + rd->s.max ^= rs->s.max; + + /* both operands are non-negative */ + } else if (rd->s.min >= 0 || rs->s.min >= 0) { + rd->s.max = eval_uor_max(rd->s.max, rs->s.max, opsz); + rd->s.min = 0; + } else + eval_smax_bound(rd, msk); +} + +static void +eval_mul(struct bpf_reg_val *rd, const struct bpf_reg_val *rs, size_t opsz, + uint64_t msk) +{ + /* both operands are constants */ + if (rd->u.min == rd->u.max && rs->u.min == rs->u.max) { + rd->u.min = (rd->u.min * rs->u.min) & msk; + rd->u.max = (rd->u.max * rs->u.max) & msk; + /* check for overflow */ + } else if (rd->u.max <= msk >> opsz / 2 && rs->u.max <= msk >> opsz) { + rd->u.max *= rs->u.max; + rd->u.min *= rd->u.min; + } else + eval_umax_bound(rd, msk); + + /* both operands are constants */ + if (rd->s.min == rd->s.max && rs->s.min == rs->s.max) { + rd->s.min = (rd->s.min * rs->s.min) & msk; + rd->s.max = (rd->s.max * rs->s.max) & msk; + /* check that both operands are positive and no overflow */ + } else if (rd->s.min >= 0 && rs->s.min >= 0) { + rd->s.max *= rs->s.max; + rd->s.min *= rd->s.min; + } else + eval_smax_bound(rd, msk); +} + static const char * -eval_store(struct bpf_verifier *bvf, const struct ebpf_insn *ins) +eval_divmod(uint32_t op, struct bpf_reg_val *rd, struct bpf_reg_val *rs, + size_t opsz, uint64_t msk) { - if (ins->dst_reg == EBPF_REG_10) - return eval_stack(bvf, ins); + /* both operands are constants */ + if (rd->u.min == rd->u.max && rs->u.min == rs->u.max) { + if (rs->u.max == 0) + return "division by 0"; + if (op == BPF_DIV) { + rd->u.min /= rs->u.min; + rd->u.max /= rs->u.max; + } else { + rd->u.min %= rs->u.min; + rd->u.max %= rs->u.max; + } + } else { + if (op == BPF_MOD) + rd->u.max = RTE_MIN(rd->u.max, rs->u.max - 1); + else + rd->u.max = rd->u.max; + rd->u.min = 0; + } + + /* if we have 32-bit values - extend them to 64-bit */ + if (opsz == sizeof(uint32_t) * CHAR_BIT) { + rd->s.min = (int32_t)rd->s.min; + rd->s.max = (int32_t)rd->s.max; + rs->s.min = (int32_t)rs->s.min; + rs->s.max = (int32_t)rs->s.max; + } + + /* both operands are constants */ + if (rd->s.min == rd->s.max && rs->s.min == rs->s.max) { + if (rs->s.max == 0) + return "division by 0"; + if (op == BPF_DIV) { + rd->s.min /= rs->s.min; + rd->s.max /= rs->s.max; + } else { + rd->s.min %= rs->s.min; + rd->s.max %= rs->s.max; + } + } else if (op == BPF_MOD) { + rd->s.min = RTE_MAX(rd->s.max, 0); + rd->s.min = RTE_MIN(rd->s.min, 0); + } else + eval_smax_bound(rd, msk); + 
+ rd->s.max &= msk; + rd->s.min &= msk; + return NULL; } +static void +eval_neg(struct bpf_reg_val *rd, size_t opsz, uint64_t msk) +{ + uint64_t ux, uy; + int64_t sx, sy; + + /* if we have 32-bit values - extend them to 64-bit */ + if (opsz == sizeof(uint32_t) * CHAR_BIT) { + rd->u.min = (int32_t)rd->u.min; + rd->u.max = (int32_t)rd->u.max; + } + + ux = -(int64_t)rd->u.min & msk; + uy = -(int64_t)rd->u.max & msk; + + rd->u.max = RTE_MAX(ux, uy); + rd->u.min = RTE_MIN(ux, uy); + + /* if we have 32-bit values - extend them to 64-bit */ + if (opsz == sizeof(uint32_t) * CHAR_BIT) { + rd->s.min = (int32_t)rd->s.min; + rd->s.max = (int32_t)rd->s.max; + } + + sx = -rd->s.min & msk; + sy = -rd->s.max & msk; + + rd->s.max = RTE_MAX(sx, sy); + rd->s.min = RTE_MIN(sx, sy); +} + +/* + * check that destination and source operand are in defined state. + */ +static const char * +eval_defined(const struct bpf_reg_val *dst, const struct bpf_reg_val *src) +{ + if (dst != NULL && dst->v.type == RTE_BPF_ARG_UNDEF) + return "dest reg value is undefined"; + if (src != NULL && src->v.type == RTE_BPF_ARG_UNDEF) + return "src reg value is undefined"; + return NULL; +} + +static const char * +eval_alu(struct bpf_verifier *bvf, const struct ebpf_insn *ins) +{ + uint64_t msk; + uint32_t op; + size_t opsz; + const char *err; + struct bpf_eval_state *st; + struct bpf_reg_val *rd, rs; + + opsz = (BPF_CLASS(ins->code) == BPF_ALU) ? + sizeof(uint32_t) : sizeof(uint64_t); + opsz = opsz * CHAR_BIT; + msk = RTE_LEN2MASK(opsz, uint64_t); + + st = bvf->evst; + rd = st->rv + ins->dst_reg; + + if (BPF_SRC(ins->code) == BPF_X) { + rs = st->rv[ins->src_reg]; + eval_apply_mask(&rs, msk); + } else + eval_fill_imm(&rs, msk, ins->imm); + + eval_apply_mask(rd, msk); + + op = BPF_OP(ins->code); + + err = eval_defined((op != EBPF_MOV) ? rd : NULL, + (op != BPF_NEG) ? 
&rs : NULL); + if (err != NULL) + return err; + + if (op == BPF_ADD) + eval_add(rd, &rs, msk); + else if (op == BPF_SUB) + eval_sub(rd, &rs, msk); + else if (op == BPF_LSH) + eval_lsh(rd, &rs, opsz, msk); + else if (op == BPF_RSH) + eval_rsh(rd, &rs, opsz, msk); + else if (op == EBPF_ARSH) + eval_arsh(rd, &rs, opsz, msk); + else if (op == BPF_AND) + eval_and(rd, &rs, opsz, msk); + else if (op == BPF_OR) + eval_or(rd, &rs, opsz, msk); + else if (op == BPF_XOR) + eval_xor(rd, &rs, opsz, msk); + else if (op == BPF_MUL) + eval_mul(rd, &rs, opsz, msk); + else if (op == BPF_DIV || op == BPF_MOD) + err = eval_divmod(op, rd, &rs, opsz, msk); + else if (op == BPF_NEG) + eval_neg(rd, opsz, msk); + else if (op == EBPF_MOV) + *rd = rs; + else + eval_max_bound(rd, msk); + + return err; +} + +static const char * +eval_bele(struct bpf_verifier *bvf, const struct ebpf_insn *ins) +{ + uint64_t msk; + struct bpf_eval_state *st; + struct bpf_reg_val *rd; + const char *err; + + msk = RTE_LEN2MASK(ins->imm, uint64_t); + + st = bvf->evst; + rd = st->rv + ins->dst_reg; + + err = eval_defined(rd, NULL); + if (err != NULL) + return err; + +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN + if (ins->code == (BPF_ALU | EBPF_END | EBPF_TO_BE)) + eval_max_bound(rd, msk); + else + eval_apply_mask(rd, msk); +#else + if (ins->code == (BPF_ALU | EBPF_END | EBPF_TO_LE)) + eval_max_bound(rd, msk); + else + eval_apply_mask(rd, msk); +#endif + + return NULL; +} + +static const char * +eval_ptr(struct bpf_verifier *bvf, struct bpf_reg_val *rm, uint32_t opsz, + uint32_t align, int16_t off) +{ + struct bpf_reg_val rv; + + /* calculate reg + offset */ + eval_fill_imm(&rv, rm->mask, off); + eval_add(rm, &rv, rm->mask); + + if (RTE_BPF_ARG_PTR_TYPE(rm->v.type) == 0) + return "destination is not a pointer"; + + if (rm->mask != UINT64_MAX) + return "pointer truncation"; + + if (rm->u.max + opsz > rm->v.size || + (uint64_t)rm->s.max + opsz > rm->v.size || + rm->s.min < 0) + return "memory boundary violation"; + + if (rm->u.max % align != 0) + return "unaligned memory access"; + + if (rm->v.type == RTE_BPF_ARG_PTR_STACK) { + + if (rm->u.max != rm->u.min || rm->s.max != rm->s.min || + rm->u.max != (uint64_t)rm->s.max) + return "stack access with variable offset"; + + bvf->stack_sz = RTE_MAX(bvf->stack_sz, rm->v.size - rm->u.max); + + /* pointer to mbuf */ + } else if (rm->v.type == RTE_BPF_ARG_PTR_MBUF) { + + if (rm->u.max != rm->u.min || rm->s.max != rm->s.min || + rm->u.max != (uint64_t)rm->s.max) + return "mbuf access with variable offset"; + } + + return NULL; +} + +static void +eval_max_load(struct bpf_reg_val *rv, uint64_t mask) +{ + eval_umax_bound(rv, mask); + + /* full 64-bit load */ + if (mask == UINT64_MAX) + eval_smax_bound(rv, mask); + + /* zero-extend load */ + rv->s.min = rv->u.min; + rv->s.max = rv->u.max; +} + + static const char * eval_load(struct bpf_verifier *bvf, const struct ebpf_insn *ins) { - if (ins->src_reg == EBPF_REG_10) - return eval_stack(bvf, ins); + uint32_t opsz; + uint64_t msk; + const char *err; + struct bpf_eval_state *st; + struct bpf_reg_val *rd, rs; + const struct bpf_reg_val *sv; + + st = bvf->evst; + rd = st->rv + ins->dst_reg; + rs = st->rv[ins->src_reg]; + opsz = bpf_size(BPF_SIZE(ins->code)); + msk = RTE_LEN2MASK(opsz * CHAR_BIT, uint64_t); + + err = eval_ptr(bvf, &rs, opsz, 1, ins->off); + if (err != NULL) + return err; + + if (rs.v.type == RTE_BPF_ARG_PTR_STACK) { + + sv = st->sv + rs.u.max / sizeof(uint64_t); + if (sv->v.type == RTE_BPF_ARG_UNDEF || sv->mask < msk) + return "undefined value on the 
stack"; + + *rd = *sv; + + /* pointer to mbuf */ + } else if (rs.v.type == RTE_BPF_ARG_PTR_MBUF) { + + if (rs.u.max == offsetof(struct rte_mbuf, next)) { + eval_fill_imm(rd, msk, 0); + rd->v = rs.v; + } else if (rs.u.max == offsetof(struct rte_mbuf, buf_addr)) { + eval_fill_imm(rd, msk, 0); + rd->v.type = RTE_BPF_ARG_PTR; + rd->v.size = rs.v.buf_size; + } else if (rs.u.max == offsetof(struct rte_mbuf, data_off)) { + eval_fill_imm(rd, msk, RTE_PKTMBUF_HEADROOM); + rd->v.type = RTE_BPF_ARG_RAW; + } else { + eval_max_load(rd, msk); + rd->v.type = RTE_BPF_ARG_RAW; + } + + /* pointer to raw data */ + } else { + eval_max_load(rd, msk); + rd->v.type = RTE_BPF_ARG_RAW; + } + return NULL; } static const char * +eval_mbuf_store(const struct bpf_reg_val *rv, uint32_t opsz) +{ + uint32_t i; + + static const struct { + size_t off; + size_t sz; + } mbuf_ro_fileds[] = { + { .off = offsetof(struct rte_mbuf, buf_addr), }, + { .off = offsetof(struct rte_mbuf, refcnt), }, + { .off = offsetof(struct rte_mbuf, nb_segs), }, + { .off = offsetof(struct rte_mbuf, buf_len), }, + { .off = offsetof(struct rte_mbuf, pool), }, + { .off = offsetof(struct rte_mbuf, next), }, + { .off = offsetof(struct rte_mbuf, priv_size), }, + }; + + for (i = 0; i != RTE_DIM(mbuf_ro_fileds) && + (mbuf_ro_fileds[i].off + mbuf_ro_fileds[i].sz <= + rv->u.max || rv->u.max + opsz <= mbuf_ro_fileds[i].off); + i++) + ; + + if (i != RTE_DIM(mbuf_ro_fileds)) + return "store to the read-only mbuf field"; + + return NULL; + +} + +static const char * +eval_store(struct bpf_verifier *bvf, const struct ebpf_insn *ins) +{ + uint32_t opsz; + uint64_t msk; + const char *err; + struct bpf_eval_state *st; + struct bpf_reg_val rd, rs, *sv; + + opsz = bpf_size(BPF_SIZE(ins->code)); + msk = RTE_LEN2MASK(opsz * CHAR_BIT, uint64_t); + + st = bvf->evst; + rd = st->rv[ins->dst_reg]; + + if (BPF_CLASS(ins->code) == BPF_STX) { + rs = st->rv[ins->src_reg]; + eval_apply_mask(&rs, msk); + } else + eval_fill_imm(&rs, msk, ins->imm); + + err = eval_defined(NULL, &rs); + if (err != NULL) + return err; + + err = eval_ptr(bvf, &rd, opsz, 1, ins->off); + if (err != NULL) + return err; + + if (rd.v.type == RTE_BPF_ARG_PTR_STACK) { + + sv = st->sv + rd.u.max / sizeof(uint64_t); + if (BPF_CLASS(ins->code) == BPF_STX && + BPF_MODE(ins->code) == EBPF_XADD) + eval_max_bound(sv, msk); + else + *sv = rs; + + /* pointer to mbuf */ + } else if (rd.v.type == RTE_BPF_ARG_PTR_MBUF) { + err = eval_mbuf_store(&rd, opsz); + if (err != NULL) + return err; + } + + return NULL; +} + +static const char * +eval_func_arg(struct bpf_verifier *bvf, const struct rte_bpf_arg *arg, + struct bpf_reg_val *rv) +{ + uint32_t i, n; + struct bpf_eval_state *st; + const char *err; + + st = bvf->evst; + + if (rv->v.type == RTE_BPF_ARG_UNDEF) + return "Undefined argument type"; + + if (arg->type != rv->v.type && + arg->type != RTE_BPF_ARG_RAW && + (arg->type != RTE_BPF_ARG_PTR || + RTE_BPF_ARG_PTR_TYPE(rv->v.type) == 0)) + return "Invalid argument type"; + + err = NULL; + + /* argument is a pointer */ + if (RTE_BPF_ARG_PTR_TYPE(arg->type) != 0) { + + err = eval_ptr(bvf, rv, arg->size, 1, 0); + + /* + * pointer to the variable on the stack is passed + * as an argument, mark stack space it occupies as initialized. 
+ */ + if (err == NULL && rv->v.type == RTE_BPF_ARG_PTR_STACK) { + + i = rv->u.max / sizeof(uint64_t); + n = i + arg->size / sizeof(uint64_t); + while (i != n) { + eval_fill_max_bound(st->sv + i, UINT64_MAX); + i++; + }; + } + } + + return err; +} + +static const char * eval_call(struct bpf_verifier *bvf, const struct ebpf_insn *ins) { - uint32_t idx; + uint64_t msk; + uint32_t i, idx; + struct bpf_reg_val *rv; + const struct rte_bpf_xsym *xsym; + const char *err; idx = ins->imm; @@ -145,6 +940,144 @@ eval_call(struct bpf_verifier *bvf, const struct ebpf_insn *ins) /* for now don't support function calls on 32 bit platform */ if (sizeof(uint64_t) != sizeof(uintptr_t)) return "function calls are supported only for 64 bit apps"; + + xsym = bvf->prm->xsym + idx; + + /* evaluate function arguments */ + err = NULL; + for (i = 0; i != xsym->func.nb_args && err == NULL; i++) { + err = eval_func_arg(bvf, xsym->func.args + i, + bvf->evst->rv + EBPF_REG_1 + i); + } + + /* R1-R5 argument/scratch registers */ + for (i = EBPF_REG_1; i != EBPF_REG_6; i++) + bvf->evst->rv[i].v.type = RTE_BPF_ARG_UNDEF; + + /* update return value */ + + rv = bvf->evst->rv + EBPF_REG_0; + rv->v = xsym->func.ret; + msk = (rv->v.type == RTE_BPF_ARG_RAW) ? + RTE_LEN2MASK(rv->v.size * CHAR_BIT, uint64_t) : UINTPTR_MAX; + eval_max_bound(rv, msk); + rv->mask = msk; + + return err; +} + +static void +eval_jeq_jne(struct bpf_reg_val *trd, struct bpf_reg_val *trs) +{ + /* sreg is constant */ + if (trs->u.min == trs->u.max) { + trd->u = trs->u; + /* dreg is constant */ + } else if (trd->u.min == trd->u.max) { + trs->u = trd->u; + } else { + trd->u.max = RTE_MIN(trd->u.max, trs->u.max); + trd->u.min = RTE_MAX(trd->u.min, trs->u.min); + trs->u = trd->u; + } + + /* sreg is constant */ + if (trs->s.min == trs->s.max) { + trd->s = trs->s; + /* dreg is constant */ + } else if (trd->s.min == trd->s.max) { + trs->s = trd->s; + } else { + trd->s.max = RTE_MIN(trd->s.max, trs->s.max); + trd->s.min = RTE_MAX(trd->s.min, trs->s.min); + trs->s = trd->s; + } +} + +static void +eval_jgt_jle(struct bpf_reg_val *trd, struct bpf_reg_val *trs, + struct bpf_reg_val *frd, struct bpf_reg_val *frs) +{ + frd->u.max = RTE_MIN(frd->u.max, frs->u.min); + trd->u.min = RTE_MAX(trd->u.min, trs->u.min + 1); +} + +static void +eval_jlt_jge(struct bpf_reg_val *trd, struct bpf_reg_val *trs, + struct bpf_reg_val *frd, struct bpf_reg_val *frs) +{ + frd->u.min = RTE_MAX(frd->u.min, frs->u.min); + trd->u.max = RTE_MIN(trd->u.max, trs->u.max - 1); +} + +static void +eval_jsgt_jsle(struct bpf_reg_val *trd, struct bpf_reg_val *trs, + struct bpf_reg_val *frd, struct bpf_reg_val *frs) +{ + frd->s.max = RTE_MIN(frd->s.max, frs->s.min); + trd->s.min = RTE_MAX(trd->s.min, trs->s.min + 1); +} + +static void +eval_jslt_jsge(struct bpf_reg_val *trd, struct bpf_reg_val *trs, + struct bpf_reg_val *frd, struct bpf_reg_val *frs) +{ + frd->s.min = RTE_MAX(frd->s.min, frs->s.min); + trd->s.max = RTE_MIN(trd->s.max, trs->s.max - 1); +} + +static const char * +eval_jcc(struct bpf_verifier *bvf, const struct ebpf_insn *ins) +{ + uint32_t op; + const char *err; + struct bpf_eval_state *fst, *tst; + struct bpf_reg_val *frd, *frs, *trd, *trs; + struct bpf_reg_val rvf, rvt; + + tst = bvf->evst; + fst = bvf->evin->evst; + + frd = fst->rv + ins->dst_reg; + trd = tst->rv + ins->dst_reg; + + if (BPF_SRC(ins->code) == BPF_X) { + frs = fst->rv + ins->src_reg; + trs = tst->rv + ins->src_reg; + } else { + frs = &rvf; + trs = &rvt; + eval_fill_imm(frs, UINT64_MAX, ins->imm); + eval_fill_imm(trs, 
UINT64_MAX, ins->imm); + } + + err = eval_defined(trd, trs); + if (err != NULL) + return err; + + op = BPF_OP(ins->code); + + if (op == BPF_JEQ) + eval_jeq_jne(trd, trs); + else if (op == EBPF_JNE) + eval_jeq_jne(frd, frs); + else if (op == BPF_JGT) + eval_jgt_jle(trd, trs, frd, frs); + else if (op == EBPF_JLE) + eval_jgt_jle(frd, frs, trd, trs); + else if (op == EBPF_JLT) + eval_jlt_jge(trd, trs, frd, frs); + else if (op == BPF_JGE) + eval_jlt_jge(frd, frs, trd, trs); + else if (op == EBPF_JSGT) + eval_jsgt_jsle(trd, trs, frd, frs); + else if (op == EBPF_JSLE) + eval_jsgt_jsle(frd, frs, trd, trs); + else if (op == EBPF_JLT) + eval_jslt_jsge(trd, trs, frd, frs); + else if (op == EBPF_JSGE) + eval_jslt_jsge(frd, frs, trd, trs); + return NULL; } @@ -157,256 +1090,306 @@ static const struct bpf_ins_check ins_chk[UINT8_MAX] = { .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, }, [(BPF_ALU | BPF_SUB | BPF_K)] = { .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, }, [(BPF_ALU | BPF_AND | BPF_K)] = { .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, }, [(BPF_ALU | BPF_OR | BPF_K)] = { .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, }, [(BPF_ALU | BPF_LSH | BPF_K)] = { .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, }, [(BPF_ALU | BPF_RSH | BPF_K)] = { .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, }, [(BPF_ALU | BPF_XOR | BPF_K)] = { .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, }, [(BPF_ALU | BPF_MUL | BPF_K)] = { .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, }, [(BPF_ALU | EBPF_MOV | BPF_K)] = { .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, }, [(BPF_ALU | BPF_DIV | BPF_K)] = { .mask = { .dreg = WRT_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = 0}, .imm = { .min = 1, .max = UINT32_MAX}, + .eval = eval_alu, }, [(BPF_ALU | BPF_MOD | BPF_K)] = { .mask = { .dreg = WRT_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = 0}, .imm = { .min = 1, .max = UINT32_MAX}, + .eval = eval_alu, }, /* ALU IMM 64-bit instructions */ [(EBPF_ALU64 | BPF_ADD | BPF_K)] = { .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, }, [(EBPF_ALU64 | BPF_SUB | BPF_K)] = { .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, }, [(EBPF_ALU64 | BPF_AND | BPF_K)] = { .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, }, [(EBPF_ALU64 | BPF_OR | BPF_K)] = { .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, }, [(EBPF_ALU64 | BPF_LSH | BPF_K)] = { .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = 0}, 
.imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, }, [(EBPF_ALU64 | BPF_RSH | BPF_K)] = { .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, }, [(EBPF_ALU64 | EBPF_ARSH | BPF_K)] = { .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, }, [(EBPF_ALU64 | BPF_XOR | BPF_K)] = { .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, }, [(EBPF_ALU64 | BPF_MUL | BPF_K)] = { .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, }, [(EBPF_ALU64 | EBPF_MOV | BPF_K)] = { .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, }, [(EBPF_ALU64 | BPF_DIV | BPF_K)] = { .mask = { .dreg = WRT_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = 0}, .imm = { .min = 1, .max = UINT32_MAX}, + .eval = eval_alu, }, [(EBPF_ALU64 | BPF_MOD | BPF_K)] = { .mask = { .dreg = WRT_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = 0}, .imm = { .min = 1, .max = UINT32_MAX}, + .eval = eval_alu, }, /* ALU REG 32-bit instructions */ [(BPF_ALU | BPF_ADD | BPF_X)] = { .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = 0}, + .eval = eval_alu, }, [(BPF_ALU | BPF_SUB | BPF_X)] = { .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = 0}, + .eval = eval_alu, }, [(BPF_ALU | BPF_AND | BPF_X)] = { .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = 0}, + .eval = eval_alu, }, [(BPF_ALU | BPF_OR | BPF_X)] = { .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = 0}, + .eval = eval_alu, }, [(BPF_ALU | BPF_LSH | BPF_X)] = { .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = 0}, + .eval = eval_alu, }, [(BPF_ALU | BPF_RSH | BPF_X)] = { .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = 0}, + .eval = eval_alu, }, [(BPF_ALU | BPF_XOR | BPF_X)] = { .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = 0}, + .eval = eval_alu, }, [(BPF_ALU | BPF_MUL | BPF_X)] = { .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = 0}, + .eval = eval_alu, }, [(BPF_ALU | BPF_DIV | BPF_X)] = { .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = 0}, + .eval = eval_alu, }, [(BPF_ALU | BPF_MOD | BPF_X)] = { .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = 0}, + .eval = eval_alu, }, [(BPF_ALU | EBPF_MOV | BPF_X)] = { .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = 0}, + .eval = eval_alu, }, [(BPF_ALU | BPF_NEG)] = { .mask = { .dreg = WRT_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = 0}, + .eval = eval_alu, }, [(BPF_ALU | EBPF_END | EBPF_TO_BE)] = { .mask = { .dreg = WRT_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = 0}, .imm = { .min = 16, .max = 64}, .check = check_alu_bele, + .eval = eval_bele, }, [(BPF_ALU | EBPF_END | EBPF_TO_LE)] = { .mask = { .dreg = 
WRT_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = 0}, .imm = { .min = 16, .max = 64}, .check = check_alu_bele, + .eval = eval_bele, }, /* ALU REG 64-bit instructions */ [(EBPF_ALU64 | BPF_ADD | BPF_X)] = { .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = 0}, + .eval = eval_alu, }, [(EBPF_ALU64 | BPF_SUB | BPF_X)] = { .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = 0}, + .eval = eval_alu, }, [(EBPF_ALU64 | BPF_AND | BPF_X)] = { .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = 0}, + .eval = eval_alu, }, [(EBPF_ALU64 | BPF_OR | BPF_X)] = { .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = 0}, + .eval = eval_alu, }, [(EBPF_ALU64 | BPF_LSH | BPF_X)] = { .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = 0}, + .eval = eval_alu, }, [(EBPF_ALU64 | BPF_RSH | BPF_X)] = { .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = 0}, + .eval = eval_alu, }, [(EBPF_ALU64 | EBPF_ARSH | BPF_X)] = { .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = 0}, + .eval = eval_alu, }, [(EBPF_ALU64 | BPF_XOR | BPF_X)] = { .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = 0}, + .eval = eval_alu, }, [(EBPF_ALU64 | BPF_MUL | BPF_X)] = { .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = 0}, + .eval = eval_alu, }, [(EBPF_ALU64 | BPF_DIV | BPF_X)] = { .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = 0}, + .eval = eval_alu, }, [(EBPF_ALU64 | BPF_MOD | BPF_X)] = { .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = 0}, + .eval = eval_alu, }, [(EBPF_ALU64 | EBPF_MOV | BPF_X)] = { .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = 0}, + .eval = eval_alu, }, [(EBPF_ALU64 | BPF_NEG)] = { .mask = { .dreg = WRT_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = 0}, + .eval = eval_alu, }, /* load instructions */ [(BPF_LDX | BPF_MEM | BPF_B)] = { @@ -438,6 +1421,7 @@ static const struct bpf_ins_check ins_chk[UINT8_MAX] = { .mask = { .dreg = WRT_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = UINT32_MAX}, + .eval = eval_ld_imm64, }, /* store REG instructions */ [(BPF_STX | BPF_MEM | BPF_B)] = { @@ -513,92 +1497,110 @@ static const struct bpf_ins_check ins_chk[UINT8_MAX] = { .mask = { .dreg = ALL_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = UINT16_MAX}, .imm = { .min = 0, .max = UINT32_MAX}, + .eval = eval_jcc, }, [(BPF_JMP | EBPF_JNE | BPF_K)] = { .mask = { .dreg = ALL_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = UINT16_MAX}, .imm = { .min = 0, .max = UINT32_MAX}, + .eval = eval_jcc, }, [(BPF_JMP | BPF_JGT | BPF_K)] = { .mask = { .dreg = ALL_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = UINT16_MAX}, .imm = { .min = 0, .max = UINT32_MAX}, + .eval = eval_jcc, }, [(BPF_JMP | EBPF_JLT | BPF_K)] = { .mask = { .dreg = ALL_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = UINT16_MAX}, .imm = { .min = 0, .max = UINT32_MAX}, + .eval = eval_jcc, }, [(BPF_JMP | BPF_JGE | BPF_K)] = { .mask = { .dreg = ALL_REGS, .sreg = ZERO_REG}, .off = { .min 
= 0, .max = UINT16_MAX}, .imm = { .min = 0, .max = UINT32_MAX}, + .eval = eval_jcc, }, [(BPF_JMP | EBPF_JLE | BPF_K)] = { .mask = { .dreg = ALL_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = UINT16_MAX}, .imm = { .min = 0, .max = UINT32_MAX}, + .eval = eval_jcc, }, [(BPF_JMP | EBPF_JSGT | BPF_K)] = { .mask = { .dreg = ALL_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = UINT16_MAX}, .imm = { .min = 0, .max = UINT32_MAX}, + .eval = eval_jcc, }, [(BPF_JMP | EBPF_JSLT | BPF_K)] = { .mask = { .dreg = ALL_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = UINT16_MAX}, .imm = { .min = 0, .max = UINT32_MAX}, + .eval = eval_jcc, }, [(BPF_JMP | EBPF_JSGE | BPF_K)] = { .mask = { .dreg = ALL_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = UINT16_MAX}, .imm = { .min = 0, .max = UINT32_MAX}, + .eval = eval_jcc, }, [(BPF_JMP | EBPF_JSLE | BPF_K)] = { .mask = { .dreg = ALL_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = UINT16_MAX}, .imm = { .min = 0, .max = UINT32_MAX}, + .eval = eval_jcc, }, [(BPF_JMP | BPF_JSET | BPF_K)] = { .mask = { .dreg = ALL_REGS, .sreg = ZERO_REG}, .off = { .min = 0, .max = UINT16_MAX}, .imm = { .min = 0, .max = UINT32_MAX}, + .eval = eval_jcc, }, /* jcc REG instructions */ [(BPF_JMP | BPF_JEQ | BPF_X)] = { .mask = { .dreg = ALL_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = UINT16_MAX}, .imm = { .min = 0, .max = 0}, + .eval = eval_jcc, }, [(BPF_JMP | EBPF_JNE | BPF_X)] = { .mask = { .dreg = ALL_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = UINT16_MAX}, .imm = { .min = 0, .max = 0}, + .eval = eval_jcc, }, [(BPF_JMP | BPF_JGT | BPF_X)] = { .mask = { .dreg = ALL_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = UINT16_MAX}, .imm = { .min = 0, .max = 0}, + .eval = eval_jcc, }, [(BPF_JMP | EBPF_JLT | BPF_X)] = { .mask = { .dreg = ALL_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = UINT16_MAX}, .imm = { .min = 0, .max = 0}, + .eval = eval_jcc, }, [(BPF_JMP | BPF_JGE | BPF_X)] = { .mask = { .dreg = ALL_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = UINT16_MAX}, .imm = { .min = 0, .max = 0}, + .eval = eval_jcc, }, [(BPF_JMP | EBPF_JLE | BPF_X)] = { .mask = { .dreg = ALL_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = UINT16_MAX}, .imm = { .min = 0, .max = 0}, + .eval = eval_jcc, }, [(BPF_JMP | EBPF_JSGT | BPF_X)] = { .mask = { .dreg = ALL_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = UINT16_MAX}, .imm = { .min = 0, .max = 0}, + .eval = eval_jcc, }, [(BPF_JMP | EBPF_JSLT | BPF_X)] = { .mask = { .dreg = ALL_REGS, .sreg = ALL_REGS}, @@ -609,16 +1611,19 @@ static const struct bpf_ins_check ins_chk[UINT8_MAX] = { .mask = { .dreg = ALL_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = UINT16_MAX}, .imm = { .min = 0, .max = 0}, + .eval = eval_jcc, }, [(BPF_JMP | EBPF_JSLE | BPF_X)] = { .mask = { .dreg = ALL_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = UINT16_MAX}, .imm = { .min = 0, .max = 0}, + .eval = eval_jcc, }, [(BPF_JMP | BPF_JSET | BPF_X)] = { .mask = { .dreg = ALL_REGS, .sreg = ALL_REGS}, .off = { .min = 0, .max = UINT16_MAX}, .imm = { .min = 0, .max = 0}, + .eval = eval_jcc, }, /* call instruction */ [(BPF_JMP | EBPF_CALL)] = { @@ -632,6 +1637,7 @@ static const struct bpf_ins_check ins_chk[UINT8_MAX] = { .mask = { .dreg = ZERO_REG, .sreg = ZERO_REG}, .off = { .min = 0, .max = 0}, .imm = { .min = 0, .max = 0}, + .eval = eval_exit, }, }; @@ -1046,7 +2052,7 @@ save_eval_state(struct bpf_verifier *bvf, struct inst_node *node) st = pull_eval_state(bvf); if (st == NULL) { RTE_BPF_LOG(ERR, - "%s: internal error (out of space) at pc: %u", + "%s: 
internal error (out of space) at pc: %u\n", __func__, get_node_idx(bvf, node)); return -ENOMEM; } @@ -1078,6 +2084,32 @@ restore_eval_state(struct bpf_verifier *bvf, struct inst_node *node) push_eval_state(bvf); } +static void +log_eval_state(const struct bpf_verifier *bvf, const struct ebpf_insn *ins, + uint32_t pc, int32_t loglvl) +{ + const struct bpf_eval_state *st; + const struct bpf_reg_val *rv; + + rte_log(loglvl, rte_bpf_logtype, "%s(pc=%u):\n", __func__, pc); + + st = bvf->evst; + rv = st->rv + ins->dst_reg; + + rte_log(loglvl, rte_bpf_logtype, + "r%u={\n" + "\tv={type=%u, size=%zu},\n" + "\tmask=0x%" PRIx64 ",\n" + "\tu={min=0x%" PRIx64 ", max=0x%" PRIx64 "},\n" + "\ts={min=%" PRId64 ", max=%" PRId64 "},\n" + "};\n", + ins->dst_reg, + rv->v.type, rv->v.size, + rv->mask, + rv->u.min, rv->u.max, + rv->s.min, rv->s.max); +} + /* * Do second pass through CFG and try to evaluate instructions * via each possible path. @@ -1096,23 +2128,56 @@ evaluate(struct bpf_verifier *bvf) const struct ebpf_insn *ins; struct inst_node *next, *node; - node = bvf->in; + /* initial state of frame pointer */ + static const struct bpf_reg_val rvfp = { + .v = { + .type = RTE_BPF_ARG_PTR_STACK, + .size = MAX_BPF_STACK_SIZE, + }, + .mask = UINT64_MAX, + .u = {.min = MAX_BPF_STACK_SIZE, .max = MAX_BPF_STACK_SIZE}, + .s = {.min = MAX_BPF_STACK_SIZE, .max = MAX_BPF_STACK_SIZE}, + }; + + bvf->evst->rv[EBPF_REG_1].v = bvf->prm->prog_arg; + bvf->evst->rv[EBPF_REG_1].mask = UINT64_MAX; + if (bvf->prm->prog_arg.type == RTE_BPF_ARG_RAW) + eval_max_bound(bvf->evst->rv + EBPF_REG_1, UINT64_MAX); + + bvf->evst->rv[EBPF_REG_10] = rvfp; + ins = bvf->prm->ins; + node = bvf->in; + next = node; rc = 0; while (node != NULL && rc == 0) { - /* current node evaluation */ - idx = get_node_idx(bvf, node); - op = ins[idx].code; + /* + * current node evaluation, make sure we evaluate + * each node only once. 
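The evaluate() pass above seeds R10 with the rvfp state: a pointer of type RTE_BPF_ARG_PTR_STACK whose signed and unsigned bounds are all pinned to MAX_BPF_STACK_SIZE, i.e. the frame pointer sits at the top of the stack image and valid accesses use negative offsets. A worked example of how eval_ptr() then bounds a stack store, assuming MAX_BPF_STACK_SIZE is 512 bytes (its usual value in bpf_impl.h, an assumption here):

/*
 * For the store *(uint64_t *)(R10 - 8) = r1:
 *
 *   R10 starts as { type = RTE_BPF_ARG_PTR_STACK, size = 512,
 *                   u.min = u.max = s.min = s.max = 512 }
 *
 *   eval_ptr() folds the offset into the value:  512 + (-8) = 504
 *   boundary check:  504 + sizeof(uint64_t) <= 512  -> in range
 *   constant offset: u.min == u.max, so no "variable offset" error
 *
 * and the tracked stack usage grows to
 *   stack_sz = RTE_MAX(stack_sz, size - u.max) = 512 - 504 = 8 bytes.
 */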
+ */ + if (next != NULL) { + + bvf->evin = node; + idx = get_node_idx(bvf, node); + op = ins[idx].code; - if (ins_chk[op].eval != NULL) { - err = ins_chk[op].eval(bvf, ins + idx); - if (err != NULL) { - RTE_BPF_LOG(ERR, "%s: %s at pc: %u\n", - __func__, err, idx); - rc = -EINVAL; + /* for jcc node make a copy of evaluatoion state */ + if (node->nb_edge > 1) + rc |= save_eval_state(bvf, node); + + if (ins_chk[op].eval != NULL && rc == 0) { + err = ins_chk[op].eval(bvf, ins + idx); + if (err != NULL) { + RTE_BPF_LOG(ERR, "%s: %s at pc: %u\n", + __func__, err, idx); + rc = -EINVAL; + } } + + log_eval_state(bvf, ins + idx, idx, RTE_LOG_DEBUG); + bvf->evin = NULL; } /* proceed through CFG */ @@ -1120,9 +2185,8 @@ evaluate(struct bpf_verifier *bvf) if (next != NULL) { /* proceed with next child */ - if (node->cur_edge != node->nb_edge) - rc |= save_eval_state(bvf, node); - else if (node->evst != NULL) + if (node->cur_edge == node->nb_edge && + node->evst != NULL) restore_eval_state(bvf, node); next->prev_node = get_node_idx(bvf, node); diff --git a/lib/librte_bpf/meson.build b/lib/librte_bpf/meson.build index de9de009..bc0cd78f 100644 --- a/lib/librte_bpf/meson.build +++ b/lib/librte_bpf/meson.build @@ -8,7 +8,7 @@ sources = files('bpf.c', 'bpf_pkt.c', 'bpf_validate.c') -if arch_subdir == 'x86' +if arch_subdir == 'x86' and cc.sizeof('void *') == 8 sources += files('bpf_jit_x86.c') endif diff --git a/lib/librte_bpf/rte_bpf.h b/lib/librte_bpf/rte_bpf.h index 1249a992..ad62ef2c 100644 --- a/lib/librte_bpf/rte_bpf.h +++ b/lib/librte_bpf/rte_bpf.h @@ -40,7 +40,11 @@ enum rte_bpf_arg_type { */ struct rte_bpf_arg { enum rte_bpf_arg_type type; - size_t size; /**< for pointer types, size of data it points to */ + /** + * for ptr type - max size of data buffer it points to + * for raw type - the size (in bytes) of the value + */ + size_t size; size_t buf_size; /**< for mbuf ptr type, max size of rte_mbuf data buffer */ }; @@ -66,10 +70,19 @@ struct rte_bpf_xsym { const char *name; /**< name */ enum rte_bpf_xtype type; /**< type */ union { - uint64_t (*func)(uint64_t, uint64_t, uint64_t, + struct { + uint64_t (*val)(uint64_t, uint64_t, uint64_t, uint64_t, uint64_t); - void *var; - }; /**< value */ + uint32_t nb_args; + struct rte_bpf_arg args[EBPF_FUNC_MAX_ARGS]; + /**< Function arguments descriptions. */ + struct rte_bpf_arg ret; /**< function return value. */ + } func; + struct { + void *val; /**< actual memory location */ + struct rte_bpf_arg desc; /**< type, size, etc. */ + } var; /**< external variable */ + }; }; /** diff --git a/lib/librte_cmdline/cmdline_parse.c b/lib/librte_cmdline/cmdline_parse.c index 961f9bef..9666e90c 100644 --- a/lib/librte_cmdline/cmdline_parse.c +++ b/lib/librte_cmdline/cmdline_parse.c @@ -208,9 +208,6 @@ cmdline_parse(struct cmdline *cl, const char * buf) int err = CMDLINE_PARSE_NOMATCH; int tok; cmdline_parse_ctx_t *ctx; -#ifdef RTE_LIBRTE_CMDLINE_DEBUG - char debug_buf[BUFSIZ]; -#endif char *result_buf = result.buf; if (!cl || !buf) @@ -250,10 +247,8 @@ cmdline_parse(struct cmdline *cl, const char * buf) return linelen; } -#ifdef RTE_LIBRTE_CMDLINE_DEBUG - strlcpy(debug_buf, buf, (linelen > 64 ? 64 : linelen)); - debug_printf("Parse line : len=%d, <%s>\n", linelen, debug_buf); -#endif + debug_printf("Parse line : len=%d, <%.*s>\n", + linelen, linelen > 64 ? 64 : linelen, buf); /* parse it !! 
*/ inst = ctx[inst_num]; diff --git a/lib/librte_compat/Makefile b/lib/librte_compat/Makefile index 0c57533c..61089fe7 100644 --- a/lib/librte_compat/Makefile +++ b/lib/librte_compat/Makefile @@ -1,33 +1,6 @@ -# BSD LICENSE -# -# Copyright(c) 2013 Neil Horman <nhorman@tuxdriver.com> -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in -# the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Intel Corporation nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2013 Neil Horman <nhorman@tuxdriver.com> +# All rights reserved. 
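The cmdline_parse() hunk above drops the bounce buffer and relies on printf precision instead: "%.*s" prints at most the given number of bytes, so a partial, possibly not NUL-terminated line can be logged without strlcpy(). A self-contained illustration of that idiom:

#include <stdio.h>

int
main(void)
{
	const char buf[] = "show port stats 0\ngarbage after the line";
	int linelen = 18;	/* bytes belonging to the current line */

	/* the precision caps the output, no intermediate copy needed */
	printf("Parse line : len=%d, <%.*s>\n",
		linelen, linelen > 64 ? 64 : linelen, buf);
	return 0;
}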
include $(RTE_SDK)/mk/rte.vars.mk diff --git a/lib/librte_compressdev/rte_comp.c b/lib/librte_compressdev/rte_comp.c index d596ba87..98ad0cfd 100644 --- a/lib/librte_compressdev/rte_comp.c +++ b/lib/librte_compressdev/rte_comp.c @@ -14,8 +14,12 @@ rte_comp_get_feature_name(uint64_t flag) return "STATEFUL_COMPRESSION"; case RTE_COMP_FF_STATEFUL_DECOMPRESSION: return "STATEFUL_DECOMPRESSION"; - case RTE_COMP_FF_MBUF_SCATTER_GATHER: - return "MBUF_SCATTER_GATHER"; + case RTE_COMP_FF_OOP_SGL_IN_SGL_OUT: + return "OOP_SGL_IN_SGL_OUT"; + case RTE_COMP_FF_OOP_SGL_IN_LB_OUT: + return "OOP_SGL_IN_LB_OUT"; + case RTE_COMP_FF_OOP_LB_IN_SGL_OUT: + return "OOP_LB_IN_SGL_OUT"; case RTE_COMP_FF_MULTI_PKT_CHECKSUM: return "MULTI_PKT_CHECKSUM"; case RTE_COMP_FF_ADLER32_CHECKSUM: @@ -32,6 +36,10 @@ rte_comp_get_feature_name(uint64_t flag) return "SHA2_SHA256_HASH"; case RTE_COMP_FF_SHAREABLE_PRIV_XFORM: return "SHAREABLE_PRIV_XFORM"; + case RTE_COMP_FF_HUFFMAN_FIXED: + return "HUFFMAN_FIXED"; + case RTE_COMP_FF_HUFFMAN_DYNAMIC: + return "HUFFMAN_DYNAMIC"; default: return NULL; } diff --git a/lib/librte_compressdev/rte_comp.h b/lib/librte_compressdev/rte_comp.h index 5b513c77..ee9056ea 100644 --- a/lib/librte_compressdev/rte_comp.h +++ b/lib/librte_compressdev/rte_comp.h @@ -30,28 +30,43 @@ extern "C" { /**< Stateful compression is supported */ #define RTE_COMP_FF_STATEFUL_DECOMPRESSION (1ULL << 1) /**< Stateful decompression is supported */ -#define RTE_COMP_FF_MBUF_SCATTER_GATHER (1ULL << 2) -/**< Scatter-gather mbufs are supported */ -#define RTE_COMP_FF_ADLER32_CHECKSUM (1ULL << 3) +#define RTE_COMP_FF_OOP_SGL_IN_SGL_OUT (1ULL << 2) +/**< Out-of-place Scatter-gather (SGL) buffers, + * with multiple segments, are supported in input and output + */ +#define RTE_COMP_FF_OOP_SGL_IN_LB_OUT (1ULL << 3) +/**< Out-of-place Scatter-gather (SGL) buffers are supported + * in input, combined with linear buffers (LB), with a + * single segment, in output + */ +#define RTE_COMP_FF_OOP_LB_IN_SGL_OUT (1ULL << 4) +/**< Out-of-place Scatter-gather (SGL) buffers are supported + * in output, combined with linear buffers (LB) in input + */ +#define RTE_COMP_FF_ADLER32_CHECKSUM (1ULL << 5) /**< Adler-32 Checksum is supported */ -#define RTE_COMP_FF_CRC32_CHECKSUM (1ULL << 4) +#define RTE_COMP_FF_CRC32_CHECKSUM (1ULL << 6) /**< CRC32 Checksum is supported */ -#define RTE_COMP_FF_CRC32_ADLER32_CHECKSUM (1ULL << 5) +#define RTE_COMP_FF_CRC32_ADLER32_CHECKSUM (1ULL << 7) /**< Adler-32/CRC32 Checksum is supported */ -#define RTE_COMP_FF_MULTI_PKT_CHECKSUM (1ULL << 6) +#define RTE_COMP_FF_MULTI_PKT_CHECKSUM (1ULL << 8) /**< Generation of checksum across multiple stateless packets is supported */ -#define RTE_COMP_FF_SHA1_HASH (1ULL << 7) +#define RTE_COMP_FF_SHA1_HASH (1ULL << 9) /**< SHA1 Hash is supported */ -#define RTE_COMP_FF_SHA2_SHA256_HASH (1ULL << 8) +#define RTE_COMP_FF_SHA2_SHA256_HASH (1ULL << 10) /**< SHA256 Hash of SHA2 family is supported */ -#define RTE_COMP_FF_NONCOMPRESSED_BLOCKS (1ULL << 9) +#define RTE_COMP_FF_NONCOMPRESSED_BLOCKS (1ULL << 11) /**< Creation of non-compressed blocks using RTE_COMP_LEVEL_NONE is supported */ -#define RTE_COMP_FF_SHAREABLE_PRIV_XFORM (1ULL << 10) +#define RTE_COMP_FF_SHAREABLE_PRIV_XFORM (1ULL << 12) /**< Private xforms created by the PMD can be shared * across multiple stateless operations. If not set, then app needs * to create as many priv_xforms as it expects to have stateless * operations in-flight. 
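Every RTE_COMP_FF_* constant above is a distinct single bit (the renumbering keeps them that way while inserting the new SGL/LB combinations), so a device's feature bitmask can be decoded by probing one bit at a time with rte_comp_get_feature_name(). A short sketch, assuming only the declarations shown in this diff:

#include <stdio.h>
#include <stdint.h>
#include <rte_comp.h>

static void
dump_comp_features(uint64_t ff)
{
	uint64_t flag;
	const char *name;

	for (flag = 1; flag != 0; flag <<= 1) {
		if ((ff & flag) == 0)
			continue;
		name = rte_comp_get_feature_name(flag);
		printf("\t%s\n", name != NULL ? name : "UNKNOWN");
	}
}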
*/ +#define RTE_COMP_FF_HUFFMAN_FIXED (1ULL << 13) +/**< Fixed huffman encoding is supported */ +#define RTE_COMP_FF_HUFFMAN_DYNAMIC (1ULL << 14) +/**< Dynamic huffman encoding is supported */ /** Status of comp operation */ enum rte_comp_op_status { diff --git a/lib/librte_compressdev/rte_compressdev.c b/lib/librte_compressdev/rte_compressdev.c index 6a38917d..9091dd6e 100644 --- a/lib/librte_compressdev/rte_compressdev.c +++ b/lib/librte_compressdev/rte_compressdev.c @@ -764,10 +764,7 @@ rte_compressdev_name_get(uint8_t dev_id) return dev->data->name; } -RTE_INIT(rte_compressdev_log); - -static void -rte_compressdev_log(void) +RTE_INIT(rte_compressdev_log) { compressdev_logtype = rte_log_register("lib.compressdev"); if (compressdev_logtype >= 0) diff --git a/lib/librte_cryptodev/Makefile b/lib/librte_cryptodev/Makefile index bba8dee9..c1148887 100644 --- a/lib/librte_cryptodev/Makefile +++ b/lib/librte_cryptodev/Makefile @@ -23,6 +23,7 @@ SYMLINK-y-include += rte_crypto.h SYMLINK-y-include += rte_crypto_sym.h SYMLINK-y-include += rte_cryptodev.h SYMLINK-y-include += rte_cryptodev_pmd.h +SYMLINK-y-include += rte_crypto_asym.h # versioning export map EXPORT_MAP := rte_cryptodev_version.map diff --git a/lib/librte_cryptodev/meson.build b/lib/librte_cryptodev/meson.build index bd5fed89..295f509e 100644 --- a/lib/librte_cryptodev/meson.build +++ b/lib/librte_cryptodev/meson.build @@ -6,5 +6,6 @@ sources = files('rte_cryptodev.c', 'rte_cryptodev_pmd.c') headers = files('rte_cryptodev.h', 'rte_cryptodev_pmd.h', 'rte_crypto.h', - 'rte_crypto_sym.h') + 'rte_crypto_sym.h', + 'rte_crypto_asym.h') deps += ['kvargs', 'mbuf'] diff --git a/lib/librte_cryptodev/rte_crypto.h b/lib/librte_cryptodev/rte_crypto.h index 25404264..fd5ef3a8 100644 --- a/lib/librte_cryptodev/rte_crypto.h +++ b/lib/librte_cryptodev/rte_crypto.h @@ -23,6 +23,7 @@ extern "C" { #include <rte_common.h> #include "rte_crypto_sym.h" +#include "rte_crypto_asym.h" /** Crypto operation types */ enum rte_crypto_op_type { @@ -30,6 +31,8 @@ enum rte_crypto_op_type { /**< Undefined operation type */ RTE_CRYPTO_OP_TYPE_SYMMETRIC, /**< Symmetric operation */ + RTE_CRYPTO_OP_TYPE_ASYMMETRIC + /**< Asymmetric operation */ }; /** Status of crypto operation */ @@ -73,26 +76,37 @@ enum rte_crypto_op_sess_type { * rte_cryptodev_enqueue_burst() / rte_cryptodev_dequeue_burst() . */ struct rte_crypto_op { - uint8_t type; - /**< operation type */ - uint8_t status; - /**< - * operation status - this is reset to - * RTE_CRYPTO_OP_STATUS_NOT_PROCESSED on allocation from mempool and - * will be set to RTE_CRYPTO_OP_STATUS_SUCCESS after crypto operation - * is successfully processed by a crypto PMD - */ - uint8_t sess_type; - /**< operation session type */ - uint16_t private_data_offset; - /**< Offset to indicate start of private data (if any). The offset - * is counted from the start of the rte_crypto_op including IV. 
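In the rte_crypto_op hunk above, the type/status/sess_type/reserved/private_data_offset fields are overlaid with a single uint64_t raw, so the whole 64-bit header can be cleared in one store before the individual fields are set. A hedged sketch of that pattern; the helper name is hypothetical, not the library's reset routine:

static inline void
crypto_op_header_reset(struct rte_crypto_op *op, uint8_t type)
{
	/* one 64-bit store zeroes all overlaid header fields at once */
	op->raw = 0;
	op->type = type;
	op->status = RTE_CRYPTO_OP_STATUS_NOT_PROCESSED;
}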
- * The private data may be used by the application to store - * information which should remain untouched in the library/driver - */ - - uint8_t reserved[3]; - /**< Reserved bytes to fill 64 bits for future additions */ + __extension__ + union { + uint64_t raw; + __extension__ + struct { + uint8_t type; + /**< operation type */ + uint8_t status; + /**< + * operation status - this is reset to + * RTE_CRYPTO_OP_STATUS_NOT_PROCESSED on allocation + * from mempool and will be set to + * RTE_CRYPTO_OP_STATUS_SUCCESS after crypto operation + * is successfully processed by a crypto PMD + */ + uint8_t sess_type; + /**< operation session type */ + uint8_t reserved[3]; + /**< Reserved bytes to fill 64 bits for + * future additions + */ + uint16_t private_data_offset; + /**< Offset to indicate start of private data (if any). + * The offset is counted from the start of the + * rte_crypto_op including IV. + * The private data may be used by the application + * to store information which should remain untouched + * in the library/driver + */ + }; + }; struct rte_mempool *mempool; /**< crypto operation mempool which operation is allocated from */ @@ -103,6 +117,10 @@ struct rte_crypto_op { union { struct rte_crypto_sym_op sym[0]; /**< Symmetric operation parameters */ + + struct rte_crypto_asym_op asym[0]; + /**< Asymmetric operation parameters */ + }; /**< operation specific parameters */ }; @@ -123,6 +141,9 @@ __rte_crypto_op_reset(struct rte_crypto_op *op, enum rte_crypto_op_type type) case RTE_CRYPTO_OP_TYPE_SYMMETRIC: __rte_crypto_sym_op_reset(op->sym); break; + case RTE_CRYPTO_OP_TYPE_ASYMMETRIC: + memset(op->asym, 0, sizeof(struct rte_crypto_asym_op)); + break; case RTE_CRYPTO_OP_TYPE_UNDEFINED: default: break; @@ -289,9 +310,14 @@ __rte_crypto_op_get_priv_data(struct rte_crypto_op *op, uint32_t size) if (likely(op->mempool != NULL)) { priv_size = __rte_crypto_op_get_priv_data_size(op->mempool); - if (likely(priv_size >= size)) - return (void *)((uint8_t *)(op + 1) + + if (likely(priv_size >= size)) { + if (op->type == RTE_CRYPTO_OP_TYPE_SYMMETRIC) + return (void *)((uint8_t *)(op + 1) + sizeof(struct rte_crypto_sym_op)); + if (op->type == RTE_CRYPTO_OP_TYPE_ASYMMETRIC) + return (void *)((uint8_t *)(op + 1) + + sizeof(struct rte_crypto_asym_op)); + } } return NULL; @@ -394,6 +420,24 @@ rte_crypto_op_attach_sym_session(struct rte_crypto_op *op, return __rte_crypto_sym_op_attach_sym_session(op->sym, sess); } +/** + * Attach an asymmetric session to a crypto operation + * + * @param op crypto operation, must be of type asymmetric + * @param sess cryptodev session + */ +static inline int +rte_crypto_op_attach_asym_session(struct rte_crypto_op *op, + struct rte_cryptodev_asym_session *sess) +{ + if (unlikely(op->type != RTE_CRYPTO_OP_TYPE_ASYMMETRIC)) + return -1; + + op->sess_type = RTE_CRYPTO_OP_WITH_SESSION; + op->asym->session = sess; + return 0; +} + #ifdef __cplusplus } #endif diff --git a/lib/librte_cryptodev/rte_crypto_asym.h b/lib/librte_cryptodev/rte_crypto_asym.h new file mode 100644 index 00000000..5e185b2d --- /dev/null +++ b/lib/librte_cryptodev/rte_crypto_asym.h @@ -0,0 +1,496 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Cavium Networks + */ + +#ifndef _RTE_CRYPTO_ASYM_H_ +#define _RTE_CRYPTO_ASYM_H_ + +/** + * @file rte_crypto_asym.h + * + * RTE Definitions for Asymmetric Cryptography + * + * Defines asymmetric algorithms and modes, as well as supported + * asymmetric crypto operations.
+ */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <string.h> +#include <stdint.h> + +#include <rte_memory.h> +#include <rte_mempool.h> +#include <rte_common.h> + +typedef struct rte_crypto_param_t { + uint8_t *data; + /**< pointer to buffer holding data */ + rte_iova_t iova; + /**< IO address of data buffer */ + size_t length; + /**< length of data in bytes */ +} rte_crypto_param; + +/** asym xform type name strings */ +extern const char * +rte_crypto_asym_xform_strings[]; + +/** asym operations type name strings */ +extern const char * +rte_crypto_asym_op_strings[]; + +/** + * Asymmetric crypto transformation types. + * Each xform type maps to one asymmetric algorithm + * performing specific operation + * + */ +enum rte_crypto_asym_xform_type { + RTE_CRYPTO_ASYM_XFORM_UNSPECIFIED = 0, + /**< Invalid xform. */ + RTE_CRYPTO_ASYM_XFORM_NONE, + /**< Xform type None. + * May be supported by PMD to support + * passthrough op for debugging purpose. + * if xform_type is none, op_type is disregarded. + */ + RTE_CRYPTO_ASYM_XFORM_RSA, + /**< RSA. Performs Encrypt, Decrypt, Sign and Verify. + * Refer to rte_crypto_asym_op_type + */ + RTE_CRYPTO_ASYM_XFORM_DH, + /**< Diffie-Hellman. + * Performs Key Generate and Shared Secret Compute. + * Refer to rte_crypto_asym_op_type + */ + RTE_CRYPTO_ASYM_XFORM_DSA, + /**< Digital Signature Algorithm + * Performs Signature Generation and Verification. + * Refer to rte_crypto_asym_op_type + */ + RTE_CRYPTO_ASYM_XFORM_MODINV, + /**< Modular Inverse + * Perform Modulus inverse b^(-1) mod n + */ + RTE_CRYPTO_ASYM_XFORM_MODEX, + /**< Modular Exponentiation + * Perform Modular Exponentiation b^e mod n + */ + RTE_CRYPTO_ASYM_XFORM_TYPE_LIST_END + /**< End of list */ +}; + +/** + * Asymmetric crypto operation type variants + */ +enum rte_crypto_asym_op_type { + RTE_CRYPTO_ASYM_OP_ENCRYPT, + /**< Asymmetric Encrypt operation */ + RTE_CRYPTO_ASYM_OP_DECRYPT, + /**< Asymmetric Decrypt operation */ + RTE_CRYPTO_ASYM_OP_SIGN, + /**< Signature Generation operation */ + RTE_CRYPTO_ASYM_OP_VERIFY, + /**< Signature Verification operation */ + RTE_CRYPTO_ASYM_OP_PRIVATE_KEY_GENERATE, + /**< DH Private Key generation operation */ + RTE_CRYPTO_ASYM_OP_PUBLIC_KEY_GENERATE, + /**< DH Public Key generation operation */ + RTE_CRYPTO_ASYM_OP_SHARED_SECRET_COMPUTE, + /**< DH Shared Secret compute operation */ + RTE_CRYPTO_ASYM_OP_LIST_END +}; + +/** + * Padding types for RSA signature. + */ +enum rte_crypto_rsa_padding_type { + RTE_CRYPTO_RSA_PADDING_NONE = 0, + /**< RSA no padding scheme */ + RTE_CRYPTO_RSA_PKCS1_V1_5_BT0, + /**< RSA PKCS#1 V1.5 Block Type 0 padding scheme + * as described in RFC 2313 + */ + RTE_CRYPTO_RSA_PKCS1_V1_5_BT1, + /**< RSA PKCS#1 V1.5 Block Type 01 padding scheme + * as described in RFC 2313 + */ + RTE_CRYPTO_RSA_PKCS1_V1_5_BT2, + /**< RSA PKCS#1 V1.5 Block Type 02 padding scheme + * as described in RFC 2313 + */ + RTE_CRYPTO_RSA_PADDING_OAEP, + /**< RSA PKCS#1 OAEP padding scheme */ + RTE_CRYPTO_RSA_PADDING_PSS, + /**< RSA PKCS#1 PSS padding scheme */ + RTE_CRYPTO_RSA_PADDING_TYPE_LIST_END +}; + +/** + * RSA private key type enumeration + * + * enumerates private key format required to perform RSA crypto + * transform. + * + */ +enum rte_crypto_rsa_priv_key_type { + RTE_RSA_KEY_TYPE_EXP, + /**< RSA private key is an exponent */ + RTE_RSA_KET_TYPE_QT, + /**< RSA private key is in quintuple format + * See rte_crypto_rsa_priv_key_qt + */ +}; + +/** + * Structure describing RSA private key in quintuple format. + * See PKCS V1.5 RSA Cryptography Standard.
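Every operand in this header travels as an rte_crypto_param triplet of virtual address, IO address and byte length. A small helper for wrapping a buffer, assuming the buffer came from rte_malloc() so that rte_malloc_virt2iova() can supply the IO address (illustrative only):

    #include <rte_malloc.h>

    static rte_crypto_param
    param_from_rte_malloc(void *buf, size_t len)
    {
        rte_crypto_param p = {
            .data = buf,
            .iova = rte_malloc_virt2iova(buf), /* valid for rte_malloc'd memory */
            .length = len,
        };
        return p;
    }
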
+ */ +struct rte_crypto_rsa_priv_key_qt { + rte_crypto_param p; + /**< p - Private key component P + * Private key component of RSA parameter required for CRT method + * of private key operations in Octet-string network byte order + * format. + */ + + rte_crypto_param q; + /**< q - Private key component Q + * Private key component of RSA parameter required for CRT method + * of private key operations in Octet-string network byte order + * format. + */ + + rte_crypto_param dP; + /**< dP - Private CRT component + * Private CRT component of RSA parameter required for CRT method + * RSA private key operations in Octet-string network byte order + * format. + * dP = d mod ( p - 1 ) + */ + + rte_crypto_param dQ; + /**< dQ - Private CRT component + * Private CRT component of RSA parameter required for CRT method + * RSA private key operations in Octet-string network byte order + * format. + * dQ = d mod ( q - 1 ) + */ + + rte_crypto_param qInv; + /**< qInv - Private CRT component + * Private CRT component of RSA parameter required for CRT method + * RSA private key operations in Octet-string network byte order + * format. + * qInv = inv q mod p + */ +}; + +/** + * Asymmetric RSA transform data + * + * Structure describing RSA xform params + * + */ +struct rte_crypto_rsa_xform { + rte_crypto_param n; + /**< n - Prime modulus + * Prime modulus data of RSA operation in Octet-string network + * byte order format. + */ + + rte_crypto_param e; + /**< e - Public key exponent + * Public key exponent used for RSA public key operations in Octet- + * string network byte order format. + */ + + enum rte_crypto_rsa_priv_key_type key_type; + + __extension__ + union { + rte_crypto_param d; + /**< d - Private key exponent + * Private key exponent used for RSA + * private key operations in + * Octet-string network byte order format. + */ + + struct rte_crypto_rsa_priv_key_qt qt; + /**< qt - Private key in quintuple format */ + }; +}; + +/** + * Asymmetric Modular exponentiation transform data + * + * Structure describing modular exponentiation xform params + * + */ +struct rte_crypto_modex_xform { + rte_crypto_param modulus; + /**< modulus + * Prime modulus of the modexp transform operation in octet-string + * network byte order format. + */ + + rte_crypto_param exponent; + /**< exponent + * Private exponent of the modexp transform operation in + * octet-string network byte order format. + */ +}; + +/** + * Asymmetric modular inverse transform operation + * + * Structure describing modulus inverse xform params + * + */ +struct rte_crypto_modinv_xform { + rte_crypto_param modulus; + /**< + * Pointer to the prime modulus data for modular + * inverse operation in octet-string network byte + * order format. + */ +}; + +/** + * Asymmetric DH transform data + * + * Structure describing Diffie-Hellman xform params + * + */ +struct rte_crypto_dh_xform { + enum rte_crypto_asym_op_type type; + /**< Setup xform for key generate or shared secret compute */ + + rte_crypto_param p; + /**< p : Prime modulus data + * DH prime modulus data in octet-string network byte order format. + * + */ + + rte_crypto_param g; + /**< g : Generator + * DH group generator data in octet-string network byte order + * format. + * + */ +}; + +/** + * Asymmetric Digital Signature transform operation + * + * Structure describing DSA xform params + * + */ +struct rte_crypto_dsa_xform { + rte_crypto_param p; + /**< p - Prime modulus + * Prime modulus data for DSA operation in Octet-string network byte + * order format.
+ */ + rte_crypto_param q; + /**< q : Order of the subgroup. + * Order of the subgroup data in Octet-string network byte order + * format. + * (p-1) % q = 0 + */ + rte_crypto_param g; + /**< g: Generator of the subgroup + * Generator data in Octet-string network byte order format. + */ + rte_crypto_param x; + /**< x: Private key of the signer in octet-string network + * byte order format. + * Used when app has pre-defined private key. + * Valid only when xform chain is DSA ONLY. + * if xform chain is DH private key generate + DSA, then DSA sign + * compute will use internally generated key. + */ +}; + +/** + * Operations params for modular operations: + * exponentiation and invert + * + */ +struct rte_crypto_mod_op_param { + rte_crypto_param base; + /**< + * Pointer to base of modular exponentiation/inversion data in + * Octet-string network byte order format. + */ +}; + +/** + * Asymmetric crypto transform data + * + * Structure describing asym xforms. + */ +struct rte_crypto_asym_xform { + struct rte_crypto_asym_xform *next; + /**< Pointer to next xform to set up xform chain.*/ + enum rte_crypto_asym_xform_type xform_type; + /**< Asymmetric crypto transform */ + + __extension__ + union { + struct rte_crypto_rsa_xform rsa; + /**< RSA xform parameters */ + + struct rte_crypto_modex_xform modex; + /**< Modular Exponentiation xform parameters */ + + struct rte_crypto_modinv_xform modinv; + /**< Modulus Inverse xform parameters */ + + struct rte_crypto_dh_xform dh; + /**< DH xform parameters */ + + struct rte_crypto_dsa_xform dsa; + /**< DSA xform parameters */ + }; +}; + +struct rte_cryptodev_asym_session; + +/** + * RSA operation params + * + */ +struct rte_crypto_rsa_op_param { + enum rte_crypto_asym_op_type op_type; + /**< Type of RSA operation for transform */; + + rte_crypto_param message; + /**< + * Pointer to data + * - to be encrypted for RSA public encrypt. + * - to be decrypted for RSA private decrypt. + * - to be signed for RSA sign generation. + * - to be authenticated for RSA sign verification. + */ + + rte_crypto_param sign; + /**< + * Pointer to RSA signature data. If operation is RSA + * sign @ref RTE_CRYPTO_ASYM_OP_SIGN, buffer will be + * over-written with generated signature. + * + * Length of the signature data will be equal to the + * RSA prime modulus length. + */ + + enum rte_crypto_rsa_padding_type pad; + /**< RSA padding scheme to be used for transform */ + + enum rte_crypto_auth_algorithm md; + /**< Hash algorithm to be used for data hash if padding + * scheme is either OAEP or PSS. Valid hash algorithms + * are: + * MD5, SHA1, SHA224, SHA256, SHA384, SHA512 + */ + + enum rte_crypto_auth_algorithm mgf1md; + /**< + * Hash algorithm to be used for mask generation if + * padding scheme is either OAEP or PSS. If padding + * scheme is unspecified data hash algorithm is used + * for mask generation. Valid hash algorithms are: + * MD5, SHA1, SHA224, SHA256, SHA384, SHA512 + */ +}; + +/** + * Diffie-Hellman Operations params. + * @note: + */ +struct rte_crypto_dh_op_param { + rte_crypto_param pub_key; + /**< + * Output generated public key when xform type is + * DH PUB_KEY_GENERATION. + * Input peer public key when xform type is DH + * SHARED_SECRET_COMPUTATION + * pub_key is in octet-string network byte order format. + * + */ + + rte_crypto_param priv_key; + /**< + * Output generated private key if xform type is + * DH PRIVATE_KEY_GENERATION + * Input when xform type is DH SHARED_SECRET_COMPUTATION. + * priv_key is in octet-string network byte order format. 
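The per-algorithm structures above plug into the anonymous union of rte_crypto_asym_xform. A toy modular-exponentiation transform (3^5 mod 7) as a sketch, with single-byte big-endian octet strings per the network-byte-order convention used throughout this header; the iova fields are left at zero here and would have to be filled in for devices that DMA these buffers:

    static uint8_t modulus[]  = { 0x07 };   /* n = 7 */
    static uint8_t exponent[] = { 0x05 };   /* e = 5 */

    struct rte_crypto_asym_xform modex_xform = {
        .next = NULL,
        .xform_type = RTE_CRYPTO_ASYM_XFORM_MODEX,
        .modex = {
            .modulus  = { .data = modulus,  .length = sizeof(modulus) },
            .exponent = { .data = exponent, .length = sizeof(exponent) },
        },
    };
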
+ * + */ + + rte_crypto_param shared_secret; + /**< + * Output with calculated shared secret + * when dh xform set up with op type = SHARED_SECRET_COMPUTATION. + * shared_secret is an octet-string network byte order format. + * + */ +}; + +/** + * DSA Operations params + * + */ +struct rte_crypto_dsa_op_param { + enum rte_crypto_asym_op_type op_type; + /**< Signature Generation or Verification */ + rte_crypto_param message; + /**< input message to be signed or verified */ + rte_crypto_param r; + /**< dsa sign component 'r' value + * + * output if op_type = sign generate, + * input if op_type = sign verify + */ + rte_crypto_param s; + /**< dsa sign component 's' value + * + * output if op_type = sign generate, + * input if op_type = sign verify + */ + rte_crypto_param y; + /**< y : Public key of the signer. + * Public key data of the signer in Octet-string network byte order + * format. + * y = g^x mod p + */ +}; + +/** + * Asymmetric Cryptographic Operation. + * + * Structure describing asymmetric crypto operation params. + * + */ +struct rte_crypto_asym_op { + struct rte_cryptodev_asym_session *session; + /**< Handle for the initialised session context */ + + __extension__ + union { + struct rte_crypto_rsa_op_param rsa; + struct rte_crypto_mod_op_param modex; + struct rte_crypto_mod_op_param modinv; + struct rte_crypto_dh_op_param dh; + struct rte_crypto_dsa_op_param dsa; + }; +} __rte_cache_aligned; + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_CRYPTO_ASYM_H_ */ diff --git a/lib/librte_cryptodev/rte_cryptodev.c b/lib/librte_cryptodev/rte_cryptodev.c index 7e582124..63ae23f0 100644 --- a/lib/librte_cryptodev/rte_cryptodev.c +++ b/lib/librte_cryptodev/rte_cryptodev.c @@ -166,6 +166,31 @@ rte_crypto_aead_operation_strings[] = { [RTE_CRYPTO_AEAD_OP_DECRYPT] = "decrypt" }; +/** + * Asymmetric crypto transform operation strings identifiers. + */ +const char *rte_crypto_asym_xform_strings[] = { + [RTE_CRYPTO_ASYM_XFORM_NONE] = "none", + [RTE_CRYPTO_ASYM_XFORM_RSA] = "rsa", + [RTE_CRYPTO_ASYM_XFORM_MODEX] = "modexp", + [RTE_CRYPTO_ASYM_XFORM_MODINV] = "modinv", + [RTE_CRYPTO_ASYM_XFORM_DH] = "dh", + [RTE_CRYPTO_ASYM_XFORM_DSA] = "dsa", +}; + +/** + * Asymmetric crypto operation strings identifiers. + */ +const char *rte_crypto_asym_op_strings[] = { + [RTE_CRYPTO_ASYM_OP_ENCRYPT] = "encrypt", + [RTE_CRYPTO_ASYM_OP_DECRYPT] = "decrypt", + [RTE_CRYPTO_ASYM_OP_SIGN] = "sign", + [RTE_CRYPTO_ASYM_OP_VERIFY] = "verify", + [RTE_CRYPTO_ASYM_OP_PRIVATE_KEY_GENERATE] = "priv_key_generate", + [RTE_CRYPTO_ASYM_OP_PUBLIC_KEY_GENERATE] = "pub_key_generate", + [RTE_CRYPTO_ASYM_OP_SHARED_SECRET_COMPUTE] = "sharedsecret_compute", +}; + int rte_cryptodev_get_cipher_algo_enum(enum rte_crypto_cipher_algorithm *algo_enum, const char *algo_string) @@ -217,6 +242,24 @@ rte_cryptodev_get_aead_algo_enum(enum rte_crypto_aead_algorithm *algo_enum, return -1; } +int __rte_experimental +rte_cryptodev_asym_get_xform_enum(enum rte_crypto_asym_xform_type *xform_enum, + const char *xform_string) +{ + unsigned int i; + + for (i = 1; i < RTE_DIM(rte_crypto_asym_xform_strings); i++) { + if (strcmp(xform_string, + rte_crypto_asym_xform_strings[i]) == 0) { + *xform_enum = (enum rte_crypto_asym_xform_type) i; + return 0; + } + } + + /* Invalid string */ + return -1; +} + /** * The crypto auth operation strings identifiers. * It could be used in application command line. 
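The exported string tables pair with rte_cryptodev_asym_get_xform_enum() for turning algorithm names, for example from a command line, into enum values; a small usage fragment:

    enum rte_crypto_asym_xform_type xform_type;

    if (rte_cryptodev_asym_get_xform_enum(&xform_type, "modexp") != 0)
        printf("unknown asymmetric xform name\n");   /* not in the table */
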
@@ -287,6 +330,28 @@ param_range_check(uint16_t size, const struct rte_crypto_param_range *range) return -1; } +const struct rte_cryptodev_asymmetric_xform_capability * __rte_experimental +rte_cryptodev_asym_capability_get(uint8_t dev_id, + const struct rte_cryptodev_asym_capability_idx *idx) +{ + const struct rte_cryptodev_capabilities *capability; + struct rte_cryptodev_info dev_info; + unsigned int i = 0; + + memset(&dev_info, 0, sizeof(struct rte_cryptodev_info)); + rte_cryptodev_info_get(dev_id, &dev_info); + + while ((capability = &dev_info.capabilities[i++])->op != + RTE_CRYPTO_OP_TYPE_UNDEFINED) { + if (capability->op != RTE_CRYPTO_OP_TYPE_ASYMMETRIC) + continue; + + if (capability->asym.xform_capa.xform_type == idx->type) + return &capability->asym.xform_capa; + } + return NULL; +}; + int rte_cryptodev_sym_capability_check_cipher( const struct rte_cryptodev_symmetric_capability *capability, @@ -338,6 +403,42 @@ rte_cryptodev_sym_capability_check_aead( return 0; } +int __rte_experimental +rte_cryptodev_asym_xform_capability_check_optype( + const struct rte_cryptodev_asymmetric_xform_capability *capability, + enum rte_crypto_asym_op_type op_type) +{ + if (capability->op_types & (1 << op_type)) + return 1; + + return 0; +} + +int __rte_experimental +rte_cryptodev_asym_xform_capability_check_modlen( + const struct rte_cryptodev_asymmetric_xform_capability *capability, + uint16_t modlen) +{ + /* no need to check for limits, if min or max = 0 */ + if (capability->modlen.min != 0) { + if (modlen < capability->modlen.min) + return -1; + } + + if (capability->modlen.max != 0) { + if (modlen > capability->modlen.max) + return -1; + } + + /* in any case, check if given modlen is module increment */ + if (capability->modlen.increment != 0) { + if (modlen % (capability->modlen.increment)) + return -1; + } + + return 0; +} + const char * rte_cryptodev_get_feature_name(uint64_t flag) @@ -361,8 +462,16 @@ rte_cryptodev_get_feature_name(uint64_t flag) return "CPU_AESNI"; case RTE_CRYPTODEV_FF_HW_ACCELERATED: return "HW_ACCELERATED"; - case RTE_CRYPTODEV_FF_MBUF_SCATTER_GATHER: - return "MBUF_SCATTER_GATHER"; + case RTE_CRYPTODEV_FF_IN_PLACE_SGL: + return "IN_PLACE_SGL"; + case RTE_CRYPTODEV_FF_OOP_SGL_IN_SGL_OUT: + return "OOP_SGL_IN_SGL_OUT"; + case RTE_CRYPTODEV_FF_OOP_SGL_IN_LB_OUT: + return "OOP_SGL_IN_LB_OUT"; + case RTE_CRYPTODEV_FF_OOP_LB_IN_SGL_OUT: + return "OOP_LB_IN_SGL_OUT"; + case RTE_CRYPTODEV_FF_OOP_LB_IN_LB_OUT: + return "OOP_LB_IN_LB_OUT"; case RTE_CRYPTODEV_FF_CPU_NEON: return "CPU_NEON"; case RTE_CRYPTODEV_FF_CPU_ARM_CE: @@ -703,50 +812,6 @@ rte_cryptodev_queue_pairs_config(struct rte_cryptodev *dev, uint16_t nb_qpairs, } int -rte_cryptodev_queue_pair_start(uint8_t dev_id, uint16_t queue_pair_id) -{ - struct rte_cryptodev *dev; - - if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) { - CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id); - return -EINVAL; - } - - dev = &rte_crypto_devices[dev_id]; - if (queue_pair_id >= dev->data->nb_queue_pairs) { - CDEV_LOG_ERR("Invalid queue_pair_id=%d", queue_pair_id); - return -EINVAL; - } - - RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_pair_start, -ENOTSUP); - - return dev->dev_ops->queue_pair_start(dev, queue_pair_id); - -} - -int -rte_cryptodev_queue_pair_stop(uint8_t dev_id, uint16_t queue_pair_id) -{ - struct rte_cryptodev *dev; - - if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) { - CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id); - return -EINVAL; - } - - dev = &rte_crypto_devices[dev_id]; - if (queue_pair_id >= dev->data->nb_queue_pairs) { - 
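Capability discovery follows the symmetric pattern: look up the capability for an xform, then validate parameters against it. A sketch that checks whether a device can do 1024-bit modular exponentiation; the modlen unit is assumed here to be bytes, which the hunk itself does not spell out:

    static int
    modex_1024b_supported(uint8_t dev_id)
    {
        struct rte_cryptodev_asym_capability_idx idx = {
            .type = RTE_CRYPTO_ASYM_XFORM_MODEX,
        };
        const struct rte_cryptodev_asymmetric_xform_capability *cap;

        cap = rte_cryptodev_asym_capability_get(dev_id, &idx);
        /* 128 bytes == 1024 bits; 0 from check_modlen means in range */
        return cap != NULL &&
            rte_cryptodev_asym_xform_capability_check_modlen(cap, 128) == 0;
    }
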
CDEV_LOG_ERR("Invalid queue_pair_id=%d", queue_pair_id); - return -EINVAL; - } - - RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_pair_stop, -ENOTSUP); - - return dev->dev_ops->queue_pair_stop(dev, queue_pair_id); - -} - -int rte_cryptodev_configure(uint8_t dev_id, struct rte_cryptodev_config *config) { struct rte_cryptodev *dev; @@ -966,6 +1031,7 @@ rte_cryptodev_info_get(uint8_t dev_id, struct rte_cryptodev_info *dev_info) (*dev->dev_ops->dev_infos_get)(dev, dev_info); dev_info->driver_name = dev->device->driver->name; + dev_info->device = dev->device; } @@ -1098,8 +1164,46 @@ rte_cryptodev_sym_session_init(uint8_t dev_id, index = dev->driver_id; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->sym_session_configure, -ENOTSUP); + if (sess->sess_private_data[index] == NULL) { - ret = dev->dev_ops->session_configure(dev, xforms, sess, mp); + ret = dev->dev_ops->sym_session_configure(dev, xforms, + sess, mp); + if (ret < 0) { + CDEV_LOG_ERR( + "dev_id %d failed to configure session details", + dev_id); + return ret; + } + } + + return 0; +} + +int __rte_experimental +rte_cryptodev_asym_session_init(uint8_t dev_id, + struct rte_cryptodev_asym_session *sess, + struct rte_crypto_asym_xform *xforms, + struct rte_mempool *mp) +{ + struct rte_cryptodev *dev; + uint8_t index; + int ret; + + dev = rte_cryptodev_pmd_get_dev(dev_id); + + if (sess == NULL || xforms == NULL || dev == NULL) + return -EINVAL; + + index = dev->driver_id; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->asym_session_configure, + -ENOTSUP); + + if (sess->sess_private_data[index] == NULL) { + ret = dev->dev_ops->asym_session_configure(dev, + xforms, + sess, mp); if (ret < 0) { CDEV_LOG_ERR( "dev_id %d failed to configure session details", @@ -1123,70 +1227,53 @@ rte_cryptodev_sym_session_create(struct rte_mempool *mp) } /* Clear device session pointer. - * Include the flag indicating presence of private data + * Include the flag indicating presence of user data */ memset(sess, 0, (sizeof(void *) * nb_drivers) + sizeof(uint8_t)); return sess; } -int -rte_cryptodev_queue_pair_attach_sym_session(uint8_t dev_id, uint16_t qp_id, - struct rte_cryptodev_sym_session *sess) +struct rte_cryptodev_asym_session * __rte_experimental +rte_cryptodev_asym_session_create(struct rte_mempool *mp) { - struct rte_cryptodev *dev; + struct rte_cryptodev_asym_session *sess; - if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) { - CDEV_LOG_ERR("Invalid dev_id=%d", dev_id); - return -EINVAL; + /* Allocate a session structure from the session pool */ + if (rte_mempool_get(mp, (void **)&sess)) { + CDEV_LOG_ERR("couldn't get object from session mempool"); + return NULL; } - dev = &rte_crypto_devices[dev_id]; - - /* The API is optional, not returning error if driver do not suuport */ - RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->qp_attach_session, 0); - - void *sess_priv = get_session_private_data(sess, dev->driver_id); - - if (dev->dev_ops->qp_attach_session(dev, qp_id, sess_priv)) { - CDEV_LOG_ERR("dev_id %d failed to attach qp: %d with session", - dev_id, qp_id); - return -EPERM; - } + /* Clear device session pointer. 
+ * Include the flag indicating presence of private data + */ + memset(sess, 0, (sizeof(void *) * nb_drivers) + sizeof(uint8_t)); - return 0; + return sess; } int -rte_cryptodev_queue_pair_detach_sym_session(uint8_t dev_id, uint16_t qp_id, +rte_cryptodev_sym_session_clear(uint8_t dev_id, struct rte_cryptodev_sym_session *sess) { struct rte_cryptodev *dev; - if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) { - CDEV_LOG_ERR("Invalid dev_id=%d", dev_id); - return -EINVAL; - } - - dev = &rte_crypto_devices[dev_id]; + dev = rte_cryptodev_pmd_get_dev(dev_id); - /* The API is optional, not returning error if driver do not suuport */ - RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->qp_detach_session, 0); + if (dev == NULL || sess == NULL) + return -EINVAL; - void *sess_priv = get_session_private_data(sess, dev->driver_id); + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->sym_session_clear, -ENOTSUP); - if (dev->dev_ops->qp_detach_session(dev, qp_id, sess_priv)) { - CDEV_LOG_ERR("dev_id %d failed to detach qp: %d from session", - dev_id, qp_id); - return -EPERM; - } + dev->dev_ops->sym_session_clear(dev, sess); return 0; } -int -rte_cryptodev_sym_session_clear(uint8_t dev_id, - struct rte_cryptodev_sym_session *sess) +int __rte_experimental +rte_cryptodev_asym_session_clear(uint8_t dev_id, + struct rte_cryptodev_asym_session *sess) { struct rte_cryptodev *dev; @@ -1195,7 +1282,9 @@ rte_cryptodev_sym_session_clear(uint8_t dev_id, if (dev == NULL || sess == NULL) return -EINVAL; - dev->dev_ops->session_clear(dev, sess); + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->asym_session_clear, -ENOTSUP); + + dev->dev_ops->asym_session_clear(dev, sess); return 0; } @@ -1212,7 +1301,7 @@ rte_cryptodev_sym_session_free(struct rte_cryptodev_sym_session *sess) /* Check that all device private data has been freed */ for (i = 0; i < nb_drivers; i++) { - sess_priv = get_session_private_data(sess, i); + sess_priv = get_sym_session_private_data(sess, i); if (sess_priv != NULL) return -EBUSY; } @@ -1224,27 +1313,51 @@ rte_cryptodev_sym_session_free(struct rte_cryptodev_sym_session *sess) return 0; } -unsigned int -rte_cryptodev_get_header_session_size(void) +int __rte_experimental +rte_cryptodev_asym_session_free(struct rte_cryptodev_asym_session *sess) { - return rte_cryptodev_sym_get_header_session_size(); + uint8_t i; + void *sess_priv; + struct rte_mempool *sess_mp; + + if (sess == NULL) + return -EINVAL; + + /* Check that all device private data has been freed */ + for (i = 0; i < nb_drivers; i++) { + sess_priv = get_asym_session_private_data(sess, i); + if (sess_priv != NULL) + return -EBUSY; + } + + /* Return session to mempool */ + sess_mp = rte_mempool_from_obj(sess); + rte_mempool_put(sess_mp, sess); + + return 0; } + unsigned int rte_cryptodev_sym_get_header_session_size(void) { /* * Header contains pointers to the private data * of all registered drivers, and a flag which - * indicates presence of private data + * indicates presence of user data */ return ((sizeof(void *) * nb_drivers) + sizeof(uint8_t)); } -unsigned int -rte_cryptodev_get_private_session_size(uint8_t dev_id) +unsigned int __rte_experimental +rte_cryptodev_asym_get_header_session_size(void) { - return rte_cryptodev_sym_get_private_session_size(dev_id); + /* + * Header contains pointers to the private data + * of all registered drivers, and a flag which + * indicates presence of private data + */ + return ((sizeof(void *) * nb_drivers) + sizeof(uint8_t)); } unsigned int @@ -1259,10 +1372,10 @@ rte_cryptodev_sym_get_private_session_size(uint8_t dev_id) dev = 
rte_cryptodev_pmd_get_dev(dev_id); - if (*dev->dev_ops->session_get_size == NULL) + if (*dev->dev_ops->sym_session_get_size == NULL) return 0; - priv_sess_size = (*dev->dev_ops->session_get_size)(dev); + priv_sess_size = (*dev->dev_ops->sym_session_get_size)(dev); /* * If size is less than session header size, @@ -1276,32 +1389,55 @@ rte_cryptodev_sym_get_private_session_size(uint8_t dev_id) } +unsigned int __rte_experimental +rte_cryptodev_asym_get_private_session_size(uint8_t dev_id) +{ + struct rte_cryptodev *dev; + unsigned int header_size = sizeof(void *) * nb_drivers; + unsigned int priv_sess_size; + + if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) + return 0; + + dev = rte_cryptodev_pmd_get_dev(dev_id); + + if (*dev->dev_ops->asym_session_get_size == NULL) + return 0; + + priv_sess_size = (*dev->dev_ops->asym_session_get_size)(dev); + if (priv_sess_size < header_size) + return header_size; + + return priv_sess_size; + +} + int __rte_experimental -rte_cryptodev_sym_session_set_private_data( +rte_cryptodev_sym_session_set_user_data( struct rte_cryptodev_sym_session *sess, void *data, uint16_t size) { uint16_t off_set = sizeof(void *) * nb_drivers; - uint8_t *private_data_present = (uint8_t *)sess + off_set; + uint8_t *user_data_present = (uint8_t *)sess + off_set; if (sess == NULL) return -EINVAL; - *private_data_present = 1; + *user_data_present = 1; off_set += sizeof(uint8_t); rte_memcpy((uint8_t *)sess + off_set, data, size); return 0; } void * __rte_experimental -rte_cryptodev_sym_session_get_private_data( +rte_cryptodev_sym_session_get_user_data( struct rte_cryptodev_sym_session *sess) { uint16_t off_set = sizeof(void *) * nb_drivers; - uint8_t *private_data_present = (uint8_t *)sess + off_set; + uint8_t *user_data_present = (uint8_t *)sess + off_set; - if (sess == NULL || !*private_data_present) + if (sess == NULL || !*user_data_present) return NULL; off_set += sizeof(uint8_t); @@ -1335,9 +1471,17 @@ rte_crypto_op_pool_create(const char *name, enum rte_crypto_op_type type, struct rte_crypto_op_pool_private *priv; unsigned elt_size = sizeof(struct rte_crypto_op) + - sizeof(struct rte_crypto_sym_op) + priv_size; + if (type == RTE_CRYPTO_OP_TYPE_SYMMETRIC) { + elt_size += sizeof(struct rte_crypto_sym_op); + } else if (type == RTE_CRYPTO_OP_TYPE_ASYMMETRIC) { + elt_size += sizeof(struct rte_crypto_asym_op); + } else { + CDEV_LOG_ERR("Invalid op_type\n"); + return NULL; + } + /* lookup mempool in case already allocated */ struct rte_mempool *mp = rte_mempool_lookup(name); diff --git a/lib/librte_cryptodev/rte_cryptodev.h b/lib/librte_cryptodev/rte_cryptodev.h index 92ce6d49..4099823f 100644 --- a/lib/librte_cryptodev/rte_cryptodev.h +++ b/lib/librte_cryptodev/rte_cryptodev.h @@ -1,32 +1,5 @@ -/*- - * - * Copyright(c) 2015-2017 Intel Corporation. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. 
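Since the mempool element size now depends on the op type, a pool only serves ops of the type it was created with. For example, a pool of asymmetric ops (counts and cache size below are arbitrary illustration values):

    #include <rte_lcore.h>

    struct rte_mempool *asym_op_pool;

    asym_op_pool = rte_crypto_op_pool_create("ASYM_OP_POOL",
            RTE_CRYPTO_OP_TYPE_ASYMMETRIC,
            1024,               /* number of ops */
            128,                /* per-lcore cache */
            0,                  /* no per-op private area */
            rte_socket_id());
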
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015-2017 Intel Corporation. */ #ifndef _RTE_CRYPTODEV_H_ @@ -65,7 +38,6 @@ extern const char **rte_cyptodev_names; RTE_FMT(RTE_FMT_HEAD(__VA_ARGS__,) "\n", \ RTE_FMT_TAIL(__VA_ARGS__,))) -#ifdef RTE_LIBRTE_CRYPTODEV_DEBUG #define CDEV_LOG_DEBUG(...) \ RTE_LOG(DEBUG, CRYPTODEV, \ RTE_FMT("%s() line %u: " RTE_FMT_HEAD(__VA_ARGS__,) "\n", \ @@ -76,13 +48,6 @@ extern const char **rte_cyptodev_names; RTE_FMT("[%s] %s: " RTE_FMT_HEAD(__VA_ARGS__,) "\n", \ dev, __func__, RTE_FMT_TAIL(__VA_ARGS__,))) -#else -#define CDEV_LOG_DEBUG(...) (void)0 -#define CDEV_PMD_TRACE(...) (void)0 -#endif - - - /** * A macro that points to an offset from the start * of the crypto operation structure (rte_crypto_op) @@ -178,6 +143,35 @@ struct rte_cryptodev_symmetric_capability { }; }; +/** + * Asymmetric Xform Crypto Capability + * + */ +struct rte_cryptodev_asymmetric_xform_capability { + enum rte_crypto_asym_xform_type xform_type; + /**< Transform type: RSA/MODEXP/DH/DSA/MODINV */ + + uint32_t op_types; + /**< bitmask for supported rte_crypto_asym_op_type */ + + __extension__ + union { + struct rte_crypto_param_range modlen; + /**< Range of modulus length supported by modulus based xform. + * Value 0 mean implementation default + */ + }; +}; + +/** + * Asymmetric Crypto Capability + * + */ +struct rte_cryptodev_asymmetric_capability { + struct rte_cryptodev_asymmetric_xform_capability xform_capa; +}; + + /** Structure used to capture a capability of a crypto device */ struct rte_cryptodev_capabilities { enum rte_crypto_op_type op; @@ -187,6 +181,8 @@ struct rte_cryptodev_capabilities { union { struct rte_cryptodev_symmetric_capability sym; /**< Symmetric operation capability parameters */ + struct rte_cryptodev_asymmetric_capability asym; + /**< Asymmetric operation capability parameters */ }; }; @@ -201,7 +197,17 @@ struct rte_cryptodev_sym_capability_idx { }; /** - * Provide capabilities available for defined device and algorithm + * Structure used to describe asymmetric crypto xforms + * Each xform maps to one asym algorithm. + * + */ +struct rte_cryptodev_asym_capability_idx { + enum rte_crypto_asym_xform_type type; + /**< Asymmetric xform (algo) type */ +}; + +/** + * Provide capabilities available for defined device and algorithm * * @param dev_id The identifier of the device. * @param idx Description of crypto algorithms. @@ -215,6 +221,20 @@ rte_cryptodev_sym_capability_get(uint8_t dev_id, const struct rte_cryptodev_sym_capability_idx *idx); /** + * Provide capabilities available for defined device and xform + * + * @param dev_id The identifier of the device. + * @param idx Description of asym crypto xform. 
+ * + * @return + * - Return description of the asymmetric crypto capability if exist. + * - Return NULL if the capability not exist. + */ +const struct rte_cryptodev_asymmetric_xform_capability * __rte_experimental +rte_cryptodev_asym_capability_get(uint8_t dev_id, + const struct rte_cryptodev_asym_capability_idx *idx); + +/** * Check if key size and initial vector are supported * in crypto cipher capability * @@ -270,6 +290,36 @@ rte_cryptodev_sym_capability_check_aead( uint16_t iv_size); /** + * Check if op type is supported + * + * @param capability Description of the asymmetric crypto capability. + * @param op_type op type + * + * @return + * - Return 1 if the op type is supported + * - Return 0 if unsupported + */ +int __rte_experimental +rte_cryptodev_asym_xform_capability_check_optype( + const struct rte_cryptodev_asymmetric_xform_capability *capability, + enum rte_crypto_asym_op_type op_type); + +/** + * Check if modulus length is in supported range + * + * @param capability Description of the asymmetric crypto capability. + * @param modlen modulus length. + * + * @return + * - Return 0 if the parameters are in range of the capability. + * - Return -1 if the parameters are out of range of the capability. + */ +int __rte_experimental +rte_cryptodev_asym_xform_capability_check_modlen( + const struct rte_cryptodev_asymmetric_xform_capability *capability, + uint16_t modlen); + +/** * Provide the cipher algorithm enum, given an algorithm string * * @param algo_enum A pointer to the cipher algorithm @@ -314,6 +364,22 @@ int rte_cryptodev_get_aead_algo_enum(enum rte_crypto_aead_algorithm *algo_enum, const char *algo_string); +/** + * Provide the Asymmetric xform enum, given an xform string + * + * @param xform_enum A pointer to the xform type + * enum to be filled + * @param xform_string xform string + * + * @return + * - Return -1 if string is not valid + * - Return 0 if the string is valid + */ +int __rte_experimental +rte_cryptodev_asym_get_xform_enum(enum rte_crypto_asym_xform_type *xform_enum, + const char *xform_string); + + /** Macro used at end of crypto PMD list */ #define RTE_CRYPTODEV_END_OF_CAPABILITIES_LIST() \ { RTE_CRYPTO_OP_TYPE_UNDEFINED } @@ -327,31 +393,50 @@ rte_cryptodev_get_aead_algo_enum(enum rte_crypto_aead_algorithm *algo_enum, * * Keep these flags synchronised with rte_cryptodev_get_feature_name() */ -#define RTE_CRYPTODEV_FF_SYMMETRIC_CRYPTO (1ULL << 0) +#define RTE_CRYPTODEV_FF_SYMMETRIC_CRYPTO (1ULL << 0) /**< Symmetric crypto operations are supported */ -#define RTE_CRYPTODEV_FF_ASYMMETRIC_CRYPTO (1ULL << 1) +#define RTE_CRYPTODEV_FF_ASYMMETRIC_CRYPTO (1ULL << 1) /**< Asymmetric crypto operations are supported */ -#define RTE_CRYPTODEV_FF_SYM_OPERATION_CHAINING (1ULL << 2) +#define RTE_CRYPTODEV_FF_SYM_OPERATION_CHAINING (1ULL << 2) /**< Chaining symmetric crypto operations are supported */ -#define RTE_CRYPTODEV_FF_CPU_SSE (1ULL << 3) +#define RTE_CRYPTODEV_FF_CPU_SSE (1ULL << 3) /**< Utilises CPU SIMD SSE instructions */ -#define RTE_CRYPTODEV_FF_CPU_AVX (1ULL << 4) +#define RTE_CRYPTODEV_FF_CPU_AVX (1ULL << 4) /**< Utilises CPU SIMD AVX instructions */ -#define RTE_CRYPTODEV_FF_CPU_AVX2 (1ULL << 5) +#define RTE_CRYPTODEV_FF_CPU_AVX2 (1ULL << 5) /**< Utilises CPU SIMD AVX2 instructions */ -#define RTE_CRYPTODEV_FF_CPU_AESNI (1ULL << 6) +#define RTE_CRYPTODEV_FF_CPU_AESNI (1ULL << 6) /**< Utilises CPU AES-NI instructions */ -#define RTE_CRYPTODEV_FF_HW_ACCELERATED (1ULL << 7) -/**< Operations are off-loaded to an external hardware accelerator */ -#define 
RTE_CRYPTODEV_FF_CPU_AVX512 (1ULL << 8) +#define RTE_CRYPTODEV_FF_HW_ACCELERATED (1ULL << 7) +/**< Operations are off-loaded to an + * external hardware accelerator + */ +#define RTE_CRYPTODEV_FF_CPU_AVX512 (1ULL << 8) /**< Utilises CPU SIMD AVX512 instructions */ -#define RTE_CRYPTODEV_FF_MBUF_SCATTER_GATHER (1ULL << 9) -/**< Scatter-gather mbufs are supported */ -#define RTE_CRYPTODEV_FF_CPU_NEON (1ULL << 10) +#define RTE_CRYPTODEV_FF_IN_PLACE_SGL (1ULL << 9) +/**< In-place Scatter-gather (SGL) buffers, with multiple segments, + * are supported + */ +#define RTE_CRYPTODEV_FF_OOP_SGL_IN_SGL_OUT (1ULL << 10) +/**< Out-of-place Scatter-gather (SGL) buffers are + * supported in input and output + */ +#define RTE_CRYPTODEV_FF_OOP_SGL_IN_LB_OUT (1ULL << 11) +/**< Out-of-place Scatter-gather (SGL) buffers are supported + * in input, combined with linear buffers (LB), with a + * single segment in output + */ +#define RTE_CRYPTODEV_FF_OOP_LB_IN_SGL_OUT (1ULL << 12) +/**< Out-of-place Scatter-gather (SGL) buffers are supported + * in output, combined with linear buffers (LB) in input + */ +#define RTE_CRYPTODEV_FF_OOP_LB_IN_LB_OUT (1ULL << 13) +/**< Out-of-place linear buffers (LB) are supported in input and output */ +#define RTE_CRYPTODEV_FF_CPU_NEON (1ULL << 14) /**< Utilises CPU NEON instructions */ -#define RTE_CRYPTODEV_FF_CPU_ARM_CE (1ULL << 11) +#define RTE_CRYPTODEV_FF_CPU_ARM_CE (1ULL << 15) /**< Utilises ARM CPU Cryptographic Extensions */ -#define RTE_CRYPTODEV_FF_SECURITY (1ULL << 12) +#define RTE_CRYPTODEV_FF_SECURITY (1ULL << 16) /**< Support Security Protocol Processing */ @@ -369,9 +454,9 @@ rte_cryptodev_get_feature_name(uint64_t flag); /** Crypto device information */ struct rte_cryptodev_info { - const char *driver_name; /**< Driver name. */ - uint8_t driver_id; /**< Driver identifier */ - struct rte_pci_device *pci_dev; /**< PCI information. */ + const char *driver_name; /**< Driver name. */ + uint8_t driver_id; /**< Driver identifier */ + struct rte_device *device; /**< Generic device information. */ uint64_t feature_flags; /**< Feature flags exposes HW/SW features for the given device */ @@ -382,12 +467,17 @@ struct rte_cryptodev_info { unsigned max_nb_queue_pairs; /**< Maximum number of queues pairs supported by device. */ + uint16_t min_mbuf_headroom_req; + /**< Minimum mbuf headroom required by device */ + + uint16_t min_mbuf_tailroom_req; + /**< Minimum mbuf tailroom required by device */ + struct { unsigned max_nb_sessions; - /**< Maximum number of sessions supported by device. */ - unsigned int max_nb_sessions_per_qp; - /**< Maximum number of sessions per queue pair. - * Default 0 for infinite sessions + /**< Maximum number of sessions supported by device. + * If 0, the device does not have any limitation in + * number of sessions that can be used. */ } sym; }; @@ -603,43 +693,6 @@ rte_cryptodev_queue_pair_setup(uint8_t dev_id, uint16_t queue_pair_id, struct rte_mempool *session_pool); /** - * @deprecated - * Start a specified queue pair of a device. It is used - * when deferred_start flag of the specified queue is true. - * - * @param dev_id The identifier of the device - * @param queue_pair_id The index of the queue pair to start. The value - * must be in the range [0, nb_queue_pair - 1] - * previously supplied to - * rte_crypto_dev_configure(). - * @return - * - 0: Success, the transmit queue is correctly set up. - * - -EINVAL: The dev_id or the queue_id out of range. - * - -ENOTSUP: The function not supported in PMD driver. 
- */ -__rte_deprecated -extern int -rte_cryptodev_queue_pair_start(uint8_t dev_id, uint16_t queue_pair_id); - -/** - * @deprecated - * Stop specified queue pair of a device - * - * @param dev_id The identifier of the device - * @param queue_pair_id The index of the queue pair to stop. The value - * must be in the range [0, nb_queue_pair - 1] - * previously supplied to - * rte_cryptodev_configure(). - * @return - * - 0: Success, the transmit queue is correctly set up. - * - -EINVAL: The dev_id or the queue_id out of range. - * - -ENOTSUP: The function not supported in PMD driver. - */ -__rte_deprecated -extern int -rte_cryptodev_queue_pair_stop(uint8_t dev_id, uint16_t queue_pair_id); - -/** * Get the number of queue pairs on a specific crypto device * * @param dev_id Crypto device identifier. @@ -902,9 +955,14 @@ rte_cryptodev_enqueue_burst(uint8_t dev_id, uint16_t qp_id, */ struct rte_cryptodev_sym_session { __extension__ void *sess_private_data[0]; - /**< Private session material */ + /**< Private symmetric session material */ }; +/** Cryptodev asymmetric crypto session */ +struct rte_cryptodev_asym_session { + __extension__ void *sess_private_data[0]; + /**< Private asymmetric session material */ +}; /** * Create symmetric crypto session header (generic with no private data) @@ -919,6 +977,18 @@ struct rte_cryptodev_sym_session * rte_cryptodev_sym_session_create(struct rte_mempool *mempool); /** + * Create asymmetric crypto session header (generic with no private data) + * + * @param mempool mempool to allocate asymmetric session + * objects from + * @return + * - On success return pointer to asym-session + * - On failure returns NULL + */ +struct rte_cryptodev_asym_session * __rte_experimental +rte_cryptodev_asym_session_create(struct rte_mempool *mempool); + +/** * Frees symmetric crypto session header, after checking that all * the device private data has been freed, returning it * to its original mempool. @@ -934,6 +1004,21 @@ int rte_cryptodev_sym_session_free(struct rte_cryptodev_sym_session *sess); /** + * Frees asymmetric crypto session header, after checking that all + * the device private data has been freed, returning it + * to its original mempool. + * + * @param sess Session header to be freed. + * + * @return + * - 0 if successful. + * - -EINVAL if session is NULL. + * - -EBUSY if not all device private data has been freed. + */ +int __rte_experimental +rte_cryptodev_asym_session_free(struct rte_cryptodev_asym_session *sess); + +/** * Fill out private data for the device id, based on its device type. * * @param dev_id ID of device that we want the session to be used on @@ -945,7 +1030,8 @@ rte_cryptodev_sym_session_free(struct rte_cryptodev_sym_session *sess); * @return * - On success, zero. * - -EINVAL if input parameters are invalid. - * - -ENOTSUP if crypto device does not support the crypto transform. + * - -ENOTSUP if crypto device does not support the crypto transform or + * does not support symmetric operations. * - -ENOMEM if the private session could not be allocated. */ int @@ -955,6 +1041,27 @@ rte_cryptodev_sym_session_init(uint8_t dev_id, struct rte_mempool *mempool); /** + * Initialize asymmetric session on a device with specific asymmetric xform + * + * @param dev_id ID of device that we want the session to be used on + * @param sess Session to be set up on a device + * @param xforms Asymmetric crypto transform operations to apply on flow + * processed with this session + * @param mempool Mempool to be used for internal allocation. 
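Teardown mirrors setup in reverse, because rte_cryptodev_asym_session_free() refuses with -EBUSY while any driver slot still holds private data, so the clear must come first; a fragment assuming dev_id and sess from the setup above:

    int ret;

    /* Drop the device's private session data before returning the
     * header to its mempool; free() checks every driver slot. */
    rte_cryptodev_asym_session_clear(dev_id, sess);
    ret = rte_cryptodev_asym_session_free(sess);    /* -EBUSY if a slot is set */
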
+ * + * @return + * - On success, zero. + * - -EINVAL if input parameters are invalid. + * - -ENOTSUP if crypto device does not support the crypto transform. + * - -ENOMEM if the private session could not be allocated. + */ +int __rte_experimental +rte_cryptodev_asym_session_init(uint8_t dev_id, + struct rte_cryptodev_asym_session *sess, + struct rte_crypto_asym_xform *xforms, + struct rte_mempool *mempool); + +/** * Frees private data for the device id, based on its device type, * returning it to its mempool. It is the application's responsibility * to ensure that private session data is not cleared while there are * @return * - 0 if successful. * - -EINVAL if device is invalid or session is NULL. + * - -ENOTSUP if crypto device does not support symmetric operations. */ int rte_cryptodev_sym_session_clear(uint8_t dev_id, struct rte_cryptodev_sym_session *sess); /** - * @deprecated - * Get the size of the header session, for all registered drivers. + * Frees resources held by asymmetric session during rte_cryptodev_session_init * + * @param dev_id ID of device that uses the asymmetric session. + * @param sess Asymmetric session setup on device using + * rte_cryptodev_session_init * @return - * Size of the header session. + * - 0 if successful. + * - -EINVAL if device is invalid or session is NULL. */ -__rte_deprecated -unsigned int -rte_cryptodev_get_header_session_size(void); +int __rte_experimental +rte_cryptodev_asym_session_clear(uint8_t dev_id, + struct rte_cryptodev_asym_session *sess); /** - * @deprecated - * Get the size of the private session data for a device. - * - * @param dev_id The device identifier. + * Get the size of the header session, for all registered drivers. * * @return - * - Size of the private data, if successful - * - 0 if device is invalid or does not have private session + * Size of the symmetric header session. */ -__rte_deprecated unsigned int -rte_cryptodev_get_private_session_size(uint8_t dev_id); +rte_cryptodev_sym_get_header_session_size(void); /** - * Get the size of the header session, for all registered drivers. + * Get the size of the asymmetric session header, for all registered drivers. * * @return - * Size of the symmetric eader session. + * Size of the asymmetric header session. */ -unsigned int -rte_cryptodev_sym_get_header_session_size(void); +unsigned int __rte_experimental +rte_cryptodev_asym_get_header_session_size(void); /** * Get the size of the private symmetric session data @@ -1020,40 +1126,17 @@ unsigned int rte_cryptodev_sym_get_private_session_size(uint8_t dev_id); /** - * @deprecated - * Attach queue pair with sym session. - * - * @param dev_id Device to which the session will be attached. - * @param qp_id Queue pair to which the session will be attached. - * @param session Session pointer previously allocated by - * *rte_cryptodev_sym_session_create*. - * - * @return - * - On success, zero. - * - On failure, a negative value. - */ -__rte_deprecated -int -rte_cryptodev_queue_pair_attach_sym_session(uint8_t dev_id, uint16_t qp_id, - struct rte_cryptodev_sym_session *session); - -/** - * @deprecated - * Detach queue pair with sym session. + * Get the size of the private data for asymmetric session + * on device * - * @param dev_id Device to which the session is attached. - * @param qp_id Queue pair to which the session is attached. - * @param session Session pointer previously allocated by - * *rte_cryptodev_sym_session_create*. + * @param dev_id The device identifier.
* * @return - * - On success, zero. - * - On failure, a negative value. + * - Size of the asymmetric private data, if successful + * - 0 if device is invalid or does not have private session */ -__rte_deprecated -int -rte_cryptodev_queue_pair_detach_sym_session(uint8_t dev_id, uint16_t qp_id, - struct rte_cryptodev_sym_session *session); +unsigned int __rte_experimental +rte_cryptodev_asym_get_private_session_size(uint8_t dev_id); /** * Provide driver identifier. @@ -1076,35 +1159,35 @@ int rte_cryptodev_driver_id_get(const char *name); const char *rte_cryptodev_driver_name_get(uint8_t driver_id); /** - * Set private data for a session. + * Store user data in a session. * * @param sess Session pointer allocated by * *rte_cryptodev_sym_session_create*. - * @param data Pointer to the private data. - * @param size Size of the private data. + * @param data Pointer to the user data. + * @param size Size of the user data. * * @return * - On success, zero. * - On failure, a negative value. */ int __rte_experimental -rte_cryptodev_sym_session_set_private_data( +rte_cryptodev_sym_session_set_user_data( struct rte_cryptodev_sym_session *sess, void *data, uint16_t size); /** - * Get private data of a session. + * Get user data stored in a session. * * @param sess Session pointer allocated by * *rte_cryptodev_sym_session_create*. * * @return - * - On success return pointer to private data. + * - On success return pointer to user data. * - On failure returns NULL. */ void * __rte_experimental -rte_cryptodev_sym_session_get_private_data( +rte_cryptodev_sym_session_get_user_data( struct rte_cryptodev_sym_session *sess); #ifdef __cplusplus diff --git a/lib/librte_cryptodev/rte_cryptodev_pmd.c b/lib/librte_cryptodev/rte_cryptodev_pmd.c index f2aac24b..2088ac3f 100644 --- a/lib/librte_cryptodev/rte_cryptodev_pmd.c +++ b/lib/librte_cryptodev/rte_cryptodev_pmd.c @@ -66,13 +66,6 @@ rte_cryptodev_pmd_parse_input_args( goto free_kvlist; ret = rte_kvargs_process(kvlist, - RTE_CRYPTODEV_PMD_MAX_NB_SESS_ARG, - &rte_cryptodev_pmd_parse_uint_arg, - ¶ms->max_nb_sessions); - if (ret < 0) - goto free_kvlist; - - ret = rte_kvargs_process(kvlist, RTE_CRYPTODEV_PMD_SOCKET_ID_ARG, &rte_cryptodev_pmd_parse_uint_arg, ¶ms->socket_id); @@ -109,10 +102,9 @@ rte_cryptodev_pmd_create(const char *name, device->driver->name, name); CDEV_LOG_INFO("[%s] - Initialisation parameters - name: %s," - "socket id: %d, max queue pairs: %u, max sessions: %u", + "socket id: %d, max queue pairs: %u", device->driver->name, name, - params->socket_id, params->max_nb_queue_pairs, - params->max_nb_sessions); + params->socket_id, params->max_nb_queue_pairs); /* allocate device structure */ cryptodev = rte_cryptodev_pmd_allocate(name, params->socket_id); diff --git a/lib/librte_cryptodev/rte_cryptodev_pmd.h b/lib/librte_cryptodev/rte_cryptodev_pmd.h index 69d77693..6ff49d64 100644 --- a/lib/librte_cryptodev/rte_cryptodev_pmd.h +++ b/lib/librte_cryptodev/rte_cryptodev_pmd.h @@ -1,32 +1,5 @@ -/*- - * - * Copyright(c) 2015-2016 Intel Corporation. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
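Only the names changed in the set/get pair, not the mechanics: the blob is copied into the session header behind the driver-pointer array and the presence flag, so the session mempool elements must have been sized with room for it. A sketch with a hypothetical per-flow tag:

    struct flow_ctx {
        uint32_t flow_id;       /* hypothetical application tag */
    } ctx = { .flow_id = 42 };
    struct flow_ctx *p;

    /* Assumes the session mempool left space for sizeof(ctx) bytes. */
    if (rte_cryptodev_sym_session_set_user_data(sess, &ctx,
            sizeof(ctx)) == 0)
        p = rte_cryptodev_sym_session_get_user_data(sess);
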
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015-2016 Intel Corporation. */ #ifndef _RTE_CRYPTODEV_PMD_H_ @@ -59,18 +32,15 @@ extern "C" { #define RTE_CRYPTODEV_PMD_DEFAULT_MAX_NB_QUEUE_PAIRS 8 -#define RTE_CRYPTODEV_PMD_DEFAULT_MAX_NB_SESSIONS 2048 #define RTE_CRYPTODEV_PMD_NAME_ARG ("name") #define RTE_CRYPTODEV_PMD_MAX_NB_QP_ARG ("max_nb_queue_pairs") -#define RTE_CRYPTODEV_PMD_MAX_NB_SESS_ARG ("max_nb_sessions") #define RTE_CRYPTODEV_PMD_SOCKET_ID_ARG ("socket_id") static const char * const cryptodev_pmd_valid_params[] = { RTE_CRYPTODEV_PMD_NAME_ARG, RTE_CRYPTODEV_PMD_MAX_NB_QP_ARG, - RTE_CRYPTODEV_PMD_MAX_NB_SESS_ARG, RTE_CRYPTODEV_PMD_SOCKET_ID_ARG }; @@ -83,7 +53,6 @@ struct rte_cryptodev_pmd_init_params { size_t private_data_size; int socket_id; unsigned int max_nb_queue_pairs; - unsigned int max_nb_sessions; }; /** Global structure used for maintaining state of allocated crypto devices */ @@ -216,28 +185,6 @@ typedef void (*cryptodev_info_get_t)(struct rte_cryptodev *dev, struct rte_cryptodev_info *dev_info); /** - * Start queue pair of a device. - * - * @param dev Crypto device pointer - * @param qp_id Queue Pair Index - * - * @return Returns 0 on success. - */ -typedef int (*cryptodev_queue_pair_start_t)(struct rte_cryptodev *dev, - uint16_t qp_id); - -/** - * Stop queue pair of a device. - * - * @param dev Crypto device pointer - * @param qp_id Queue Pair Index - * - * @return Returns 0 on success. - */ -typedef int (*cryptodev_queue_pair_stop_t)(struct rte_cryptodev *dev, - uint16_t qp_id); - -/** * Setup a queue pair for a device. * * @param dev Crypto device pointer @@ -302,6 +249,17 @@ typedef int (*cryptodev_sym_create_session_pool_t)( */ typedef unsigned (*cryptodev_sym_get_session_private_size_t)( struct rte_cryptodev *dev); +/** + * Get the size of a asymmetric cryptodev session + * + * @param dev Crypto device pointer + * + * @return + * - On success returns the size of the session structure for device + * - On failure returns 0 + */ +typedef unsigned int (*cryptodev_asym_get_session_private_size_t)( + struct rte_cryptodev *dev); /** * Configure a Crypto session on a device. 
@@ -321,7 +279,24 @@ typedef int (*cryptodev_sym_configure_session_t)(struct rte_cryptodev *dev, struct rte_crypto_sym_xform *xform, struct rte_cryptodev_sym_session *session, struct rte_mempool *mp); - +/** + * Configure a Crypto asymmetric session on a device. + * + * @param dev Crypto device pointer + * @param xform Single or chain of crypto xforms + * @param priv_sess Pointer to cryptodev's private session structure + * @param mp Mempool where the private session is allocated + * + * @return + * - Returns 0 if private session structure have been created successfully. + * - Returns -EINVAL if input parameters are invalid. + * - Returns -ENOTSUP if crypto device does not support the crypto transform. + * - Returns -ENOMEM if the private session could not be allocated. + */ +typedef int (*cryptodev_asym_configure_session_t)(struct rte_cryptodev *dev, + struct rte_crypto_asym_xform *xform, + struct rte_cryptodev_asym_session *session, + struct rte_mempool *mp); /** * Free driver private session data. * @@ -330,32 +305,14 @@ typedef int (*cryptodev_sym_configure_session_t)(struct rte_cryptodev *dev, */ typedef void (*cryptodev_sym_free_session_t)(struct rte_cryptodev *dev, struct rte_cryptodev_sym_session *sess); - -/** - * Optional API for drivers to attach sessions with queue pair. - * @param dev Crypto device pointer - * @param qp_id queue pair id for attaching session - * @param priv_sess Pointer to cryptodev's private session structure - * @return - * - Return 0 on success - */ -typedef int (*cryptodev_sym_queue_pair_attach_session_t)( - struct rte_cryptodev *dev, - uint16_t qp_id, - void *session_private); - /** - * Optional API for drivers to detach sessions from queue pair. + * Free asymmetric session private data. + * * @param dev Crypto device pointer - * @param qp_id queue pair id for detaching session - * @param priv_sess Pointer to cryptodev's private session structure - * @return - * - Return 0 on success + * @param sess Cryptodev session structure */ -typedef int (*cryptodev_sym_queue_pair_detach_session_t)( - struct rte_cryptodev *dev, - uint16_t qp_id, - void *session_private); +typedef void (*cryptodev_asym_free_session_t)(struct rte_cryptodev *dev, + struct rte_cryptodev_asym_session *sess); /** Crypto device operations function pointer table */ struct rte_cryptodev_ops { @@ -375,23 +332,21 @@ struct rte_cryptodev_ops { /**< Set up a device queue pair. */ cryptodev_queue_pair_release_t queue_pair_release; /**< Release a queue pair. */ - cryptodev_queue_pair_start_t queue_pair_start; - /**< Start a queue pair. */ - cryptodev_queue_pair_stop_t queue_pair_stop; - /**< Stop a queue pair. */ cryptodev_queue_pair_count_t queue_pair_count; /**< Get count of the queue pairs. */ - cryptodev_sym_get_session_private_size_t session_get_size; + cryptodev_sym_get_session_private_size_t sym_session_get_size; /**< Return private session. */ - cryptodev_sym_configure_session_t session_configure; + cryptodev_asym_get_session_private_size_t asym_session_get_size; + /**< Return asym session private size. */ + cryptodev_sym_configure_session_t sym_session_configure; /**< Configure a Crypto session. */ - cryptodev_sym_free_session_t session_clear; + cryptodev_asym_configure_session_t asym_session_configure; + /**< Configure asymmetric Crypto session. */ + cryptodev_sym_free_session_t sym_session_clear; + /**< Clear a Crypto sessions private data. */ + cryptodev_asym_free_session_t asym_session_clear; /**< Clear a Crypto sessions private data. 
*/ - cryptodev_sym_queue_pair_attach_session_t qp_attach_session; - /**< Attach session to queue pair. */ - cryptodev_sym_queue_pair_detach_session_t qp_detach_session; - /**< Detach session from queue pair. */ }; @@ -516,20 +471,32 @@ uint8_t rte_cryptodev_allocate_driver(struct cryptodev_driver *crypto_drv, #define RTE_PMD_REGISTER_CRYPTO_DRIVER(crypto_drv, drv, driver_id)\ -RTE_INIT(init_ ##driver_id);\ -static void init_ ##driver_id(void)\ +RTE_INIT(init_ ##driver_id)\ {\ driver_id = rte_cryptodev_allocate_driver(&crypto_drv, &(drv));\ } static inline void * -get_session_private_data(const struct rte_cryptodev_sym_session *sess, +get_sym_session_private_data(const struct rte_cryptodev_sym_session *sess, + uint8_t driver_id) { + return sess->sess_private_data[driver_id]; +} + +static inline void +set_sym_session_private_data(struct rte_cryptodev_sym_session *sess, + uint8_t driver_id, void *private_data) +{ + sess->sess_private_data[driver_id] = private_data; +} + +static inline void * +get_asym_session_private_data(const struct rte_cryptodev_asym_session *sess, uint8_t driver_id) { return sess->sess_private_data[driver_id]; } static inline void -set_session_private_data(struct rte_cryptodev_sym_session *sess, +set_asym_session_private_data(struct rte_cryptodev_asym_session *sess, uint8_t driver_id, void *private_data) { sess->sess_private_data[driver_id] = private_data; diff --git a/lib/librte_cryptodev/rte_cryptodev_version.map b/lib/librte_cryptodev/rte_cryptodev_version.map index be8f4c1a..7ca00735 100644 --- a/lib/librte_cryptodev/rte_cryptodev_version.map +++ b/lib/librte_cryptodev/rte_cryptodev_version.map @@ -22,8 +22,6 @@ DPDK_16.04 { rte_cryptodev_stop; rte_cryptodev_queue_pair_count; rte_cryptodev_queue_pair_setup; - rte_cryptodev_queue_pair_start; - rte_cryptodev_queue_pair_stop; rte_crypto_op_pool_create; local: *; @@ -52,8 +50,6 @@ DPDK_17.05 { rte_cryptodev_get_auth_algo_enum; rte_cryptodev_get_cipher_algo_enum; - rte_cryptodev_queue_pair_attach_sym_session; - rte_cryptodev_queue_pair_detach_sym_session; } DPDK_17.02; @@ -65,8 +61,6 @@ DPDK_17.08 { rte_cryptodev_driver_id_get; rte_cryptodev_driver_name_get; rte_cryptodev_get_aead_algo_enum; - rte_cryptodev_get_header_session_size; - rte_cryptodev_get_private_session_size; rte_cryptodev_sym_capability_check_aead; rte_cryptodev_sym_session_init; rte_cryptodev_sym_session_clear; @@ -97,6 +91,18 @@ DPDK_18.05 { EXPERIMENTAL { global: - rte_cryptodev_sym_session_get_private_data; - rte_cryptodev_sym_session_set_private_data; + rte_cryptodev_asym_capability_get; + rte_cryptodev_asym_get_header_session_size; + rte_cryptodev_asym_get_private_session_size; + rte_cryptodev_asym_get_xform_enum; + rte_cryptodev_asym_session_clear; + rte_cryptodev_asym_session_create; + rte_cryptodev_asym_session_free; + rte_cryptodev_asym_session_init; + rte_cryptodev_asym_xform_capability_check_modlen; + rte_cryptodev_asym_xform_capability_check_optype; + rte_cryptodev_sym_session_get_user_data; + rte_cryptodev_sym_session_set_user_data; + rte_crypto_asym_op_strings; + rte_crypto_asym_xform_strings; }; diff --git a/lib/librte_eal/bsdapp/eal/Makefile b/lib/librte_eal/bsdapp/eal/Makefile index 3fd33f1e..d27da3d1 100644 --- a/lib/librte_eal/bsdapp/eal/Makefile +++ b/lib/librte_eal/bsdapp/eal/Makefile @@ -18,10 +18,11 @@ CFLAGS += $(WERROR_FLAGS) -O3 LDLIBS += -lexecinfo LDLIBS += -lpthread LDLIBS += -lgcc_s +LDLIBS += -lrte_kvargs EXPORT_MAP := ../../rte_eal_version.map -LIBABIVER := 7 +LIBABIVER := 8 # specific to bsdapp exec-env 
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) := eal.c @@ -52,12 +53,14 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_hypervisor.c SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_string_fns.c SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_hexdump.c SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_devargs.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_class.c SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_bus.c SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_dev.c SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_options.c SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_thread.c SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_proc.c SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_fbarray.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_uuid.c SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_malloc.c SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_elem.c SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_heap.c diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c index dc279542..d7ae9d68 100644 --- a/lib/librte_eal/bsdapp/eal/eal.c +++ b/lib/librte_eal/bsdapp/eal/eal.c @@ -147,19 +147,9 @@ eal_get_runtime_dir(void) } /* Return user provided mbuf pool ops name */ -const char * __rte_experimental -rte_eal_mbuf_user_pool_ops(void) -{ - return internal_config.user_mbuf_pool_ops_name; -} - -/* Return mbuf pool ops name */ const char * -rte_eal_mbuf_default_mempool_ops(void) +rte_eal_mbuf_user_pool_ops(void) { - if (internal_config.user_mbuf_pool_ops_name == NULL) - return RTE_MBUF_DEFAULT_MEMPOOL_OPS; - return internal_config.user_mbuf_pool_ops_name; } @@ -286,12 +276,17 @@ eal_proc_type_detect(void) enum rte_proc_type_t ptype = RTE_PROC_PRIMARY; const char *pathname = eal_runtime_config_path(); - /* if we can open the file but not get a write-lock we are a secondary - * process. NOTE: if we get a file handle back, we keep that open - * and don't close it to prevent a race condition between multiple opens */ - if (((mem_cfg_fd = open(pathname, O_RDWR)) >= 0) && - (fcntl(mem_cfg_fd, F_SETLK, &wr_lock) < 0)) - ptype = RTE_PROC_SECONDARY; + /* if there no shared config, there can be no secondary processes */ + if (!internal_config.no_shconf) { + /* if we can open the file but not get a write-lock we are a + * secondary process. NOTE: if we get a file handle back, we + * keep that open and don't close it to prevent a race condition + * between multiple opens. + */ + if (((mem_cfg_fd = open(pathname, O_RDWR)) >= 0) && + (fcntl(mem_cfg_fd, F_SETLK, &wr_lock) < 0)) + ptype = RTE_PROC_SECONDARY; + } RTE_LOG(INFO, EAL, "Auto-detected process type: %s\n", ptype == RTE_PROC_PRIMARY ? 
"PRIMARY" : "SECONDARY"); @@ -468,6 +463,14 @@ eal_parse_args(int argc, char **argv) } } + /* create runtime data directory */ + if (internal_config.no_shconf == 0 && + eal_create_runtime_dir() < 0) { + RTE_LOG(ERR, EAL, "Cannot create runtime directory\n"); + ret = -1; + goto out; + } + if (eal_adjust_config(&internal_config) != 0) { ret = -1; goto out; @@ -600,13 +603,6 @@ rte_eal_init(int argc, char **argv) return -1; } - /* create runtime data directory */ - if (eal_create_runtime_dir() < 0) { - rte_eal_init_alert("Cannot create runtime directory\n"); - rte_errno = EACCES; - return -1; - } - /* FreeBSD always uses legacy memory model */ internal_config.legacy_mem = true; @@ -625,6 +621,11 @@ rte_eal_init(int argc, char **argv) rte_config_init(); + if (rte_eal_intr_init() < 0) { + rte_eal_init_alert("Cannot init interrupt-handling thread\n"); + return -1; + } + /* Put mp channel init before bus scan so that we can init the vdev * bus through mp channel in the secondary process before the bus scan. */ @@ -713,11 +714,6 @@ rte_eal_init(int argc, char **argv) return -1; } - if (rte_eal_intr_init() < 0) { - rte_eal_init_alert("Cannot init interrupt-handling thread\n"); - return -1; - } - if (rte_eal_timer_init() < 0) { rte_eal_init_alert("Cannot init HPET or TSC timers\n"); rte_errno = ENOTSUP; @@ -866,21 +862,21 @@ int rte_vfio_clear_group(__rte_unused int vfio_group_fd) return 0; } -int __rte_experimental +int rte_vfio_dma_map(uint64_t __rte_unused vaddr, __rte_unused uint64_t iova, __rte_unused uint64_t len) { return -1; } -int __rte_experimental +int rte_vfio_dma_unmap(uint64_t __rte_unused vaddr, uint64_t __rte_unused iova, __rte_unused uint64_t len) { return -1; } -int __rte_experimental +int rte_vfio_get_group_num(__rte_unused const char *sysfs_base, __rte_unused const char *dev_addr, __rte_unused int *iommu_group_num) @@ -888,45 +884,45 @@ rte_vfio_get_group_num(__rte_unused const char *sysfs_base, return -1; } -int __rte_experimental +int rte_vfio_get_container_fd(void) { return -1; } -int __rte_experimental +int rte_vfio_get_group_fd(__rte_unused int iommu_group_num) { return -1; } -int __rte_experimental +int rte_vfio_container_create(void) { return -1; } -int __rte_experimental +int rte_vfio_container_destroy(__rte_unused int container_fd) { return -1; } -int __rte_experimental +int rte_vfio_container_group_bind(__rte_unused int container_fd, __rte_unused int iommu_group_num) { return -1; } -int __rte_experimental +int rte_vfio_container_group_unbind(__rte_unused int container_fd, __rte_unused int iommu_group_num) { return -1; } -int __rte_experimental +int rte_vfio_container_dma_map(__rte_unused int container_fd, __rte_unused uint64_t vaddr, __rte_unused uint64_t iova, @@ -935,7 +931,7 @@ rte_vfio_container_dma_map(__rte_unused int container_fd, return -1; } -int __rte_experimental +int rte_vfio_container_dma_unmap(__rte_unused int container_fd, __rte_unused uint64_t vaddr, __rte_unused uint64_t iova, diff --git a/lib/librte_eal/bsdapp/eal/eal_alarm.c b/lib/librte_eal/bsdapp/eal/eal_alarm.c index eb3913c9..51ea4b8c 100644 --- a/lib/librte_eal/bsdapp/eal/eal_alarm.c +++ b/lib/librte_eal/bsdapp/eal/eal_alarm.c @@ -1,31 +1,314 @@ /* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2014 Intel Corporation + * Copyright(c) 2010-2018 Intel Corporation */ + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <stdio.h> #include <stdlib.h> +#include <string.h> +#include <time.h> #include <errno.h> #include <rte_alarm.h> +#include <rte_cycles.h> 
#include <rte_common.h> +#include <rte_errno.h> +#include <rte_interrupts.h> +#include <rte_spinlock.h> + #include "eal_private.h" +#include "eal_alarm_private.h" + +#define NS_PER_US 1000 + +#ifdef CLOCK_MONOTONIC_RAW /* Defined in glibc bits/time.h */ +#define CLOCK_TYPE_ID CLOCK_MONOTONIC_RAW +#else +#define CLOCK_TYPE_ID CLOCK_MONOTONIC +#endif + +struct alarm_entry { + LIST_ENTRY(alarm_entry) next; + struct rte_intr_handle handle; + struct timespec time; + rte_eal_alarm_callback cb_fn; + void *cb_arg; + volatile uint8_t executing; + volatile pthread_t executing_id; +}; + +static LIST_HEAD(alarm_list, alarm_entry) alarm_list = LIST_HEAD_INITIALIZER(); +static rte_spinlock_t alarm_list_lk = RTE_SPINLOCK_INITIALIZER; + +static struct rte_intr_handle intr_handle = {.fd = -1 }; +static void eal_alarm_callback(void *arg); int rte_eal_alarm_init(void) { + intr_handle.type = RTE_INTR_HANDLE_ALARM; + + /* on FreeBSD, timers don't use fd's, and their identifiers are stored + * in separate namespace from fd's, so using any value is OK. however, + * EAL interrupts handler expects fd's to be unique, so use an actual fd + * to guarantee unique timer identifier. + */ + intr_handle.fd = open("/dev/zero", O_RDONLY); + + return 0; +} + +static inline int +timespec_cmp(const struct timespec *now, const struct timespec *at) +{ + if (now->tv_sec < at->tv_sec) + return -1; + if (now->tv_sec > at->tv_sec) + return 1; + if (now->tv_nsec < at->tv_nsec) + return -1; + if (now->tv_nsec > at->tv_nsec) + return 1; return 0; } +static inline uint64_t +diff_ns(struct timespec *now, struct timespec *at) +{ + uint64_t now_ns, at_ns; + + if (timespec_cmp(now, at) >= 0) + return 0; + + now_ns = now->tv_sec * NS_PER_S + now->tv_nsec; + at_ns = at->tv_sec * NS_PER_S + at->tv_nsec; + + return at_ns - now_ns; +} int -rte_eal_alarm_set(uint64_t us __rte_unused, - rte_eal_alarm_callback cb_fn __rte_unused, - void *cb_arg __rte_unused) +eal_alarm_get_timeout_ns(uint64_t *val) { - return -ENOTSUP; + struct alarm_entry *ap; + struct timespec now; + + if (clock_gettime(CLOCK_TYPE_ID, &now) < 0) + return -1; + + if (LIST_EMPTY(&alarm_list)) + return -1; + + ap = LIST_FIRST(&alarm_list); + + *val = diff_ns(&now, &ap->time); + + return 0; +} + +static int +unregister_current_callback(void) +{ + struct alarm_entry *ap; + int ret = 0; + + if (!LIST_EMPTY(&alarm_list)) { + ap = LIST_FIRST(&alarm_list); + + do { + ret = rte_intr_callback_unregister(&intr_handle, + eal_alarm_callback, &ap->time); + } while (ret == -EAGAIN); + } + + return ret; } +static int +register_first_callback(void) +{ + struct alarm_entry *ap; + int ret = 0; + + if (!LIST_EMPTY(&alarm_list)) { + ap = LIST_FIRST(&alarm_list); + + /* register a new callback */ + ret = rte_intr_callback_register(&intr_handle, + eal_alarm_callback, &ap->time); + } + return ret; +} + +static void +eal_alarm_callback(void *arg __rte_unused) +{ + struct timespec now; + struct alarm_entry *ap; + + rte_spinlock_lock(&alarm_list_lk); + ap = LIST_FIRST(&alarm_list); + + if (clock_gettime(CLOCK_TYPE_ID, &now) < 0) + return; + + while (ap != NULL && timespec_cmp(&now, &ap->time) >= 0) { + ap->executing = 1; + ap->executing_id = pthread_self(); + rte_spinlock_unlock(&alarm_list_lk); + + ap->cb_fn(ap->cb_arg); + + rte_spinlock_lock(&alarm_list_lk); + + LIST_REMOVE(ap, next); + free(ap); + + ap = LIST_FIRST(&alarm_list); + } + + /* timer has been deleted from the kqueue, so recreate it if needed */ + register_first_callback(); + + rte_spinlock_unlock(&alarm_list_lk); +} + + int 
-rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn __rte_unused, - void *cb_arg __rte_unused) +rte_eal_alarm_set(uint64_t us, rte_eal_alarm_callback cb_fn, void *cb_arg) { - return -ENOTSUP; + struct alarm_entry *ap, *new_alarm; + struct timespec now; + uint64_t ns; + int ret = 0; + + /* check parameters, also ensure us won't cause a uint64_t overflow */ + if (us < 1 || us > (UINT64_MAX - US_PER_S) || cb_fn == NULL) + return -EINVAL; + + new_alarm = calloc(1, sizeof(*new_alarm)); + if (new_alarm == NULL) + return -ENOMEM; + + /* use current time to calculate absolute time of alarm */ + clock_gettime(CLOCK_TYPE_ID, &now); + + ns = us * NS_PER_US; + + new_alarm->cb_fn = cb_fn; + new_alarm->cb_arg = cb_arg; + new_alarm->time.tv_nsec = (now.tv_nsec + ns) % NS_PER_S; + new_alarm->time.tv_sec = now.tv_sec + ((now.tv_nsec + ns) / NS_PER_S); + + rte_spinlock_lock(&alarm_list_lk); + + if (LIST_EMPTY(&alarm_list)) + LIST_INSERT_HEAD(&alarm_list, new_alarm, next); + else { + LIST_FOREACH(ap, &alarm_list, next) { + if (timespec_cmp(&new_alarm->time, &ap->time) < 0) { + LIST_INSERT_BEFORE(ap, new_alarm, next); + break; + } + if (LIST_NEXT(ap, next) == NULL) { + LIST_INSERT_AFTER(ap, new_alarm, next); + break; + } + } + } + + /* re-register first callback just in case */ + register_first_callback(); + + rte_spinlock_unlock(&alarm_list_lk); + + return ret; +} + +int +rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn, void *cb_arg) +{ + struct alarm_entry *ap, *ap_prev; + int count = 0; + int err = 0; + int executing; + + if (!cb_fn) { + rte_errno = EINVAL; + return -1; + } + + do { + executing = 0; + rte_spinlock_lock(&alarm_list_lk); + /* remove any matches at the start of the list */ + while (1) { + ap = LIST_FIRST(&alarm_list); + if (ap == NULL) + break; + if (cb_fn != ap->cb_fn) + break; + if (cb_arg != ap->cb_arg && cb_arg != (void *) -1) + break; + if (ap->executing == 0) { + LIST_REMOVE(ap, next); + free(ap); + count++; + } else { + /* If calling from other context, mark that + * alarm is executing so loop can spin till it + * finish. Otherwise we are trying to cancel + * ourselves - mark it by EINPROGRESS. 
+ */ + if (pthread_equal(ap->executing_id, + pthread_self()) == 0) + executing++; + else + err = EINPROGRESS; + + break; + } + } + ap_prev = ap; + + /* now go through list, removing entries not at start */ + LIST_FOREACH(ap, &alarm_list, next) { + /* this won't be true first time through */ + if (cb_fn == ap->cb_fn && + (cb_arg == (void *)-1 || + cb_arg == ap->cb_arg)) { + if (ap->executing == 0) { + LIST_REMOVE(ap, next); + free(ap); + count++; + ap = ap_prev; + } else if (pthread_equal(ap->executing_id, + pthread_self()) == 0) { + executing++; + } else { + err = EINPROGRESS; + } + } + ap_prev = ap; + } + rte_spinlock_unlock(&alarm_list_lk); + } while (executing != 0); + + if (count == 0 && err == 0) + rte_errno = ENOENT; + else if (err) + rte_errno = err; + + rte_spinlock_lock(&alarm_list_lk); + + /* unregister if no alarms left, otherwise re-register first */ + if (LIST_EMPTY(&alarm_list)) + unregister_current_callback(); + else + register_first_callback(); + + rte_spinlock_unlock(&alarm_list_lk); + + return count; } diff --git a/lib/librte_eal/bsdapp/eal/eal_alarm_private.h b/lib/librte_eal/bsdapp/eal/eal_alarm_private.h new file mode 100644 index 00000000..65c71151 --- /dev/null +++ b/lib/librte_eal/bsdapp/eal/eal_alarm_private.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#ifndef EAL_ALARM_PRIVATE_H +#define EAL_ALARM_PRIVATE_H + +#include <inttypes.h> + +/* + * FreeBSD needs a back-channel communication mechanism between interrupt and + * alarm thread, because on FreeBSD, timer period is set up inside the interrupt + * API and not inside alarm API like on Linux. + */ + +int +eal_alarm_get_timeout_ns(uint64_t *val); + +#endif // EAL_ALARM_PRIVATE_H diff --git a/lib/librte_eal/bsdapp/eal/eal_hugepage_info.c b/lib/librte_eal/bsdapp/eal/eal_hugepage_info.c index 836feb67..1e8f5df2 100644 --- a/lib/librte_eal/bsdapp/eal/eal_hugepage_info.c +++ b/lib/librte_eal/bsdapp/eal/eal_hugepage_info.c @@ -101,6 +101,10 @@ eal_hugepage_info_init(void) hpi->num_pages[0] = num_buffers; hpi->lock_descriptor = fd; + /* for no shared files mode, do not create shared memory config */ + if (internal_config.no_shconf) + return 0; + tmp_hpi = create_shared_memory(eal_hugepage_info_path(), sizeof(internal_config.hugepage_info)); if (tmp_hpi == NULL ) { diff --git a/lib/librte_eal/bsdapp/eal/eal_interrupts.c b/lib/librte_eal/bsdapp/eal/eal_interrupts.c index 290d53ab..2feee2d5 100644 --- a/lib/librte_eal/bsdapp/eal/eal_interrupts.c +++ b/lib/librte_eal/bsdapp/eal/eal_interrupts.c @@ -1,51 +1,479 @@ /* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2014 Intel Corporation + * Copyright(c) 2010-2018 Intel Corporation */ +#include <string.h> +#include <sys/types.h> +#include <sys/event.h> +#include <sys/queue.h> +#include <unistd.h> + +#include <rte_errno.h> +#include <rte_lcore.h> +#include <rte_spinlock.h> #include <rte_common.h> #include <rte_interrupts.h> + #include "eal_private.h" +#include "eal_alarm_private.h" + +#define MAX_INTR_EVENTS 16 + +/** + * union buffer for reading on different devices + */ +union rte_intr_read_buffer { + char charbuf[16]; /* for others */ +}; + +TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback); +TAILQ_HEAD(rte_intr_source_list, rte_intr_source); + +struct rte_intr_callback { + TAILQ_ENTRY(rte_intr_callback) next; + rte_intr_callback_fn cb_fn; /**< callback address */ + void *cb_arg; /**< parameter for callback */ +}; + +struct rte_intr_source { + TAILQ_ENTRY(rte_intr_source) next; + struct rte_intr_handle 
intr_handle; /**< interrupt handle */ + struct rte_intr_cb_list callbacks; /**< user callbacks */ + uint32_t active; +}; + +/* global spinlock for interrupt data operation */ +static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER; + +/* interrupt sources list */ +static struct rte_intr_source_list intr_sources; + +/* interrupt handling thread */ +static pthread_t intr_thread; + +static volatile int kq = -1; + +static int +intr_source_to_kevent(const struct rte_intr_handle *ih, struct kevent *ke) +{ + /* alarm callbacks are special case */ + if (ih->type == RTE_INTR_HANDLE_ALARM) { + uint64_t timeout_ns; + + /* get soonest alarm timeout */ + if (eal_alarm_get_timeout_ns(&timeout_ns) < 0) + return -1; + + ke->filter = EVFILT_TIMER; + /* timers are one shot */ + ke->flags |= EV_ONESHOT; + ke->fflags = NOTE_NSECONDS; + ke->data = timeout_ns; + } else { + ke->filter = EVFILT_READ; + } + ke->ident = ih->fd; + + return 0; +} int rte_intr_callback_register(const struct rte_intr_handle *intr_handle, - rte_intr_callback_fn cb, - void *cb_arg) + rte_intr_callback_fn cb, void *cb_arg) { - RTE_SET_USED(intr_handle); - RTE_SET_USED(cb); - RTE_SET_USED(cb_arg); + struct rte_intr_callback *callback = NULL; + struct rte_intr_source *src = NULL; + int ret, add_event; - return -ENOTSUP; + /* first do parameter checking */ + if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) { + RTE_LOG(ERR, EAL, + "Registering with invalid input parameter\n"); + return -EINVAL; + } + if (kq < 0) { + RTE_LOG(ERR, EAL, "Kqueue is not active: %d\n", kq); + return -ENODEV; + } + + /* allocate a new interrupt callback entity */ + callback = calloc(1, sizeof(*callback)); + if (callback == NULL) { + RTE_LOG(ERR, EAL, "Can not allocate memory\n"); + return -ENOMEM; + } + callback->cb_fn = cb; + callback->cb_arg = cb_arg; + + rte_spinlock_lock(&intr_lock); + + /* check if there is at least one callback registered for the fd */ + TAILQ_FOREACH(src, &intr_sources, next) { + if (src->intr_handle.fd == intr_handle->fd) { + /* we had no interrupts for this */ + if (TAILQ_EMPTY(&src->callbacks)) + add_event = 1; + + TAILQ_INSERT_TAIL(&(src->callbacks), callback, next); + ret = 0; + break; + } + } + + /* no existing callbacks for this - add new source */ + if (src == NULL) { + src = calloc(1, sizeof(*src)); + if (src == NULL) { + RTE_LOG(ERR, EAL, "Can not allocate memory\n"); + ret = -ENOMEM; + goto fail; + } else { + src->intr_handle = *intr_handle; + TAILQ_INIT(&src->callbacks); + TAILQ_INSERT_TAIL(&(src->callbacks), callback, next); + TAILQ_INSERT_TAIL(&intr_sources, src, next); + add_event = 1; + ret = 0; + } + } + + /* add events to the queue. timer events are special as we need to + * re-set the timer. + */ + if (add_event || src->intr_handle.type == RTE_INTR_HANDLE_ALARM) { + struct kevent ke; + + memset(&ke, 0, sizeof(ke)); + ke.flags = EV_ADD; /* mark for addition to the queue */ + + if (intr_source_to_kevent(intr_handle, &ke) < 0) { + RTE_LOG(ERR, EAL, "Cannot convert interrupt handle to kevent\n"); + ret = -ENODEV; + goto fail; + } + + /** + * add the intr file descriptor into wait list. + */ + if (kevent(kq, &ke, 1, NULL, 0, NULL) < 0) { + /* currently, nic_uio does not support interrupts, so + * this error will always be triggered and output to the + * user. so, don't output it unless debug log level set. 
+ */ + if (errno == ENODEV) + RTE_LOG(DEBUG, EAL, "Interrupt handle %d not supported\n", + src->intr_handle.fd); + else + RTE_LOG(ERR, EAL, "Error adding fd %d " + "kevent, %s\n", + src->intr_handle.fd, + strerror(errno)); + ret = -errno; + goto fail; + } + } + rte_spinlock_unlock(&intr_lock); + + return ret; +fail: + /* clean up */ + if (src != NULL) { + TAILQ_REMOVE(&(src->callbacks), callback, next); + if (TAILQ_EMPTY(&(src->callbacks))) { + TAILQ_REMOVE(&intr_sources, src, next); + free(src); + } + } + free(callback); + rte_spinlock_unlock(&intr_lock); + return ret; } int rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle, - rte_intr_callback_fn cb, - void *cb_arg) + rte_intr_callback_fn cb_fn, void *cb_arg) { - RTE_SET_USED(intr_handle); - RTE_SET_USED(cb); - RTE_SET_USED(cb_arg); + int ret; + struct rte_intr_source *src; + struct rte_intr_callback *cb, *next; - return -ENOTSUP; + /* do parameter checking first */ + if (intr_handle == NULL || intr_handle->fd < 0) { + RTE_LOG(ERR, EAL, + "Unregistering with invalid input parameter\n"); + return -EINVAL; + } + if (kq < 0) { + RTE_LOG(ERR, EAL, "Kqueue is not active\n"); + return -ENODEV; + } + + rte_spinlock_lock(&intr_lock); + + /* check if the insterrupt source for the fd is existent */ + TAILQ_FOREACH(src, &intr_sources, next) + if (src->intr_handle.fd == intr_handle->fd) + break; + + /* No interrupt source registered for the fd */ + if (src == NULL) { + ret = -ENOENT; + + /* interrupt source has some active callbacks right now. */ + } else if (src->active != 0) { + ret = -EAGAIN; + + /* ok to remove. */ + } else { + struct kevent ke; + + ret = 0; + + /* remove it from the kqueue */ + memset(&ke, 0, sizeof(ke)); + ke.flags = EV_DELETE; /* mark for deletion from the queue */ + + if (intr_source_to_kevent(intr_handle, &ke) < 0) { + RTE_LOG(ERR, EAL, "Cannot convert to kevent\n"); + ret = -ENODEV; + goto out; + } + + /** + * remove intr file descriptor from wait list. + */ + if (kevent(kq, &ke, 1, NULL, 0, NULL) < 0) { + RTE_LOG(ERR, EAL, "Error removing fd %d kevent, %s\n", + src->intr_handle.fd, strerror(errno)); + /* removing non-existent even is an expected condition + * in some circumstances (e.g. oneshot events). + */ + } + + /*walk through the callbacks and remove all that match. */ + for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) { + next = TAILQ_NEXT(cb, next); + if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 || + cb->cb_arg == cb_arg)) { + TAILQ_REMOVE(&src->callbacks, cb, next); + free(cb); + ret++; + } + } + + /* all callbacks for that source are removed. 
*/ + if (TAILQ_EMPTY(&src->callbacks)) { + TAILQ_REMOVE(&intr_sources, src, next); + free(src); + } + } +out: + rte_spinlock_unlock(&intr_lock); + + return ret; } int -rte_intr_enable(const struct rte_intr_handle *intr_handle __rte_unused) +rte_intr_enable(const struct rte_intr_handle *intr_handle) { - return -ENOTSUP; + if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV) + return 0; + + if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) + return -1; + + switch (intr_handle->type) { + /* not used at this moment */ + case RTE_INTR_HANDLE_ALARM: + return -1; + /* not used at this moment */ + case RTE_INTR_HANDLE_DEV_EVENT: + return -1; + /* unknown handle type */ + default: + RTE_LOG(ERR, EAL, + "Unknown handle type of fd %d\n", + intr_handle->fd); + return -1; + } + + return 0; } int -rte_intr_disable(const struct rte_intr_handle *intr_handle __rte_unused) +rte_intr_disable(const struct rte_intr_handle *intr_handle) { - return -ENOTSUP; + if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV) + return 0; + + if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) + return -1; + + switch (intr_handle->type) { + /* not used at this moment */ + case RTE_INTR_HANDLE_ALARM: + return -1; + /* not used at this moment */ + case RTE_INTR_HANDLE_DEV_EVENT: + return -1; + /* unknown handle type */ + default: + RTE_LOG(ERR, EAL, + "Unknown handle type of fd %d\n", + intr_handle->fd); + return -1; + } + + return 0; +} + +static void +eal_intr_process_interrupts(struct kevent *events, int nfds) +{ + struct rte_intr_callback active_cb; + union rte_intr_read_buffer buf; + struct rte_intr_callback *cb; + struct rte_intr_source *src; + bool call = false; + int n, bytes_read; + + for (n = 0; n < nfds; n++) { + int event_fd = events[n].ident; + + rte_spinlock_lock(&intr_lock); + TAILQ_FOREACH(src, &intr_sources, next) + if (src->intr_handle.fd == event_fd) + break; + if (src == NULL) { + rte_spinlock_unlock(&intr_lock); + continue; + } + + /* mark this interrupt source as active and release the lock. */ + src->active = 1; + rte_spinlock_unlock(&intr_lock); + + /* set the length to be read dor different handle type */ + switch (src->intr_handle.type) { + case RTE_INTR_HANDLE_ALARM: + bytes_read = 0; + call = true; + break; + case RTE_INTR_HANDLE_VDEV: + case RTE_INTR_HANDLE_EXT: + bytes_read = 0; + call = true; + break; + case RTE_INTR_HANDLE_DEV_EVENT: + bytes_read = 0; + call = true; + break; + default: + bytes_read = 1; + break; + } + + if (bytes_read > 0) { + /** + * read out to clear the ready-to-be-read flag + * for epoll_wait. + */ + bytes_read = read(event_fd, &buf, bytes_read); + if (bytes_read < 0) { + if (errno == EINTR || errno == EWOULDBLOCK) + continue; + + RTE_LOG(ERR, EAL, "Error reading from file " + "descriptor %d: %s\n", + event_fd, + strerror(errno)); + } else if (bytes_read == 0) + RTE_LOG(ERR, EAL, "Read nothing from file " + "descriptor %d\n", event_fd); + else + call = true; + } + + /* grab a lock, again to call callbacks and update status. */ + rte_spinlock_lock(&intr_lock); + + if (call) { + /* Finally, call all callbacks. */ + TAILQ_FOREACH(cb, &src->callbacks, next) { + + /* make a copy and unlock. */ + active_cb = *cb; + rte_spinlock_unlock(&intr_lock); + + /* call the actual callback */ + active_cb.cb_fn(active_cb.cb_arg); + + /*get the lock back. */ + rte_spinlock_lock(&intr_lock); + } + } + + /* we done with that interrupt source, release it. 
*/ + src->active = 0; + rte_spinlock_unlock(&intr_lock); + } +} + +static void * +eal_intr_thread_main(void *arg __rte_unused) +{ + struct kevent events[MAX_INTR_EVENTS]; + int nfds; + + /* host thread, never break out */ + for (;;) { + /* do not change anything, just wait */ + nfds = kevent(kq, NULL, 0, events, MAX_INTR_EVENTS, NULL); + + /* kevent fail */ + if (nfds < 0) { + if (errno == EINTR) + continue; + RTE_LOG(ERR, EAL, + "kevent returns with fail\n"); + break; + } + /* kevent timeout, will never happen here */ + else if (nfds == 0) + continue; + + /* kevent has at least one fd ready to read */ + eal_intr_process_interrupts(events, nfds); + } + close(kq); + kq = -1; + return NULL; } int rte_eal_intr_init(void) { - return 0; + int ret = 0; + + /* init the global interrupt source head */ + TAILQ_INIT(&intr_sources); + + kq = kqueue(); + if (kq < 0) { + RTE_LOG(ERR, EAL, "Cannot create kqueue instance\n"); + return -1; + } + + /* create the host thread to wait/handle the interrupt */ + ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL, + eal_intr_thread_main, NULL); + if (ret != 0) { + rte_errno = -ret; + RTE_LOG(ERR, EAL, + "Failed to create thread for interrupt handling\n"); + } + + return ret; } int diff --git a/lib/librte_eal/bsdapp/eal/eal_memory.c b/lib/librte_eal/bsdapp/eal/eal_memory.c index a5e03478..16d2bc7c 100644 --- a/lib/librte_eal/bsdapp/eal/eal_memory.c +++ b/lib/librte_eal/bsdapp/eal/eal_memory.c @@ -12,6 +12,7 @@ #include <rte_eal.h> #include <rte_eal_memconfig.h> +#include <rte_errno.h> #include <rte_log.h> #include <rte_string_fns.h> #include "eal_private.h" @@ -104,6 +105,8 @@ rte_eal_hugepage_init(void) /* map all hugepages and sort them */ for (i = 0; i < internal_config.num_hugepage_sizes; i ++){ struct hugepage_info *hpi; + rte_iova_t prev_end = 0; + int prev_ms_idx = -1; uint64_t page_sz, mem_needed; unsigned int n_pages, max_pages; @@ -124,10 +127,27 @@ rte_eal_hugepage_init(void) int error; size_t sysctl_size = sizeof(physaddr); char physaddr_str[64]; + bool is_adjacent; + + /* first, check if this segment is IOVA-adjacent to + * the previous one. + */ + snprintf(physaddr_str, sizeof(physaddr_str), + "hw.contigmem.physaddr.%d", j); + error = sysctlbyname(physaddr_str, &physaddr, + &sysctl_size, NULL, 0); + if (error < 0) { + RTE_LOG(ERR, EAL, "Failed to get physical addr for buffer %u " + "from %s\n", j, hpi->hugedir); + return -1; + } + + is_adjacent = prev_end != 0 && physaddr == prev_end; + prev_end = physaddr + hpi->hugepage_sz; for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) { - bool empty; + bool empty, need_hole; msl = &mcfg->memsegs[msl_idx]; arr = &msl->memseg_arr; @@ -136,20 +156,23 @@ rte_eal_hugepage_init(void) empty = arr->count == 0; - /* we need 1, plus hole if not empty */ + /* we need a hole if this isn't an empty memseg + * list, and if previous segment was not + * adjacent to current one. + */ + need_hole = !empty && !is_adjacent; + + /* we need 1, plus hole if not adjacent */ ms_idx = rte_fbarray_find_next_n_free(arr, - 0, 1 + (empty ? 1 : 0)); + 0, 1 + (need_hole ? 1 : 0)); /* memseg list is full? */ if (ms_idx < 0) continue; - /* leave some space between memsegs, they are - * not IOVA contiguous, so they shouldn't be VA - * contiguous either. 
- */ - if (!empty) + if (need_hole && prev_ms_idx == ms_idx - 1) ms_idx++; + prev_ms_idx = ms_idx; break; } @@ -178,16 +201,6 @@ rte_eal_hugepage_init(void) return -1; } - snprintf(physaddr_str, sizeof(physaddr_str), "hw.contigmem" - ".physaddr.%d", j); - error = sysctlbyname(physaddr_str, &physaddr, &sysctl_size, - NULL, 0); - if (error < 0) { - RTE_LOG(ERR, EAL, "Failed to get physical addr for buffer %u " - "from %s\n", j, hpi->hugedir); - return -1; - } - seg->addr = addr; seg->iova = physaddr; seg->hugepage_sz = page_sz; @@ -200,7 +213,7 @@ rte_eal_hugepage_init(void) RTE_LOG(INFO, EAL, "Mapped memory segment %u @ %p: physaddr:0x%" PRIx64", len %zu\n", - seg_idx, addr, physaddr, page_sz); + seg_idx++, addr, physaddr, page_sz); total_mem += seg->len; } @@ -288,3 +301,217 @@ rte_eal_using_phys_addrs(void) { return 0; } + +static uint64_t +get_mem_amount(uint64_t page_sz, uint64_t max_mem) +{ + uint64_t area_sz, max_pages; + + /* limit to RTE_MAX_MEMSEG_PER_LIST pages or RTE_MAX_MEM_MB_PER_LIST */ + max_pages = RTE_MAX_MEMSEG_PER_LIST; + max_mem = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20, max_mem); + + area_sz = RTE_MIN(page_sz * max_pages, max_mem); + + /* make sure the list isn't smaller than the page size */ + area_sz = RTE_MAX(area_sz, page_sz); + + return RTE_ALIGN(area_sz, page_sz); +} + +#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i" +static int +alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz, + int n_segs, int socket_id, int type_msl_idx) +{ + char name[RTE_FBARRAY_NAME_LEN]; + + snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id, + type_msl_idx); + if (rte_fbarray_init(&msl->memseg_arr, name, n_segs, + sizeof(struct rte_memseg))) { + RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n", + rte_strerror(rte_errno)); + return -1; + } + + msl->page_sz = page_sz; + msl->socket_id = socket_id; + msl->base_va = NULL; + + RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n", + (size_t)page_sz >> 10, socket_id); + + return 0; +} + +static int +alloc_va_space(struct rte_memseg_list *msl) +{ + uint64_t page_sz; + size_t mem_sz; + void *addr; + int flags = 0; + +#ifdef RTE_ARCH_PPC_64 + flags |= MAP_HUGETLB; +#endif + + page_sz = msl->page_sz; + mem_sz = page_sz * msl->memseg_arr.len; + + addr = eal_get_virtual_area(msl->base_va, &mem_sz, page_sz, 0, flags); + if (addr == NULL) { + if (rte_errno == EADDRNOTAVAIL) + RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - please use '--base-virtaddr' option\n", + (unsigned long long)mem_sz, msl->base_va); + else + RTE_LOG(ERR, EAL, "Cannot reserve memory\n"); + return -1; + } + msl->base_va = addr; + + return 0; +} + + +static int +memseg_primary_init(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int hpi_idx, msl_idx = 0; + struct rte_memseg_list *msl; + uint64_t max_mem, total_mem; + + /* no-huge does not need this at all */ + if (internal_config.no_hugetlbfs) + return 0; + + /* FreeBSD has an issue where core dump will dump the entire memory + * contents, including anonymous zero-page memory. Therefore, while we + * will be limiting total amount of memory to RTE_MAX_MEM_MB, we will + * also be further limiting total memory amount to whatever memory is + * available to us through contigmem driver (plus spacing blocks). + * + * so, at each stage, we will be checking how much memory we are + * preallocating, and adjust all the values accordingly. 
+ */ + + max_mem = (uint64_t)RTE_MAX_MEM_MB << 20; + total_mem = 0; + + /* create memseg lists */ + for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes; + hpi_idx++) { + uint64_t max_type_mem, total_type_mem = 0; + uint64_t avail_mem; + int type_msl_idx, max_segs, avail_segs, total_segs = 0; + struct hugepage_info *hpi; + uint64_t hugepage_sz; + + hpi = &internal_config.hugepage_info[hpi_idx]; + hugepage_sz = hpi->hugepage_sz; + + /* no NUMA support on FreeBSD */ + + /* check if we've already exceeded total memory amount */ + if (total_mem >= max_mem) + break; + + /* first, calculate theoretical limits according to config */ + max_type_mem = RTE_MIN(max_mem - total_mem, + (uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20); + max_segs = RTE_MAX_MEMSEG_PER_TYPE; + + /* now, limit all of that to whatever will actually be + * available to us, because without dynamic allocation support, + * all of that extra memory will be sitting there being useless + * and slowing down core dumps in case of a crash. + * + * we need (N*2)-1 segments because we cannot guarantee that + * each segment will be IOVA-contiguous with the previous one, + * so we will allocate more and put spaces inbetween segments + * that are non-contiguous. + */ + avail_segs = (hpi->num_pages[0] * 2) - 1; + avail_mem = avail_segs * hugepage_sz; + + max_type_mem = RTE_MIN(avail_mem, max_type_mem); + max_segs = RTE_MIN(avail_segs, max_segs); + + type_msl_idx = 0; + while (total_type_mem < max_type_mem && + total_segs < max_segs) { + uint64_t cur_max_mem, cur_mem; + unsigned int n_segs; + + if (msl_idx >= RTE_MAX_MEMSEG_LISTS) { + RTE_LOG(ERR, EAL, + "No more space in memseg lists, please increase %s\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); + return -1; + } + + msl = &mcfg->memsegs[msl_idx++]; + + cur_max_mem = max_type_mem - total_type_mem; + + cur_mem = get_mem_amount(hugepage_sz, + cur_max_mem); + n_segs = cur_mem / hugepage_sz; + + if (alloc_memseg_list(msl, hugepage_sz, n_segs, + 0, type_msl_idx)) + return -1; + + total_segs += msl->memseg_arr.len; + total_type_mem = total_segs * hugepage_sz; + type_msl_idx++; + + if (alloc_va_space(msl)) { + RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n"); + return -1; + } + } + total_mem += total_type_mem; + } + return 0; +} + +static int +memseg_secondary_init(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int msl_idx = 0; + struct rte_memseg_list *msl; + + for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) { + + msl = &mcfg->memsegs[msl_idx]; + + /* skip empty memseg lists */ + if (msl->memseg_arr.len == 0) + continue; + + if (rte_fbarray_attach(&msl->memseg_arr)) { + RTE_LOG(ERR, EAL, "Cannot attach to primary process memseg lists\n"); + return -1; + } + + /* preallocate VA space */ + if (alloc_va_space(msl)) { + RTE_LOG(ERR, EAL, "Cannot preallocate VA space for hugepage memory\n"); + return -1; + } + } + + return 0; +} + +int +rte_eal_memseg_init(void) +{ + return rte_eal_process_type() == RTE_PROC_PRIMARY ? 
+ memseg_primary_init() : + memseg_secondary_init(); +} diff --git a/lib/librte_eal/bsdapp/eal/meson.build b/lib/librte_eal/bsdapp/eal/meson.build index 47e16a64..3945b529 100644 --- a/lib/librte_eal/bsdapp/eal/meson.build +++ b/lib/librte_eal/bsdapp/eal/meson.build @@ -16,3 +16,5 @@ env_sources = files('eal_alarm.c', 'eal_memory.c', 'eal_dev.c' ) + +deps += ['kvargs'] diff --git a/lib/librte_eal/common/Makefile b/lib/librte_eal/common/Makefile index 48f870f2..cca68826 100644 --- a/lib/librte_eal/common/Makefile +++ b/lib/librte_eal/common/Makefile @@ -11,12 +11,12 @@ INC += rte_per_lcore.h rte_random.h INC += rte_tailq.h rte_interrupts.h rte_alarm.h INC += rte_string_fns.h rte_version.h INC += rte_eal_memconfig.h rte_malloc_heap.h -INC += rte_hexdump.h rte_devargs.h rte_bus.h rte_dev.h +INC += rte_hexdump.h rte_devargs.h rte_bus.h rte_dev.h rte_class.h INC += rte_pci_dev_feature_defs.h rte_pci_dev_features.h INC += rte_malloc.h rte_keepalive.h rte_time.h INC += rte_service.h rte_service_component.h INC += rte_bitmap.h rte_vfio.h rte_hypervisor.h rte_test.h -INC += rte_reciprocal.h rte_fbarray.h +INC += rte_reciprocal.h rte_fbarray.h rte_uuid.h GENERIC_INC := rte_atomic.h rte_byteorder.h rte_cycles.h rte_prefetch.h GENERIC_INC += rte_spinlock.h rte_memcpy.h rte_cpuflags.h rte_rwlock.h diff --git a/lib/librte_eal/common/eal_common_class.c b/lib/librte_eal/common/eal_common_class.c new file mode 100644 index 00000000..404a9065 --- /dev/null +++ b/lib/librte_eal/common/eal_common_class.c @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 Gaëtan Rivet + */ + +#include <stdio.h> +#include <string.h> +#include <sys/queue.h> + +#include <rte_class.h> +#include <rte_debug.h> + +struct rte_class_list rte_class_list = + TAILQ_HEAD_INITIALIZER(rte_class_list); + +__rte_experimental void +rte_class_register(struct rte_class *class) +{ + RTE_VERIFY(class); + RTE_VERIFY(class->name && strlen(class->name)); + + TAILQ_INSERT_TAIL(&rte_class_list, class, next); + RTE_LOG(DEBUG, EAL, "Registered [%s] device class.\n", class->name); +} + +__rte_experimental void +rte_class_unregister(struct rte_class *class) +{ + TAILQ_REMOVE(&rte_class_list, class, next); + RTE_LOG(DEBUG, EAL, "Unregistered [%s] device class.\n", class->name); +} + +__rte_experimental +struct rte_class * +rte_class_find(const struct rte_class *start, rte_class_cmp_t cmp, + const void *data) +{ + struct rte_class *cls; + + if (start != NULL) + cls = TAILQ_NEXT(start, next); + else + cls = TAILQ_FIRST(&rte_class_list); + while (cls != NULL) { + if (cmp(cls, data) == 0) + break; + cls = TAILQ_NEXT(cls, next); + } + return cls; +} + +static int +cmp_class_name(const struct rte_class *class, const void *_name) +{ + const char *name = _name; + + return strcmp(class->name, name); +} + +__rte_experimental +struct rte_class * +rte_class_find_by_name(const char *name) +{ + return rte_class_find(NULL, cmp_class_name, (const void *)name); +} diff --git a/lib/librte_eal/common/eal_common_dev.c b/lib/librte_eal/common/eal_common_dev.c index 61cb3b16..678dbcac 100644 --- a/lib/librte_eal/common/eal_common_dev.c +++ b/lib/librte_eal/common/eal_common_dev.c @@ -10,9 +10,12 @@ #include <rte_compat.h> #include <rte_bus.h> +#include <rte_class.h> #include <rte_dev.h> #include <rte_devargs.h> #include <rte_debug.h> +#include <rte_errno.h> +#include <rte_kvargs.h> #include <rte_log.h> #include <rte_spinlock.h> #include <rte_malloc.h> @@ -42,17 +45,27 @@ static struct dev_event_cb_list dev_event_cbs; /* spinlock for device callbacks */ 
static rte_spinlock_t dev_event_lock = RTE_SPINLOCK_INITIALIZER; -static int cmp_detached_dev_name(const struct rte_device *dev, - const void *_name) -{ - const char *name = _name; +struct dev_next_ctx { + struct rte_dev_iterator *it; + const char *bus_str; + const char *cls_str; +}; - /* skip attached devices */ - if (dev->driver != NULL) - return 1; +#define CTX(it, bus_str, cls_str) \ + (&(const struct dev_next_ctx){ \ + .it = it, \ + .bus_str = bus_str, \ + .cls_str = cls_str, \ + }) - return strcmp(dev->name, name); -} +#define ITCTX(ptr) \ + (((struct dev_next_ctx *)(intptr_t)ptr)->it) + +#define BUSCTX(ptr) \ + (((struct dev_next_ctx *)(intptr_t)ptr)->bus_str) + +#define CLSCTX(ptr) \ + (((struct dev_next_ctx *)(intptr_t)ptr)->cls_str) static int cmp_dev_name(const struct rte_device *dev, const void *_name) { @@ -138,8 +151,8 @@ int __rte_experimental rte_eal_hotplug_add(const char *busname, const char *devn if (da == NULL) return -ENOMEM; - ret = rte_devargs_parse(da, "%s:%s,%s", - busname, devname, devargs); + ret = rte_devargs_parsef(da, "%s:%s,%s", + busname, devname, devargs); if (ret) goto err_devarg; @@ -151,14 +164,19 @@ int __rte_experimental rte_eal_hotplug_add(const char *busname, const char *devn if (ret) goto err_devarg; - dev = bus->find_device(NULL, cmp_detached_dev_name, devname); + dev = bus->find_device(NULL, cmp_dev_name, devname); if (dev == NULL) { - RTE_LOG(ERR, EAL, "Cannot find unplugged device (%s)\n", + RTE_LOG(ERR, EAL, "Cannot find device (%s)\n", devname); ret = -ENODEV; goto err_devarg; } + if (dev->driver != NULL) { + RTE_LOG(ERR, EAL, "Device is already plugged\n"); + return -EEXIST; + } + ret = bus->plug(dev); if (ret) { RTE_LOG(ERR, EAL, "Driver cannot attach the device (%s)\n", @@ -200,6 +218,11 @@ rte_eal_hotplug_remove(const char *busname, const char *devname) return -EINVAL; } + if (dev->driver == NULL) { + RTE_LOG(ERR, EAL, "Device is already unplugged\n"); + return -ENOENT; + } + ret = bus->unplug(dev); if (ret) RTE_LOG(ERR, EAL, "Driver cannot detach the device (%s)\n", @@ -343,3 +366,201 @@ dev_callback_process(char *device_name, enum rte_dev_event_type event) } rte_spinlock_unlock(&dev_event_lock); } + +__rte_experimental +int +rte_dev_iterator_init(struct rte_dev_iterator *it, + const char *dev_str) +{ + struct rte_devargs devargs; + struct rte_class *cls = NULL; + struct rte_bus *bus = NULL; + + /* Having both bus_str and cls_str NULL is illegal, + * marking this iterator as invalid unless + * everything goes well. + */ + it->bus_str = NULL; + it->cls_str = NULL; + + devargs.data = dev_str; + if (rte_devargs_layers_parse(&devargs, dev_str)) + goto get_out; + + bus = devargs.bus; + cls = devargs.cls; + /* The string should have at least + * one layer specified. 
+ */ + if (bus == NULL && cls == NULL) { + RTE_LOG(ERR, EAL, + "Either bus or class must be specified.\n"); + rte_errno = EINVAL; + goto get_out; + } + if (bus != NULL && bus->dev_iterate == NULL) { + RTE_LOG(ERR, EAL, "Bus %s not supported\n", bus->name); + rte_errno = ENOTSUP; + goto get_out; + } + if (cls != NULL && cls->dev_iterate == NULL) { + RTE_LOG(ERR, EAL, "Class %s not supported\n", cls->name); + rte_errno = ENOTSUP; + goto get_out; + } + it->bus_str = devargs.bus_str; + it->cls_str = devargs.cls_str; + it->dev_str = dev_str; + it->bus = bus; + it->cls = cls; + it->device = NULL; + it->class_device = NULL; +get_out: + return -rte_errno; +} + +static char * +dev_str_sane_copy(const char *str) +{ + size_t end; + char *copy; + + end = strcspn(str, ",/"); + if (str[end] == ',') { + copy = strdup(&str[end + 1]); + } else { + /* '/' or '\0' */ + copy = strdup(""); + } + if (copy == NULL) { + rte_errno = ENOMEM; + } else { + char *slash; + + slash = strchr(copy, '/'); + if (slash != NULL) + slash[0] = '\0'; + } + return copy; +} + +static int +class_next_dev_cmp(const struct rte_class *cls, + const void *ctx) +{ + struct rte_dev_iterator *it; + const char *cls_str = NULL; + void *dev; + + if (cls->dev_iterate == NULL) + return 1; + it = ITCTX(ctx); + cls_str = CLSCTX(ctx); + dev = it->class_device; + /* it->cls_str != NULL means a class + * was specified in the devstr. + */ + if (it->cls_str != NULL && cls != it->cls) + return 1; + /* If an error occurred previously, + * no need to test further. + */ + if (rte_errno != 0) + return -1; + dev = cls->dev_iterate(dev, cls_str, it); + it->class_device = dev; + return dev == NULL; +} + +static int +bus_next_dev_cmp(const struct rte_bus *bus, + const void *ctx) +{ + struct rte_device *dev = NULL; + struct rte_class *cls = NULL; + struct rte_dev_iterator *it; + const char *bus_str = NULL; + + if (bus->dev_iterate == NULL) + return 1; + it = ITCTX(ctx); + bus_str = BUSCTX(ctx); + dev = it->device; + /* it->bus_str != NULL means a bus + * was specified in the devstr. + */ + if (it->bus_str != NULL && bus != it->bus) + return 1; + /* If an error occurred previously, + * no need to test further. + */ + if (rte_errno != 0) + return -1; + if (it->cls_str == NULL) { + dev = bus->dev_iterate(dev, bus_str, it); + goto end; + } + /* cls_str != NULL */ + if (dev == NULL) { +next_dev_on_bus: + dev = bus->dev_iterate(dev, bus_str, it); + it->device = dev; + } + if (dev == NULL) + return 1; + if (it->cls != NULL) + cls = TAILQ_PREV(it->cls, rte_class_list, next); + cls = rte_class_find(cls, class_next_dev_cmp, ctx); + if (cls != NULL) { + it->cls = cls; + goto end; + } + goto next_dev_on_bus; +end: + it->device = dev; + return dev == NULL; +} +__rte_experimental +struct rte_device * +rte_dev_iterator_next(struct rte_dev_iterator *it) +{ + struct rte_bus *bus = NULL; + int old_errno = rte_errno; + char *bus_str = NULL; + char *cls_str = NULL; + + rte_errno = 0; + if (it->bus_str == NULL && it->cls_str == NULL) { + /* Invalid iterator. 
*/ + rte_errno = EINVAL; + return NULL; + } + if (it->bus != NULL) + bus = TAILQ_PREV(it->bus, rte_bus_list, next); + if (it->bus_str != NULL) { + bus_str = dev_str_sane_copy(it->bus_str); + if (bus_str == NULL) + goto out; + } + if (it->cls_str != NULL) { + cls_str = dev_str_sane_copy(it->cls_str); + if (cls_str == NULL) + goto out; + } + while ((bus = rte_bus_find(bus, bus_next_dev_cmp, + CTX(it, bus_str, cls_str)))) { + if (it->device != NULL) { + it->bus = bus; + goto out; + } + if (it->bus_str != NULL || + rte_errno != 0) + break; + } + if (rte_errno == 0) + rte_errno = old_errno; +out: + free(bus_str); + free(cls_str); + return it->device; +} diff --git a/lib/librte_eal/common/eal_common_devargs.c b/lib/librte_eal/common/eal_common_devargs.c index b0434158..dac2402a 100644 --- a/lib/librte_eal/common/eal_common_devargs.c +++ b/lib/librte_eal/common/eal_common_devargs.c @@ -13,9 +13,14 @@ #include <string.h> #include <stdarg.h> +#include <rte_bus.h> +#include <rte_class.h> #include <rte_compat.h> #include <rte_dev.h> #include <rte_devargs.h> +#include <rte_errno.h> +#include <rte_kvargs.h> +#include <rte_log.h> #include <rte_tailq.h> #include "eal_private.h" @@ -56,30 +61,164 @@ rte_eal_parse_devargs_str(const char *devargs_str, return 0; } +static size_t +devargs_layer_count(const char *s) +{ + size_t i = s ? 1 : 0; + + while (s != NULL && s[0] != '\0') { + i += s[0] == '/'; + s++; + } + return i; +} + +int +rte_devargs_layers_parse(struct rte_devargs *devargs, + const char *devstr) +{ + struct { + const char *key; + const char *str; + struct rte_kvargs *kvlist; + } layers[] = { + { "bus=", NULL, NULL, }, + { "class=", NULL, NULL, }, + { "driver=", NULL, NULL, }, + }; + struct rte_kvargs_pair *kv = NULL; + struct rte_class *cls = NULL; + struct rte_bus *bus = NULL; + const char *s = devstr; + size_t nblayer; + size_t i = 0; + int ret = 0; + + /* Split each sub-lists. */ + nblayer = devargs_layer_count(devstr); + if (nblayer > RTE_DIM(layers)) { + RTE_LOG(ERR, EAL, "Invalid format: too many layers (%zu)\n", + nblayer); + ret = -E2BIG; + goto get_out; + } + + /* If the devargs points the devstr + * as source data, then it should not allocate + * anything and keep referring only to it. + */ + if (devargs->data != devstr) { + devargs->data = strdup(devstr); + if (devargs->data == NULL) { + RTE_LOG(ERR, EAL, "OOM\n"); + ret = -ENOMEM; + goto get_out; + } + s = devargs->data; + } + + while (s != NULL) { + if (i >= RTE_DIM(layers)) { + RTE_LOG(ERR, EAL, "Unrecognized layer %s\n", s); + ret = -EINVAL; + goto get_out; + } + /* + * The last layer is free-form. + * The "driver" key is not required (but accepted). + */ + if (strncmp(layers[i].key, s, strlen(layers[i].key)) && + i != RTE_DIM(layers) - 1) + goto next_layer; + layers[i].str = s; + layers[i].kvlist = rte_kvargs_parse_delim(s, NULL, "/"); + if (layers[i].kvlist == NULL) { + RTE_LOG(ERR, EAL, "Could not parse %s\n", s); + ret = -EINVAL; + goto get_out; + } + s = strchr(s, '/'); + if (s != NULL) + s++; +next_layer: + i++; + } + + /* Parse each sub-list. 
*/ + for (i = 0; i < RTE_DIM(layers); i++) { + if (layers[i].kvlist == NULL) + continue; + kv = &layers[i].kvlist->pairs[0]; + if (strcmp(kv->key, "bus") == 0) { + bus = rte_bus_find_by_name(kv->value); + if (bus == NULL) { + RTE_LOG(ERR, EAL, "Could not find bus \"%s\"\n", + kv->value); + ret = -EFAULT; + goto get_out; + } + } else if (strcmp(kv->key, "class") == 0) { + cls = rte_class_find_by_name(kv->value); + if (cls == NULL) { + RTE_LOG(ERR, EAL, "Could not find class \"%s\"\n", + kv->value); + ret = -EFAULT; + goto get_out; + } + } else if (strcmp(kv->key, "driver") == 0) { + /* Ignore */ + continue; + } + } + + /* Fill devargs fields. */ + devargs->bus_str = layers[0].str; + devargs->cls_str = layers[1].str; + devargs->drv_str = layers[2].str; + devargs->bus = bus; + devargs->cls = cls; + + /* If we own the data, clean up a bit + * the several layers string, to ease + * their parsing afterward. + */ + if (devargs->data != devstr) { + char *s = (void *)(intptr_t)(devargs->data); + + while ((s = strchr(s, '/'))) { + *s = '\0'; + s++; + } + } + +get_out: + for (i = 0; i < RTE_DIM(layers); i++) { + if (layers[i].kvlist) + rte_kvargs_free(layers[i].kvlist); + } + if (ret != 0) + rte_errno = -ret; + return ret; +} + static int bus_name_cmp(const struct rte_bus *bus, const void *name) { return strncmp(bus->name, name, strlen(bus->name)); } -int __rte_experimental -rte_devargs_parse(struct rte_devargs *da, const char *format, ...) +__rte_experimental +int +rte_devargs_parse(struct rte_devargs *da, const char *dev) { struct rte_bus *bus = NULL; - va_list ap; - va_start(ap, format); - char dev[vsnprintf(NULL, 0, format, ap) + 1]; const char *devname; const size_t maxlen = sizeof(da->name); size_t i; - va_end(ap); if (da == NULL) return -EINVAL; - va_start(ap, format); - vsnprintf(dev, sizeof(dev), format, ap); - va_end(ap); /* Retrieve eventual bus info */ do { devname = dev; @@ -96,7 +235,7 @@ rte_devargs_parse(struct rte_devargs *da, const char *format, ...) da->name[i] = devname[i]; i++; if (i == maxlen) { - fprintf(stderr, "WARNING: Parsing \"%s\": device name should be shorter than %zu\n", + RTE_LOG(WARNING, EAL, "Parsing \"%s\": device name should be shorter than %zu\n", dev, maxlen); da->name[i - 1] = '\0'; return -EINVAL; @@ -106,7 +245,7 @@ rte_devargs_parse(struct rte_devargs *da, const char *format, ...) if (bus == NULL) { bus = rte_bus_find_by_device_name(da->name); if (bus == NULL) { - fprintf(stderr, "ERROR: failed to parse device \"%s\"\n", + RTE_LOG(ERR, EAL, "failed to parse device \"%s\"\n", da->name); return -EFAULT; } @@ -118,12 +257,40 @@ rte_devargs_parse(struct rte_devargs *da, const char *format, ...) else da->args = strdup(""); if (da->args == NULL) { - fprintf(stderr, "ERROR: not enough memory to parse arguments\n"); + RTE_LOG(ERR, EAL, "not enough memory to parse arguments\n"); return -ENOMEM; } return 0; } +__rte_experimental +int +rte_devargs_parsef(struct rte_devargs *da, const char *format, ...) 
+{ + va_list ap; + size_t len; + char *dev; + + if (da == NULL) + return -EINVAL; + + va_start(ap, format); + len = vsnprintf(NULL, 0, format, ap); + va_end(ap); + + dev = calloc(1, len + 1); + if (dev == NULL) { + RTE_LOG(ERR, EAL, "not enough memory to parse device\n"); + return -ENOMEM; + } + + va_start(ap, format); + vsnprintf(dev, len + 1, format, ap); + va_end(ap); + + return rte_devargs_parse(da, dev); +} + int __rte_experimental rte_devargs_insert(struct rte_devargs *da) { @@ -150,7 +317,7 @@ rte_devargs_add(enum rte_devtype devtype, const char *devargs_str) if (devargs == NULL) goto fail; - if (rte_devargs_parse(devargs, "%s", dev)) + if (rte_devargs_parse(devargs, dev)) goto fail; devargs->type = devtype; bus = devargs->bus; diff --git a/lib/librte_eal/common/eal_common_fbarray.c b/lib/librte_eal/common/eal_common_fbarray.c index 019f84c1..43caf3ce 100644 --- a/lib/librte_eal/common/eal_common_fbarray.c +++ b/lib/librte_eal/common/eal_common_fbarray.c @@ -231,7 +231,7 @@ find_next_n(const struct rte_fbarray *arr, unsigned int start, unsigned int n, return MASK_GET_IDX(msk_idx, run_start); } /* we didn't find anything */ - rte_errno = used ? -ENOENT : -ENOSPC; + rte_errno = used ? ENOENT : ENOSPC; return -1; } @@ -287,7 +287,7 @@ find_next(const struct rte_fbarray *arr, unsigned int start, bool used) return MASK_GET_IDX(idx, found); } /* we didn't find anything */ - rte_errno = used ? -ENOENT : -ENOSPC; + rte_errno = used ? ENOENT : ENOSPC; return -1; } @@ -353,6 +353,277 @@ find_contig(const struct rte_fbarray *arr, unsigned int start, bool used) } static int +find_prev_n(const struct rte_fbarray *arr, unsigned int start, unsigned int n, + bool used) +{ + const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz, + arr->len); + unsigned int msk_idx, lookbehind_idx, first, first_mod; + uint64_t ignore_msk; + + /* + * mask only has granularity of MASK_ALIGN, but start may not be aligned + * on that boundary, so construct a special mask to exclude anything we + * don't want to see to avoid confusing ctz. + */ + first = MASK_LEN_TO_IDX(start); + first_mod = MASK_LEN_TO_MOD(start); + /* we're going backwards, so mask must start from the top */ + ignore_msk = first_mod == MASK_ALIGN - 1 ? + -1ULL : /* prevent overflow */ + ~(-1ULL << (first_mod + 1)); + + /* go backwards, include zero */ + msk_idx = first; + do { + uint64_t cur_msk, lookbehind_msk; + unsigned int run_start, run_end, ctz, left; + bool found = false; + /* + * The process of getting n consecutive bits from the top for + * arbitrary n is a bit involved, but here it is in a nutshell: + * + * 1. let n be the number of consecutive bits we're looking for + * 2. check if n can fit in one mask, and if so, do n-1 + * lshift-ands to see if there is an appropriate run inside + * our current mask + * 2a. if we found a run, bail out early + * 2b. if we didn't find a run, proceed + * 3. invert the mask and count trailing zeroes (that is, count + * how many consecutive set bits we had starting from the + * start of current mask) as k + * 3a. if k is 0, continue to next mask + * 3b. if k is not 0, we have a potential run + * 4. to satisfy our requirements, next mask must have n-k + * consecutive set bits at the end, so we will do (n-k-1) + * lshift-ands and check if last bit is set. + * + * Step 4 will need to be repeated if (n-k) > MASK_ALIGN until + * we either run out of masks, lose the run, or find what we + * were looking for. 
+	 */
+		cur_msk = msk->data[msk_idx];
+		left = n;
+
+		/* if we're looking for free spaces, invert the mask */
+		if (!used)
+			cur_msk = ~cur_msk;
+
+		/* if we have an ignore mask, ignore once */
+		if (ignore_msk) {
+			cur_msk &= ignore_msk;
+			ignore_msk = 0;
+		}
+
+		/* if n can fit within a single mask, do a search */
+		if (n <= MASK_ALIGN) {
+			uint64_t tmp_msk = cur_msk;
+			unsigned int s_idx;
+			for (s_idx = 0; s_idx < n - 1; s_idx++)
+				tmp_msk &= tmp_msk << 1ULL;
+			/* we found what we were looking for */
+			if (tmp_msk != 0) {
+				/* clz will give us offset from end of mask, and
+				 * we only get the end of our run, not start,
+				 * so adjust result to point to where start
+				 * would have been.
+				 */
+				run_start = MASK_ALIGN -
+						__builtin_clzll(tmp_msk) - n;
+				return MASK_GET_IDX(msk_idx, run_start);
+			}
+		}
+
+		/*
+		 * we didn't find our run within the mask, or n > MASK_ALIGN,
+		 * so we're going for plan B.
+		 */
+
+		/* count trailing zeroes on inverted mask */
+		if (~cur_msk == 0)
+			ctz = sizeof(cur_msk) * 8;
+		else
+			ctz = __builtin_ctzll(~cur_msk);
+
+		/* if there aren't any runs at the start either, just
+		 * continue
+		 */
+		if (ctz == 0)
+			continue;
+
+		/* we have a partial run at the start, so try looking behind */
+		run_end = MASK_GET_IDX(msk_idx, ctz);
+		left -= ctz;
+
+		/* go backwards, include zero */
+		lookbehind_idx = msk_idx - 1;
+
+		/* we can't lookbehind as we've run out of masks, so stop */
+		if (msk_idx == 0)
+			break;
+
+		do {
+			const uint64_t last_bit = 1ULL << (MASK_ALIGN - 1);
+			unsigned int s_idx, need;
+
+			lookbehind_msk = msk->data[lookbehind_idx];
+
+			/* if we're looking for free space, invert the mask */
+			if (!used)
+				lookbehind_msk = ~lookbehind_msk;
+
+			/* figure out how many consecutive bits we need here */
+			need = RTE_MIN(left, MASK_ALIGN);
+
+			for (s_idx = 0; s_idx < need - 1; s_idx++)
+				lookbehind_msk &= lookbehind_msk << 1ULL;
+
+			/* if last bit is not set, we've lost the run */
+			if ((lookbehind_msk & last_bit) == 0) {
+				/*
+				 * we've scanned this far, so we know there are
+				 * no runs in the space we've lookbehind-scanned
+				 * as well, so skip that on next iteration.
+				 */
+				ignore_msk = -1ULL << need;
+				msk_idx = lookbehind_idx;
+				break;
+			}
+
+			left -= need;
+
+			/* check if we've found what we were looking for */
+			if (left == 0) {
+				found = true;
+				break;
+			}
+		} while ((lookbehind_idx--) != 0); /* decrement after check to
+						    * include zero
+						    */
+
+		/* we didn't find anything, so continue */
+		if (!found)
+			continue;
+
+		/* we've found what we were looking for, but we only know where
+		 * the run ended, so calculate start position.
+		 */
+		return run_end - n;
+	} while (msk_idx-- != 0); /* decrement after check to include zero */
+	/* we didn't find anything */
+	rte_errno = used ? ENOENT : ENOSPC;
+	return -1;
+}
+
+static int
+find_prev(const struct rte_fbarray *arr, unsigned int start, bool used)
+{
+	const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz,
+			arr->len);
+	unsigned int idx, first, first_mod;
+	uint64_t ignore_msk;
+
+	/*
+	 * mask only has granularity of MASK_ALIGN, but start may not be aligned
+	 * on that boundary, so construct a special mask to exclude anything we
+	 * don't want to see to avoid confusing clz.
+	 */
+	first = MASK_LEN_TO_IDX(start);
+	first_mod = MASK_LEN_TO_MOD(start);
+	/* we're going backwards, so mask must start from the top */
+	ignore_msk = first_mod == MASK_ALIGN - 1 ?
+ -1ULL : /* prevent overflow */ + ~(-1ULL << (first_mod + 1)); + + /* go backwards, include zero */ + idx = first; + do { + uint64_t cur = msk->data[idx]; + int found; + + /* if we're looking for free entries, invert mask */ + if (!used) + cur = ~cur; + + /* ignore everything before start on first iteration */ + if (idx == first) + cur &= ignore_msk; + + /* check if we have any entries */ + if (cur == 0) + continue; + + /* + * find last set bit - that will correspond to whatever it is + * that we're looking for. we're counting trailing zeroes, thus + * the value we get is counted from end of mask, so calculate + * position from start of mask. + */ + found = MASK_ALIGN - __builtin_clzll(cur) - 1; + + return MASK_GET_IDX(idx, found); + } while (idx-- != 0); /* decrement after check to include zero*/ + + /* we didn't find anything */ + rte_errno = used ? ENOENT : ENOSPC; + return -1; +} + +static int +find_rev_contig(const struct rte_fbarray *arr, unsigned int start, bool used) +{ + const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz, + arr->len); + unsigned int idx, first, first_mod; + unsigned int need_len, result = 0; + + first = MASK_LEN_TO_IDX(start); + first_mod = MASK_LEN_TO_MOD(start); + + /* go backwards, include zero */ + idx = first; + do { + uint64_t cur = msk->data[idx]; + unsigned int run_len; + + need_len = MASK_ALIGN; + + /* if we're looking for free entries, invert mask */ + if (!used) + cur = ~cur; + + /* ignore everything after start on first iteration */ + if (idx == first) { + unsigned int end_len = MASK_ALIGN - first_mod - 1; + cur <<= end_len; + /* at the start, we don't need the full mask len */ + need_len -= end_len; + } + + /* we will be looking for zeroes, so invert the mask */ + cur = ~cur; + + /* if mask is zero, we have a complete run */ + if (cur == 0) + goto endloop; + + /* + * see where run ends, starting from the end. + */ + run_len = __builtin_clzll(cur); + + /* add however many zeroes we've had in the last run and quit */ + if (run_len < need_len) { + result += run_len; + break; + } +endloop: + result += need_len; + } while (idx-- != 0); /* decrement after check to include zero */ + return result; +} + +static int set_used(struct rte_fbarray *arr, unsigned int idx, bool used) { struct used_mask *msk; @@ -434,39 +705,52 @@ rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len, if (data == NULL) goto fail; - eal_get_fbarray_path(path, sizeof(path), name); + if (internal_config.no_shconf) { + /* remap virtual area as writable */ + void *new_data = mmap(data, mmap_len, PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (new_data == MAP_FAILED) { + RTE_LOG(DEBUG, EAL, "%s(): couldn't remap anonymous memory: %s\n", + __func__, strerror(errno)); + goto fail; + } + } else { + eal_get_fbarray_path(path, sizeof(path), name); - /* - * Each fbarray is unique to process namespace, i.e. the filename - * depends on process prefix. Try to take out a lock and see if we - * succeed. If we don't, someone else is using it already. - */ - fd = open(path, O_CREAT | O_RDWR, 0600); - if (fd < 0) { - RTE_LOG(DEBUG, EAL, "%s(): couldn't open %s: %s\n", __func__, - path, strerror(errno)); - rte_errno = errno; - goto fail; - } else if (flock(fd, LOCK_EX | LOCK_NB)) { - RTE_LOG(DEBUG, EAL, "%s(): couldn't lock %s: %s\n", __func__, - path, strerror(errno)); - rte_errno = EBUSY; - goto fail; - } + /* + * Each fbarray is unique to process namespace, i.e. the + * filename depends on process prefix. 
Try to take out a lock + * and see if we succeed. If we don't, someone else is using it + * already. + */ + fd = open(path, O_CREAT | O_RDWR, 0600); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): couldn't open %s: %s\n", + __func__, path, strerror(errno)); + rte_errno = errno; + goto fail; + } else if (flock(fd, LOCK_EX | LOCK_NB)) { + RTE_LOG(DEBUG, EAL, "%s(): couldn't lock %s: %s\n", + __func__, path, strerror(errno)); + rte_errno = EBUSY; + goto fail; + } - /* take out a non-exclusive lock, so that other processes could still - * attach to it, but no other process could reinitialize it. - */ - if (flock(fd, LOCK_SH | LOCK_NB)) { - rte_errno = errno; - goto fail; - } + /* take out a non-exclusive lock, so that other processes could + * still attach to it, but no other process could reinitialize + * it. + */ + if (flock(fd, LOCK_SH | LOCK_NB)) { + rte_errno = errno; + goto fail; + } - if (resize_and_map(fd, data, mmap_len)) - goto fail; + if (resize_and_map(fd, data, mmap_len)) + goto fail; - /* we've mmap'ed the file, we can now close the fd */ - close(fd); + /* we've mmap'ed the file, we can now close the fd */ + close(fd); + } /* initialize the data */ memset(data, 0, mmap_len); @@ -675,8 +959,8 @@ rte_fbarray_is_used(struct rte_fbarray *arr, unsigned int idx) return ret; } -int __rte_experimental -rte_fbarray_find_next_free(struct rte_fbarray *arr, unsigned int start) +static int +fbarray_find(struct rte_fbarray *arr, unsigned int start, bool next, bool used) { int ret = -1; @@ -688,36 +972,106 @@ rte_fbarray_find_next_free(struct rte_fbarray *arr, unsigned int start) /* prevent array from changing under us */ rte_rwlock_read_lock(&arr->rwlock); - if (arr->len == arr->count) { - rte_errno = ENOSPC; - goto out; + /* cheap checks to prevent doing useless work */ + if (!used) { + if (arr->len == arr->count) { + rte_errno = ENOSPC; + goto out; + } + if (arr->count == 0) { + ret = start; + goto out; + } + } else { + if (arr->count == 0) { + rte_errno = ENOENT; + goto out; + } + if (arr->len == arr->count) { + ret = start; + goto out; + } } - - ret = find_next(arr, start, false); + if (next) + ret = find_next(arr, start, used); + else + ret = find_prev(arr, start, used); out: rte_rwlock_read_unlock(&arr->rwlock); return ret; } int __rte_experimental +rte_fbarray_find_next_free(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find(arr, start, true, false); +} + +int __rte_experimental rte_fbarray_find_next_used(struct rte_fbarray *arr, unsigned int start) { + return fbarray_find(arr, start, true, true); +} + +int __rte_experimental +rte_fbarray_find_prev_free(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find(arr, start, false, false); +} + +int __rte_experimental +rte_fbarray_find_prev_used(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find(arr, start, false, true); +} + +static int +fbarray_find_n(struct rte_fbarray *arr, unsigned int start, unsigned int n, + bool next, bool used) +{ int ret = -1; - if (arr == NULL || start >= arr->len) { + if (arr == NULL || start >= arr->len || n > arr->len || n == 0) { rte_errno = EINVAL; return -1; } + if (next && (arr->len - start) < n) { + rte_errno = used ? ENOENT : ENOSPC; + return -1; + } + if (!next && start < (n - 1)) { + rte_errno = used ? 
ENOENT : ENOSPC; + return -1; + } /* prevent array from changing under us */ rte_rwlock_read_lock(&arr->rwlock); - if (arr->count == 0) { - rte_errno = ENOENT; - goto out; + /* cheap checks to prevent doing useless work */ + if (!used) { + if (arr->len == arr->count || arr->len - arr->count < n) { + rte_errno = ENOSPC; + goto out; + } + if (arr->count == 0) { + ret = next ? start : start - n + 1; + goto out; + } + } else { + if (arr->count < n) { + rte_errno = ENOENT; + goto out; + } + if (arr->count == arr->len) { + ret = next ? start : start - n + 1; + goto out; + } } - ret = find_next(arr, start, true); + if (next) + ret = find_next_n(arr, start, n, used); + else + ret = find_prev_n(arr, start, n, used); out: rte_rwlock_read_unlock(&arr->rwlock); return ret; @@ -727,54 +1081,33 @@ int __rte_experimental rte_fbarray_find_next_n_free(struct rte_fbarray *arr, unsigned int start, unsigned int n) { - int ret = -1; - - if (arr == NULL || start >= arr->len || n > arr->len) { - rte_errno = EINVAL; - return -1; - } - - /* prevent array from changing under us */ - rte_rwlock_read_lock(&arr->rwlock); - - if (arr->len == arr->count || arr->len - arr->count < n) { - rte_errno = ENOSPC; - goto out; - } - - ret = find_next_n(arr, start, n, false); -out: - rte_rwlock_read_unlock(&arr->rwlock); - return ret; + return fbarray_find_n(arr, start, n, true, false); } int __rte_experimental rte_fbarray_find_next_n_used(struct rte_fbarray *arr, unsigned int start, unsigned int n) { - int ret = -1; - - if (arr == NULL || start >= arr->len || n > arr->len) { - rte_errno = EINVAL; - return -1; - } - - /* prevent array from changing under us */ - rte_rwlock_read_lock(&arr->rwlock); - - if (arr->count < n) { - rte_errno = ENOENT; - goto out; - } + return fbarray_find_n(arr, start, n, true, true); +} - ret = find_next_n(arr, start, n, true); -out: - rte_rwlock_read_unlock(&arr->rwlock); - return ret; +int __rte_experimental +rte_fbarray_find_prev_n_free(struct rte_fbarray *arr, unsigned int start, + unsigned int n) +{ + return fbarray_find_n(arr, start, n, false, false); } int __rte_experimental -rte_fbarray_find_contig_free(struct rte_fbarray *arr, unsigned int start) +rte_fbarray_find_prev_n_used(struct rte_fbarray *arr, unsigned int start, + unsigned int n) +{ + return fbarray_find_n(arr, start, n, false, true); +} + +static int +fbarray_find_contig(struct rte_fbarray *arr, unsigned int start, bool next, + bool used) { int ret = -1; @@ -786,39 +1119,66 @@ rte_fbarray_find_contig_free(struct rte_fbarray *arr, unsigned int start) /* prevent array from changing under us */ rte_rwlock_read_lock(&arr->rwlock); - if (arr->len == arr->count) { - rte_errno = ENOSPC; - goto out; - } - - if (arr->count == 0) { - ret = arr->len - start; - goto out; + /* cheap checks to prevent doing useless work */ + if (used) { + if (arr->count == 0) { + ret = 0; + goto out; + } + if (next && arr->count == arr->len) { + ret = arr->len - start; + goto out; + } + if (!next && arr->count == arr->len) { + ret = start + 1; + goto out; + } + } else { + if (arr->len == arr->count) { + ret = 0; + goto out; + } + if (next && arr->count == 0) { + ret = arr->len - start; + goto out; + } + if (!next && arr->count == 0) { + ret = start + 1; + goto out; + } } - ret = find_contig(arr, start, false); + if (next) + ret = find_contig(arr, start, used); + else + ret = find_rev_contig(arr, start, used); out: rte_rwlock_read_unlock(&arr->rwlock); return ret; } int __rte_experimental -rte_fbarray_find_contig_used(struct rte_fbarray *arr, unsigned int start) 
+rte_fbarray_find_contig_free(struct rte_fbarray *arr, unsigned int start) { - int ret = -1; - - if (arr == NULL || start >= arr->len) { - rte_errno = EINVAL; - return -1; - } + return fbarray_find_contig(arr, start, true, false); +} - /* prevent array from changing under us */ - rte_rwlock_read_lock(&arr->rwlock); +int __rte_experimental +rte_fbarray_find_contig_used(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find_contig(arr, start, true, true); +} - ret = find_contig(arr, start, true); +int __rte_experimental +rte_fbarray_find_rev_contig_free(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find_contig(arr, start, false, false); +} - rte_rwlock_read_unlock(&arr->rwlock); - return ret; +int __rte_experimental +rte_fbarray_find_rev_contig_used(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find_contig(arr, start, false, true); } int __rte_experimental diff --git a/lib/librte_eal/common/eal_common_log.c b/lib/librte_eal/common/eal_common_log.c index 81811894..c714a4bd 100644 --- a/lib/librte_eal/common/eal_common_log.c +++ b/lib/librte_eal/common/eal_common_log.c @@ -335,9 +335,7 @@ static const struct logtype logtype_strings[] = { }; /* Logging should be first initializer (before drivers and bus) */ -RTE_INIT_PRIO(rte_log_init, LOG); -static void -rte_log_init(void) +RTE_INIT_PRIO(rte_log_init, LOG) { uint32_t i; diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c index 4f0688f9..fbfb1b05 100644 --- a/lib/librte_eal/common/eal_common_memory.c +++ b/lib/librte_eal/common/eal_common_memory.c @@ -34,7 +34,7 @@ #define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i" -static uint64_t baseaddr_offset; +static void *next_baseaddr; static uint64_t system_page_sz; void * @@ -56,21 +56,27 @@ eal_get_virtual_area(void *requested_addr, size_t *size, allow_shrink = (flags & EAL_VIRTUAL_AREA_ALLOW_SHRINK) > 0; unmap = (flags & EAL_VIRTUAL_AREA_UNMAP) > 0; - if (requested_addr == NULL && internal_config.base_virtaddr != 0) { - requested_addr = (void *) (internal_config.base_virtaddr + - (size_t)baseaddr_offset); + if (next_baseaddr == NULL && internal_config.base_virtaddr != 0 && + rte_eal_process_type() == RTE_PROC_PRIMARY) + next_baseaddr = (void *) internal_config.base_virtaddr; + + if (requested_addr == NULL && next_baseaddr != NULL) { + requested_addr = next_baseaddr; requested_addr = RTE_PTR_ALIGN(requested_addr, page_sz); addr_is_hint = true; } - /* if requested address is not aligned by page size, or if requested - * address is NULL, add page size to requested length as we may get an - * address that's aligned by system page size, which can be smaller than - * our requested page size. additionally, we shouldn't try to align if - * system page size is the same as requested page size. + /* we don't need alignment of resulting pointer in the following cases: + * + * 1. page size is equal to system size + * 2. we have a requested address, and it is page-aligned, and we will + * be discarding the address if we get a different one. + * + * for all other cases, alignment is potentially necessary. */ no_align = (requested_addr != NULL && - ((uintptr_t)requested_addr & (page_sz - 1)) == 0) || + requested_addr == RTE_PTR_ALIGN(requested_addr, page_sz) && + !addr_is_hint) || page_sz == system_page_sz; do { @@ -116,6 +122,8 @@ eal_get_virtual_area(void *requested_addr, size_t *size, RTE_LOG(WARNING, EAL, "WARNING! 
Base virtual address hint (%p != %p) not respected!\n", requested_addr, aligned_addr); RTE_LOG(WARNING, EAL, " This may cause issues with mapping memory into secondary processes\n"); + } else if (next_baseaddr != NULL) { + next_baseaddr = RTE_PTR_ADD(aligned_addr, *size); } RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n", @@ -148,387 +156,9 @@ eal_get_virtual_area(void *requested_addr, size_t *size, munmap(aligned_end, after_len); } - baseaddr_offset += *size; - return aligned_addr; } -static uint64_t -get_mem_amount(uint64_t page_sz, uint64_t max_mem) -{ - uint64_t area_sz, max_pages; - - /* limit to RTE_MAX_MEMSEG_PER_LIST pages or RTE_MAX_MEM_MB_PER_LIST */ - max_pages = RTE_MAX_MEMSEG_PER_LIST; - max_mem = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20, max_mem); - - area_sz = RTE_MIN(page_sz * max_pages, max_mem); - - /* make sure the list isn't smaller than the page size */ - area_sz = RTE_MAX(area_sz, page_sz); - - return RTE_ALIGN(area_sz, page_sz); -} - -static int -free_memseg_list(struct rte_memseg_list *msl) -{ - if (rte_fbarray_destroy(&msl->memseg_arr)) { - RTE_LOG(ERR, EAL, "Cannot destroy memseg list\n"); - return -1; - } - memset(msl, 0, sizeof(*msl)); - return 0; -} - -static int -alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz, - uint64_t max_mem, int socket_id, int type_msl_idx) -{ - char name[RTE_FBARRAY_NAME_LEN]; - uint64_t mem_amount; - int max_segs; - - mem_amount = get_mem_amount(page_sz, max_mem); - max_segs = mem_amount / page_sz; - - snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id, - type_msl_idx); - if (rte_fbarray_init(&msl->memseg_arr, name, max_segs, - sizeof(struct rte_memseg))) { - RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n", - rte_strerror(rte_errno)); - return -1; - } - - msl->page_sz = page_sz; - msl->socket_id = socket_id; - msl->base_va = NULL; - - RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n", - (size_t)page_sz >> 10, socket_id); - - return 0; -} - -static int -alloc_va_space(struct rte_memseg_list *msl) -{ - uint64_t page_sz; - size_t mem_sz; - void *addr; - int flags = 0; - -#ifdef RTE_ARCH_PPC_64 - flags |= MAP_HUGETLB; -#endif - - page_sz = msl->page_sz; - mem_sz = page_sz * msl->memseg_arr.len; - - addr = eal_get_virtual_area(msl->base_va, &mem_sz, page_sz, 0, flags); - if (addr == NULL) { - if (rte_errno == EADDRNOTAVAIL) - RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - please use '--base-virtaddr' option\n", - (unsigned long long)mem_sz, msl->base_va); - else - RTE_LOG(ERR, EAL, "Cannot reserve memory\n"); - return -1; - } - msl->base_va = addr; - - return 0; -} - -static int __rte_unused -memseg_primary_init_32(void) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - int active_sockets, hpi_idx, msl_idx = 0; - unsigned int socket_id, i; - struct rte_memseg_list *msl; - uint64_t extra_mem_per_socket, total_extra_mem, total_requested_mem; - uint64_t max_mem; - - /* no-huge does not need this at all */ - if (internal_config.no_hugetlbfs) - return 0; - - /* this is a giant hack, but desperate times call for desperate - * measures. in legacy 32-bit mode, we cannot preallocate VA space, - * because having upwards of 2 gigabytes of VA space already mapped will - * interfere with our ability to map and sort hugepages. - * - * therefore, in legacy 32-bit mode, we will be initializing memseg - * lists much later - in eal_memory.c, right after we unmap all the - * unneeded pages. 
this will not affect secondary processes, as those - * should be able to mmap the space without (too many) problems. - */ - if (internal_config.legacy_mem) - return 0; - - /* 32-bit mode is a very special case. we cannot know in advance where - * the user will want to allocate their memory, so we have to do some - * heuristics. - */ - active_sockets = 0; - total_requested_mem = 0; - if (internal_config.force_sockets) - for (i = 0; i < rte_socket_count(); i++) { - uint64_t mem; - - socket_id = rte_socket_id_by_idx(i); - mem = internal_config.socket_mem[socket_id]; - - if (mem == 0) - continue; - - active_sockets++; - total_requested_mem += mem; - } - else - total_requested_mem = internal_config.memory; - - max_mem = (uint64_t)RTE_MAX_MEM_MB << 20; - if (total_requested_mem > max_mem) { - RTE_LOG(ERR, EAL, "Invalid parameters: 32-bit process can at most use %uM of memory\n", - (unsigned int)(max_mem >> 20)); - return -1; - } - total_extra_mem = max_mem - total_requested_mem; - extra_mem_per_socket = active_sockets == 0 ? total_extra_mem : - total_extra_mem / active_sockets; - - /* the allocation logic is a little bit convoluted, but here's how it - * works, in a nutshell: - * - if user hasn't specified on which sockets to allocate memory via - * --socket-mem, we allocate all of our memory on master core socket. - * - if user has specified sockets to allocate memory on, there may be - * some "unused" memory left (e.g. if user has specified --socket-mem - * such that not all memory adds up to 2 gigabytes), so add it to all - * sockets that are in use equally. - * - * page sizes are sorted by size in descending order, so we can safely - * assume that we dispense with bigger page sizes first. - */ - - /* create memseg lists */ - for (i = 0; i < rte_socket_count(); i++) { - int hp_sizes = (int) internal_config.num_hugepage_sizes; - uint64_t max_socket_mem, cur_socket_mem; - unsigned int master_lcore_socket; - struct rte_config *cfg = rte_eal_get_configuration(); - bool skip; - - socket_id = rte_socket_id_by_idx(i); - -#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES - if (socket_id > 0) - break; -#endif - - /* if we didn't specifically request memory on this socket */ - skip = active_sockets != 0 && - internal_config.socket_mem[socket_id] == 0; - /* ...or if we didn't specifically request memory on *any* - * socket, and this is not master lcore - */ - master_lcore_socket = rte_lcore_to_socket_id(cfg->master_lcore); - skip |= active_sockets == 0 && socket_id != master_lcore_socket; - - if (skip) { - RTE_LOG(DEBUG, EAL, "Will not preallocate memory on socket %u\n", - socket_id); - continue; - } - - /* max amount of memory on this socket */ - max_socket_mem = (active_sockets != 0 ? 
- internal_config.socket_mem[socket_id] : - internal_config.memory) + - extra_mem_per_socket; - cur_socket_mem = 0; - - for (hpi_idx = 0; hpi_idx < hp_sizes; hpi_idx++) { - uint64_t max_pagesz_mem, cur_pagesz_mem = 0; - uint64_t hugepage_sz; - struct hugepage_info *hpi; - int type_msl_idx, max_segs, total_segs = 0; - - hpi = &internal_config.hugepage_info[hpi_idx]; - hugepage_sz = hpi->hugepage_sz; - - /* check if pages are actually available */ - if (hpi->num_pages[socket_id] == 0) - continue; - - max_segs = RTE_MAX_MEMSEG_PER_TYPE; - max_pagesz_mem = max_socket_mem - cur_socket_mem; - - /* make it multiple of page size */ - max_pagesz_mem = RTE_ALIGN_FLOOR(max_pagesz_mem, - hugepage_sz); - - RTE_LOG(DEBUG, EAL, "Attempting to preallocate " - "%" PRIu64 "M on socket %i\n", - max_pagesz_mem >> 20, socket_id); - - type_msl_idx = 0; - while (cur_pagesz_mem < max_pagesz_mem && - total_segs < max_segs) { - if (msl_idx >= RTE_MAX_MEMSEG_LISTS) { - RTE_LOG(ERR, EAL, - "No more space in memseg lists, please increase %s\n", - RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); - return -1; - } - - msl = &mcfg->memsegs[msl_idx]; - - if (alloc_memseg_list(msl, hugepage_sz, - max_pagesz_mem, socket_id, - type_msl_idx)) { - /* failing to allocate a memseg list is - * a serious error. - */ - RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n"); - return -1; - } - - if (alloc_va_space(msl)) { - /* if we couldn't allocate VA space, we - * can try with smaller page sizes. - */ - RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list, retrying with different page size\n"); - /* deallocate memseg list */ - if (free_memseg_list(msl)) - return -1; - break; - } - - total_segs += msl->memseg_arr.len; - cur_pagesz_mem = total_segs * hugepage_sz; - type_msl_idx++; - msl_idx++; - } - cur_socket_mem += cur_pagesz_mem; - } - if (cur_socket_mem == 0) { - RTE_LOG(ERR, EAL, "Cannot allocate VA space on socket %u\n", - socket_id); - return -1; - } - } - - return 0; -} - -static int __rte_unused -memseg_primary_init(void) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - int i, socket_id, hpi_idx, msl_idx = 0; - struct rte_memseg_list *msl; - uint64_t max_mem, total_mem; - - /* no-huge does not need this at all */ - if (internal_config.no_hugetlbfs) - return 0; - - max_mem = (uint64_t)RTE_MAX_MEM_MB << 20; - total_mem = 0; - - /* create memseg lists */ - for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes; - hpi_idx++) { - struct hugepage_info *hpi; - uint64_t hugepage_sz; - - hpi = &internal_config.hugepage_info[hpi_idx]; - hugepage_sz = hpi->hugepage_sz; - - for (i = 0; i < (int) rte_socket_count(); i++) { - uint64_t max_type_mem, total_type_mem = 0; - int type_msl_idx, max_segs, total_segs = 0; - - socket_id = rte_socket_id_by_idx(i); - -#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES - if (socket_id > 0) - break; -#endif - - if (total_mem >= max_mem) - break; - - max_type_mem = RTE_MIN(max_mem - total_mem, - (uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20); - max_segs = RTE_MAX_MEMSEG_PER_TYPE; - - type_msl_idx = 0; - while (total_type_mem < max_type_mem && - total_segs < max_segs) { - uint64_t cur_max_mem; - if (msl_idx >= RTE_MAX_MEMSEG_LISTS) { - RTE_LOG(ERR, EAL, - "No more space in memseg lists, please increase %s\n", - RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); - return -1; - } - - msl = &mcfg->memsegs[msl_idx++]; - - cur_max_mem = max_type_mem - total_type_mem; - if (alloc_memseg_list(msl, hugepage_sz, - cur_max_mem, socket_id, - type_msl_idx)) - return -1; - - total_segs += msl->memseg_arr.len; - 
total_type_mem = total_segs * hugepage_sz; - type_msl_idx++; - - if (alloc_va_space(msl)) { - RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n"); - return -1; - } - } - total_mem += total_type_mem; - } - } - return 0; -} - -static int -memseg_secondary_init(void) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - int msl_idx = 0; - struct rte_memseg_list *msl; - - for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) { - - msl = &mcfg->memsegs[msl_idx]; - - /* skip empty memseg lists */ - if (msl->memseg_arr.len == 0) - continue; - - if (rte_fbarray_attach(&msl->memseg_arr)) { - RTE_LOG(ERR, EAL, "Cannot attach to primary process memseg lists\n"); - return -1; - } - - /* preallocate VA space */ - if (alloc_va_space(msl)) { - RTE_LOG(ERR, EAL, "Cannot preallocate VA space for hugepage memory\n"); - return -1; - } - } - - return 0; -} - static struct rte_memseg * virt2memseg(const void *addr, const struct rte_memseg_list *msl) { @@ -536,6 +166,9 @@ virt2memseg(const void *addr, const struct rte_memseg_list *msl) void *start, *end; int ms_idx; + if (msl == NULL) + return NULL; + /* a memseg list was specified, check if it's the right one */ start = msl->base_va; end = RTE_PTR_ADD(start, (size_t)msl->page_sz * msl->memseg_arr.len); @@ -788,14 +421,11 @@ rte_mem_lock_page(const void *virt) } int __rte_experimental -rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg) +rte_memseg_contig_walk_thread_unsafe(rte_memseg_contig_walk_t func, void *arg) { struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; int i, ms_idx, ret = 0; - /* do not allow allocations/frees/init while we iterate */ - rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); - for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { struct rte_memseg_list *msl = &mcfg->memsegs[i]; const struct rte_memseg *ms; @@ -820,30 +450,34 @@ rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg) len = n_segs * msl->page_sz; ret = func(msl, ms, len, arg); - if (ret < 0) { - ret = -1; - goto out; - } else if (ret > 0) { - ret = 1; - goto out; - } + if (ret) + return ret; ms_idx = rte_fbarray_find_next_used(arr, ms_idx + n_segs); } } -out: - rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); - return ret; + return 0; } int __rte_experimental -rte_memseg_walk(rte_memseg_walk_t func, void *arg) +rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg) { struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - int i, ms_idx, ret = 0; + int ret = 0; /* do not allow allocations/frees/init while we iterate */ rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + ret = rte_memseg_contig_walk_thread_unsafe(func, arg); + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + + return ret; +} + +int __rte_experimental +rte_memseg_walk_thread_unsafe(rte_memseg_walk_t func, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int i, ms_idx, ret = 0; for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { struct rte_memseg_list *msl = &mcfg->memsegs[i]; @@ -859,29 +493,33 @@ rte_memseg_walk(rte_memseg_walk_t func, void *arg) while (ms_idx >= 0) { ms = rte_fbarray_get(arr, ms_idx); ret = func(msl, ms, arg); - if (ret < 0) { - ret = -1; - goto out; - } else if (ret > 0) { - ret = 1; - goto out; - } + if (ret) + return ret; ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1); } } -out: - rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); - return ret; + return 0; } int __rte_experimental -rte_memseg_list_walk(rte_memseg_list_walk_t func, void 
*arg) +rte_memseg_walk(rte_memseg_walk_t func, void *arg) { struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - int i, ret = 0; + int ret = 0; /* do not allow allocations/frees/init while we iterate */ rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + ret = rte_memseg_walk_thread_unsafe(func, arg); + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + + return ret; +} + +int __rte_experimental +rte_memseg_list_walk_thread_unsafe(rte_memseg_list_walk_t func, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int i, ret = 0; for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { struct rte_memseg_list *msl = &mcfg->memsegs[i]; @@ -890,17 +528,23 @@ rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg) continue; ret = func(msl, arg); - if (ret < 0) { - ret = -1; - goto out; - } - if (ret > 0) { - ret = 1; - goto out; - } + if (ret) + return ret; } -out: + return 0; +} + +int __rte_experimental +rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int ret = 0; + + /* do not allow allocations/frees/init while we iterate */ + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + ret = rte_memseg_list_walk_thread_unsafe(func, arg); rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + return ret; } @@ -918,15 +562,7 @@ rte_eal_memory_init(void) /* lock mem hotplug here, to prevent races while we init */ rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); - retval = rte_eal_process_type() == RTE_PROC_PRIMARY ? -#ifndef RTE_ARCH_64 - memseg_primary_init_32() : -#else - memseg_primary_init() : -#endif - memseg_secondary_init(); - - if (retval < 0) + if (rte_eal_memseg_init() < 0) goto fail; if (eal_memalloc_init() < 0) diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c index faa3b061..7300fe05 100644 --- a/lib/librte_eal/common/eal_common_memzone.c +++ b/lib/librte_eal/common/eal_common_memzone.c @@ -52,38 +52,6 @@ memzone_lookup_thread_unsafe(const char *name) return NULL; } - -/* This function will return the greatest free block if a heap has been - * specified. 
If no heap has been specified, it will return the heap and - * length of the greatest free block available in all heaps */ -static size_t -find_heap_max_free_elem(int *s, unsigned align) -{ - struct rte_mem_config *mcfg; - struct rte_malloc_socket_stats stats; - int i, socket = *s; - size_t len = 0; - - /* get pointer to global configuration */ - mcfg = rte_eal_get_configuration()->mem_config; - - for (i = 0; i < RTE_MAX_NUMA_NODES; i++) { - if ((socket != SOCKET_ID_ANY) && (socket != i)) - continue; - - malloc_heap_get_stats(&mcfg->malloc_heaps[i], &stats); - if (stats.greatest_free_size > len) { - len = stats.greatest_free_size; - *s = i; - } - } - - if (len < MALLOC_ELEM_OVERHEAD + align) - return 0; - - return len - MALLOC_ELEM_OVERHEAD - align; -} - static const struct rte_memzone * memzone_reserve_aligned_thread_unsafe(const char *name, size_t len, int socket_id, unsigned int flags, unsigned int align, @@ -92,6 +60,7 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len, struct rte_memzone *mz; struct rte_mem_config *mcfg; struct rte_fbarray *arr; + void *mz_addr; size_t requested_len; int mz_idx; bool contig; @@ -140,8 +109,7 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len, return NULL; } - len += RTE_CACHE_LINE_MASK; - len &= ~((size_t) RTE_CACHE_LINE_MASK); + len = RTE_ALIGN_CEIL(len, RTE_CACHE_LINE_SIZE); /* save minimal requested length */ requested_len = RTE_MAX((size_t)RTE_CACHE_LINE_SIZE, len); @@ -165,27 +133,18 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len, /* malloc only cares about size flags, remove contig flag from flags */ flags &= ~RTE_MEMZONE_IOVA_CONTIG; - if (len == 0) { - /* len == 0 is only allowed for non-contiguous zones */ - if (contig) { - RTE_LOG(DEBUG, EAL, "Reserving zero-length contiguous memzones is not supported\n"); - rte_errno = EINVAL; - return NULL; - } - if (bound != 0) + if (len == 0 && bound == 0) { + /* no size constraints were placed, so use malloc elem len */ + requested_len = 0; + mz_addr = malloc_heap_alloc_biggest(NULL, socket_id, flags, + align, contig); + } else { + if (len == 0) requested_len = bound; - else { - requested_len = find_heap_max_free_elem(&socket_id, align); - if (requested_len == 0) { - rte_errno = ENOMEM; - return NULL; - } - } + /* allocate memory on heap */ + mz_addr = malloc_heap_alloc(NULL, requested_len, socket_id, + flags, align, bound, contig); } - - /* allocate memory on heap */ - void *mz_addr = malloc_heap_alloc(NULL, requested_len, socket_id, flags, - align, bound, contig); if (mz_addr == NULL) { rte_errno = ENOMEM; return NULL; @@ -213,8 +172,9 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len, snprintf(mz->name, sizeof(mz->name), "%s", name); mz->iova = rte_malloc_virt2iova(mz_addr); mz->addr = mz_addr; - mz->len = (requested_len == 0 ? - (elem->size - MALLOC_ELEM_OVERHEAD) : requested_len); + mz->len = requested_len == 0 ? 
+ elem->size - elem->pad - MALLOC_ELEM_OVERHEAD : + requested_len; mz->hugepage_sz = elem->msl->page_sz; mz->socket_id = elem->msl->socket_id; mz->flags = 0; diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c index ecebb292..dd5f9740 100644 --- a/lib/librte_eal/common/eal_common_options.c +++ b/lib/librte_eal/common/eal_common_options.c @@ -66,10 +66,12 @@ eal_long_options[] = { {OPT_NO_HUGE, 0, NULL, OPT_NO_HUGE_NUM }, {OPT_NO_PCI, 0, NULL, OPT_NO_PCI_NUM }, {OPT_NO_SHCONF, 0, NULL, OPT_NO_SHCONF_NUM }, + {OPT_IN_MEMORY, 0, NULL, OPT_IN_MEMORY_NUM }, {OPT_PCI_BLACKLIST, 1, NULL, OPT_PCI_BLACKLIST_NUM }, {OPT_PCI_WHITELIST, 1, NULL, OPT_PCI_WHITELIST_NUM }, {OPT_PROC_TYPE, 1, NULL, OPT_PROC_TYPE_NUM }, {OPT_SOCKET_MEM, 1, NULL, OPT_SOCKET_MEM_NUM }, + {OPT_SOCKET_LIMIT, 1, NULL, OPT_SOCKET_LIMIT_NUM }, {OPT_SYSLOG, 1, NULL, OPT_SYSLOG_NUM }, {OPT_VDEV, 1, NULL, OPT_VDEV_NUM }, {OPT_VFIO_INTR, 1, NULL, OPT_VFIO_INTR_NUM }, @@ -179,6 +181,10 @@ eal_reset_internal_config(struct internal_config *internal_cfg) /* zero out the NUMA config */ for (i = 0; i < RTE_MAX_NUMA_NODES; i++) internal_cfg->socket_mem[i] = 0; + internal_cfg->force_socket_limits = 0; + /* zero out the NUMA limits config */ + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) + internal_cfg->socket_limit[i] = 0; /* zero out hugedir descriptors */ for (i = 0; i < MAX_HUGEPAGE_SIZES; i++) { memset(&internal_cfg->hugepage_info[i], 0, @@ -315,6 +321,7 @@ eal_parse_service_coremask(const char *coremask) unsigned int count = 0; char c; int val; + uint32_t taken_lcore_count = 0; if (coremask == NULL) return -1; @@ -348,7 +355,7 @@ eal_parse_service_coremask(const char *coremask) if (master_lcore_parsed && cfg->master_lcore == lcore) { RTE_LOG(ERR, EAL, - "Error: lcore %u is master lcore, cannot use as service core\n", + "lcore %u is master lcore, cannot use as service core\n", idx); return -1; } @@ -358,6 +365,10 @@ eal_parse_service_coremask(const char *coremask) "lcore %u unavailable\n", idx); return -1; } + + if (cfg->lcore_role[idx] == ROLE_RTE) + taken_lcore_count++; + lcore_config[idx].core_role = ROLE_SERVICE; count++; } @@ -374,11 +385,28 @@ eal_parse_service_coremask(const char *coremask) if (count == 0) return -1; + if (core_parsed && taken_lcore_count != count) { + RTE_LOG(WARNING, EAL, + "Not all service cores are in the coremask. " + "Please ensure -c or -l includes service cores\n"); + } + cfg->service_lcore_count = count; return 0; } static int +eal_service_cores_parsed(void) +{ + int idx; + for (idx = 0; idx < RTE_MAX_LCORE; idx++) { + if (lcore_config[idx].core_role == ROLE_SERVICE) + return 1; + } + return 0; +} + +static int eal_parse_coremask(const char *coremask) { struct rte_config *cfg = rte_eal_get_configuration(); @@ -387,6 +415,11 @@ eal_parse_coremask(const char *coremask) char c; int val; + if (eal_service_cores_parsed()) + RTE_LOG(WARNING, EAL, + "Service cores parsed before dataplane cores. " + "Please ensure -c is before -s or -S\n"); + if (coremask == NULL) return -1; /* Remove all blank characters ahead and after . 
@@ -418,6 +451,7 @@ eal_parse_coremask(const char *coremask)
 					"unavailable\n", idx);
 				return -1;
 			}
+			cfg->lcore_role[idx] = ROLE_RTE;
 			lcore_config[idx].core_index = count;
 			count++;
@@ -449,6 +483,7 @@ eal_parse_service_corelist(const char *corelist)
 	unsigned count = 0;
 	char *end = NULL;
 	int min, max;
+	uint32_t taken_lcore_count = 0;
 
 	if (corelist == NULL)
 		return -1;
@@ -490,6 +525,9 @@ eal_parse_service_corelist(const char *corelist)
 					idx);
 				return -1;
 			}
+			if (cfg->lcore_role[idx] == ROLE_RTE)
+				taken_lcore_count++;
+
 			lcore_config[idx].core_role = ROLE_SERVICE;
 			count++;
@@ -504,6 +542,12 @@ eal_parse_service_corelist(const char *corelist)
 	if (count == 0)
 		return -1;
 
+	if (core_parsed && taken_lcore_count != count) {
+		RTE_LOG(WARNING, EAL,
+			"Not all service cores were in the coremask. "
+			"Please ensure -c or -l includes service cores\n");
+	}
+
 	return 0;
 }
 
@@ -516,6 +560,11 @@ eal_parse_corelist(const char *corelist)
 	char *end = NULL;
 	int min, max;
 
+	if (eal_service_cores_parsed())
+		RTE_LOG(WARNING, EAL,
+			"Service cores parsed before dataplane cores. "
+			"Please ensure -l is before -s or -S\n");
+
 	if (corelist == NULL)
 		return -1;
@@ -590,7 +639,8 @@ eal_parse_master_lcore(const char *arg)
 
 	/* ensure master core is not used as service core */
 	if (lcore_config[cfg->master_lcore].core_role == ROLE_SERVICE) {
-		RTE_LOG(ERR, EAL, "Error: Master lcore is used as a service core.\n");
+		RTE_LOG(ERR, EAL,
+			"Error: Master lcore is used as a service core\n");
 		return -1;
 	}
 
@@ -1165,6 +1215,13 @@ eal_parse_common_option(int opt, const char *optarg,
 		conf->no_shconf = 1;
 		break;
 
+	case OPT_IN_MEMORY_NUM:
+		conf->in_memory = 1;
+		/* in-memory is a superset of noshconf and huge-unlink */
+		conf->no_shconf = 1;
+		conf->hugepage_unlink = 1;
+		break;
+
 	case OPT_PROC_TYPE_NUM:
 		conf->process_type = eal_parse_proc_type(optarg);
 		break;
@@ -1316,12 +1373,23 @@ eal_check_common_options(struct internal_config *internal_cfg)
 			"be specified together with --"OPT_NO_HUGE"\n");
 		return -1;
 	}
-
-	if (internal_cfg->no_hugetlbfs && internal_cfg->hugepage_unlink) {
+	if (internal_cfg->no_hugetlbfs && internal_cfg->hugepage_unlink &&
+			!internal_cfg->in_memory) {
 		RTE_LOG(ERR, EAL, "Option --"OPT_HUGE_UNLINK" cannot "
 			"be specified together with --"OPT_NO_HUGE"\n");
 		return -1;
 	}
+	if (internal_config.force_socket_limits && internal_config.legacy_mem) {
+		RTE_LOG(ERR, EAL, "Option --"OPT_SOCKET_LIMIT
+			" is only supported in non-legacy memory mode\n");
+	}
+	if (internal_cfg->single_file_segments &&
+			internal_cfg->hugepage_unlink) {
+		RTE_LOG(ERR, EAL, "Option --"OPT_SINGLE_FILE_SEGMENTS" is "
+			"compatible with neither --"OPT_IN_MEMORY" nor "
+			"--"OPT_HUGE_UNLINK"\n");
+		return -1;
+	}
 
 	return 0;
 }
@@ -1370,6 +1438,8 @@ eal_common_usage(void)
 	       "  Set specific log level\n"
 	       "  -v                  Display version information on startup\n"
 	       "  -h, --help          This help\n"
+	       "  --"OPT_IN_MEMORY"   Operate entirely in memory. 
This will\n"
+	       "                      disable secondary process support\n"
 	       "\nEAL options for DEBUG use only:\n"
 	       "  --"OPT_HUGE_UNLINK"       Unlink hugepage files after init\n"
 	       "  --"OPT_NO_HUGE"           Use malloc instead of hugetlbfs\n"
diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index 707d8ab3..9fcb9121 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -20,6 +20,7 @@
 #include <sys/un.h>
 #include <unistd.h>
 
+#include <rte_alarm.h>
 #include <rte_common.h>
 #include <rte_cycles.h>
 #include <rte_eal.h>
@@ -94,11 +95,9 @@ TAILQ_HEAD(pending_request_list, pending_request);
 static struct {
 	struct pending_request_list requests;
 	pthread_mutex_t lock;
-	pthread_cond_t async_cond;
 } pending_requests = {
 	.requests = TAILQ_HEAD_INITIALIZER(pending_requests.requests),
 	.lock = PTHREAD_MUTEX_INITIALIZER,
-	.async_cond = PTHREAD_COND_INITIALIZER
 	/**< used in async requests only */
 };
 
@@ -106,6 +105,16 @@ static struct {
 static int
 mp_send(struct rte_mp_msg *msg, const char *peer, int type);
 
+/* for use with alarm callback */
+static void
+async_reply_handle(void *arg);
+
+/* for use with process_msg */
+static struct pending_request *
+async_reply_handle_thread_unsafe(void *arg);
+
+static void
+trigger_async_action(struct pending_request *req);
 
 static struct pending_request *
 find_pending_request(const char *dst, const char *act_name)
@@ -290,6 +299,8 @@ process_msg(struct mp_msg_internal *m, struct sockaddr_un *s)
 	RTE_LOG(DEBUG, EAL, "msg: %s\n", msg->name);
 
 	if (m->type == MP_REP || m->type == MP_IGN) {
+		struct pending_request *req = NULL;
+
 		pthread_mutex_lock(&pending_requests.lock);
 		pending_req = find_pending_request(s->sun_path, msg->name);
 		if (pending_req) {
@@ -301,11 +312,14 @@ process_msg(struct mp_msg_internal *m, struct sockaddr_un *s)
 			if (pending_req->type == REQUEST_TYPE_SYNC)
 				pthread_cond_signal(&pending_req->sync.cond);
 			else if (pending_req->type == REQUEST_TYPE_ASYNC)
-				pthread_cond_signal(
-					&pending_requests.async_cond);
+				req = async_reply_handle_thread_unsafe(
+						pending_req);
 		} else
 			RTE_LOG(ERR, EAL, "Drop mp reply: %s\n", msg->name);
 		pthread_mutex_unlock(&pending_requests.lock);
+
+		if (req != NULL)
+			trigger_async_action(req);
 		return;
 	}
 
@@ -365,7 +379,6 @@ timespec_cmp(const struct timespec *a, const struct timespec *b)
 }
 
 enum async_action {
-	ACTION_NONE, /**< don't do anything */
 	ACTION_FREE, /**< free the action entry, but don't trigger callback */
 	ACTION_TRIGGER /**< trigger callback, then free action entry */
 };
@@ -375,7 +388,7 @@ process_async_request(struct pending_request *sr, const struct timespec *now)
 {
 	struct async_request_param *param;
 	struct rte_mp_reply *reply;
-	bool timeout, received, last_msg;
+	bool timeout, last_msg;
 
 	param = sr->async.param;
 	reply = &param->user_reply;
@@ -383,13 +396,6 @@ process_async_request(struct pending_request *sr, const struct timespec *now)
 	/* did we timeout? */
 	timeout = timespec_cmp(&param->end, now) <= 0;
 
-	/* did we receive a response? */
-	received = sr->reply_received != 0;
-
-	/* if we didn't time out, and we didn't receive a response, ignore */
-	if (!timeout && !received)
-		return ACTION_NONE;
-
 	/* if we received a response, adjust relevant data and copy message. 
*/ if (sr->reply_received == 1 && sr->reply) { struct rte_mp_msg *msg, *user_msgs, *tmp; @@ -448,118 +454,58 @@ trigger_async_action(struct pending_request *sr) free(sr->async.param->user_reply.msgs); free(sr->async.param); free(sr->request); + free(sr); } static struct pending_request * -check_trigger(struct timespec *ts) +async_reply_handle_thread_unsafe(void *arg) { - struct pending_request *next, *cur, *trigger = NULL; - - TAILQ_FOREACH_SAFE(cur, &pending_requests.requests, next, next) { - enum async_action action; - if (cur->type != REQUEST_TYPE_ASYNC) - continue; + struct pending_request *req = (struct pending_request *)arg; + enum async_action action; + struct timespec ts_now; + struct timeval now; - action = process_async_request(cur, ts); - if (action == ACTION_FREE) { - TAILQ_REMOVE(&pending_requests.requests, cur, next); - free(cur); - } else if (action == ACTION_TRIGGER) { - TAILQ_REMOVE(&pending_requests.requests, cur, next); - trigger = cur; - break; - } + if (gettimeofday(&now, NULL) < 0) { + RTE_LOG(ERR, EAL, "Cannot get current time\n"); + goto no_trigger; } - return trigger; -} + ts_now.tv_nsec = now.tv_usec * 1000; + ts_now.tv_sec = now.tv_sec; -static void -wait_for_async_messages(void) -{ - struct pending_request *sr; - struct timespec timeout; - bool timedwait = false; - bool nowait = false; - int ret; + action = process_async_request(req, &ts_now); - /* scan through the list and see if there are any timeouts that - * are earlier than our current timeout. - */ - TAILQ_FOREACH(sr, &pending_requests.requests, next) { - if (sr->type != REQUEST_TYPE_ASYNC) - continue; - if (!timedwait || timespec_cmp(&sr->async.param->end, - &timeout) < 0) { - memcpy(&timeout, &sr->async.param->end, - sizeof(timeout)); - timedwait = true; - } + TAILQ_REMOVE(&pending_requests.requests, req, next); - /* sometimes, we don't even wait */ - if (sr->reply_received) { - nowait = true; - break; + if (rte_eal_alarm_cancel(async_reply_handle, req) < 0) { + /* if we failed to cancel the alarm because it's already in + * progress, don't proceed because otherwise we will end up + * handling the same message twice. + */ + if (rte_errno == EINPROGRESS) { + RTE_LOG(DEBUG, EAL, "Request handling is already in progress\n"); + goto no_trigger; } + RTE_LOG(ERR, EAL, "Failed to cancel alarm\n"); } - if (nowait) - return; - - do { - ret = timedwait ? 
- pthread_cond_timedwait( - &pending_requests.async_cond, - &pending_requests.lock, - &timeout) : - pthread_cond_wait( - &pending_requests.async_cond, - &pending_requests.lock); - } while (ret != 0 && ret != ETIMEDOUT); - - /* we've been woken up or timed out */ + if (action == ACTION_TRIGGER) + return req; +no_trigger: + free(req); + return NULL; } -static void * -async_reply_handle(void *arg __rte_unused) +static void +async_reply_handle(void *arg) { - struct timeval now; - struct timespec ts_now; - while (1) { - struct pending_request *trigger = NULL; - - pthread_mutex_lock(&pending_requests.lock); + struct pending_request *req; - /* we exit this function holding the lock */ - wait_for_async_messages(); - - if (gettimeofday(&now, NULL) < 0) { - pthread_mutex_unlock(&pending_requests.lock); - RTE_LOG(ERR, EAL, "Cannot get current time\n"); - break; - } - ts_now.tv_nsec = now.tv_usec * 1000; - ts_now.tv_sec = now.tv_sec; - - do { - trigger = check_trigger(&ts_now); - /* unlock request list */ - pthread_mutex_unlock(&pending_requests.lock); - - if (trigger) { - trigger_async_action(trigger); - free(trigger); - - /* we've triggered a callback, but there may be - * more, so lock the list and check again. - */ - pthread_mutex_lock(&pending_requests.lock); - } - } while (trigger); - } - - RTE_LOG(ERR, EAL, "ERROR: asynchronous requests disabled\n"); + pthread_mutex_lock(&pending_requests.lock); + req = async_reply_handle_thread_unsafe(arg); + pthread_mutex_unlock(&pending_requests.lock); - return NULL; + if (req != NULL) + trigger_async_action(req); } static int @@ -624,7 +570,15 @@ rte_mp_channel_init(void) { char path[PATH_MAX]; int dir_fd; - pthread_t mp_handle_tid, async_reply_handle_tid; + pthread_t mp_handle_tid; + + /* in no shared files mode, we do not have secondary processes support, + * so no need to initialize IPC. 
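+ *
+ * (Editor's illustrative note: in this mode the rte_mp_* calls below
+ * still return success, they simply do nothing. A hypothetical caller
+ * should therefore treat an empty reply as normal:
+ *
+ *   struct rte_mp_reply reply;
+ *   if (rte_mp_request_sync(&msg, &reply, &ts) == 0 &&
+ *                   reply.nb_received == 0)
+ *           ; // IPC disabled, or simply no peers answered
+ *
+ * and, when replies are received, freeing reply.msgs remains the
+ * caller's responsibility.)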
+	 */
+	if (internal_config.no_shconf) {
+		RTE_LOG(DEBUG, EAL, "No shared files mode enabled, IPC will be disabled\n");
+		return 0;
+	}
 
 	/* create filter path */
 	create_socket_path("*", path, sizeof(path));
@@ -671,17 +625,6 @@ rte_mp_channel_init(void)
 		return -1;
 	}
 
-	if (rte_ctrl_thread_create(&async_reply_handle_tid,
-			"rte_mp_async", NULL,
-			async_reply_handle, NULL) < 0) {
-		RTE_LOG(ERR, EAL, "failed to create mp thead: %s\n",
-			strerror(errno));
-		close(mp_fd);
-		close(dir_fd);
-		mp_fd = -1;
-		return -1;
-	}
-
 	/* unlock the directory */
 	flock(dir_fd, LOCK_UN);
 	close(dir_fd);
@@ -786,7 +729,7 @@ mp_send(struct rte_mp_msg *msg, const char *peer, int type)
 	dir_fd = dirfd(mp_dir);
 
 	/* lock the directory to prevent processes spinning up while we send */
-	if (flock(dir_fd, LOCK_EX)) {
+	if (flock(dir_fd, LOCK_SH)) {
 		RTE_LOG(ERR, EAL, "Unable to lock directory %s\n",
 			mp_dir_path);
 		rte_errno = errno;
@@ -853,7 +796,7 @@ rte_mp_sendmsg(struct rte_mp_msg *msg)
 
 static int
 mp_request_async(const char *dst, struct rte_mp_msg *req,
-		struct async_request_param *param)
+		struct async_request_param *param, const struct timespec *ts)
 {
 	struct rte_mp_msg *reply_msg;
 	struct pending_request *pending_req, *exist;
@@ -898,6 +841,13 @@ mp_request_async(const char *dst, struct rte_mp_msg *req,
 
 	param->user_reply.nb_sent++;
 
+	if (rte_eal_alarm_set(ts->tv_sec * 1000000 + ts->tv_nsec / 1000,
+			async_reply_handle, pending_req) < 0) {
+		RTE_LOG(ERR, EAL, "Failed to set alarm for request %s:%s\n",
+			dst, req->name);
+		rte_panic("Fix the above to properly free all memory\n");
+	}
+
 	return 0;
 fail:
 	free(pending_req);
@@ -988,6 +938,12 @@ rte_mp_request_sync(struct rte_mp_msg *req, struct rte_mp_reply *reply,
 
 	if (check_input(req) == false)
 		return -1;
+
+	if (internal_config.no_shconf) {
+		RTE_LOG(DEBUG, EAL, "No shared files mode enabled, IPC is disabled\n");
+		return 0;
+	}
+
 	if (gettimeofday(&now, NULL) < 0) {
 		RTE_LOG(ERR, EAL, "Faile to get current time\n");
 		rte_errno = errno;
@@ -1020,7 +976,7 @@ rte_mp_request_sync(struct rte_mp_msg *req, struct rte_mp_reply *reply,
 	dir_fd = dirfd(mp_dir);
 
 	/* lock the directory to prevent processes spinning up while we send */
-	if (flock(dir_fd, LOCK_EX)) {
+	if (flock(dir_fd, LOCK_SH)) {
 		RTE_LOG(ERR, EAL, "Unable to lock directory %s\n",
 			mp_dir_path);
 		closedir(mp_dir);
@@ -1072,6 +1028,12 @@ rte_mp_request_async(struct rte_mp_msg *req, const struct timespec *ts,
 
 	if (check_input(req) == false)
 		return -1;
+
+	if (internal_config.no_shconf) {
+		RTE_LOG(DEBUG, EAL, "No shared files mode enabled, IPC is disabled\n");
+		return 0;
+	}
+
 	if (gettimeofday(&now, NULL) < 0) {
 		RTE_LOG(ERR, EAL, "Faile to get current time\n");
 		rte_errno = errno;
@@ -1119,7 +1081,7 @@ rte_mp_request_async(struct rte_mp_msg *req, const struct timespec *ts,
 
 	/* for secondary process, send request to the primary process only */
 	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
-		ret = mp_request_async(eal_mp_socket_path(), copy, param);
+		ret = mp_request_async(eal_mp_socket_path(), copy, param, ts);
 
 		/* if we didn't send anything, put dummy request on the queue */
 		if (ret == 0 && reply->nb_sent == 0) {
@@ -1146,7 +1108,7 @@ rte_mp_request_async(struct rte_mp_msg *req, const struct timespec *ts,
 	dir_fd = dirfd(mp_dir);
 
 	/* lock the directory to prevent processes spinning up while we send */
-	if (flock(dir_fd, LOCK_EX)) {
+	if (flock(dir_fd, LOCK_SH)) {
 		RTE_LOG(ERR, EAL, "Unable to lock directory %s\n",
 			mp_dir_path);
 		rte_errno = errno;
@@ -1162,7 +1124,7 @@ rte_mp_request_async(struct rte_mp_msg *req, const 
struct timespec *ts, snprintf(path, sizeof(path), "%s/%s", mp_dir_path, ent->d_name); - if (mp_request_async(path, copy, param)) + if (mp_request_async(path, copy, param, ts)) ret = -1; } /* if we didn't send anything, put dummy request on the queue */ @@ -1171,9 +1133,6 @@ rte_mp_request_async(struct rte_mp_msg *req, const struct timespec *ts, dummy_used = true; } - /* trigger async request thread wake up */ - pthread_cond_signal(&pending_requests.async_cond); - /* finally, unlock the queue */ pthread_mutex_unlock(&pending_requests.lock); @@ -1213,5 +1172,10 @@ rte_mp_reply(struct rte_mp_msg *msg, const char *peer) return -1; } + if (internal_config.no_shconf) { + RTE_LOG(DEBUG, EAL, "No shared files mode enabled, IPC is disabled\n"); + return 0; + } + return mp_send(msg, peer, MP_REP); } diff --git a/lib/librte_eal/common/eal_common_thread.c b/lib/librte_eal/common/eal_common_thread.c index 42398630..48ef4d6d 100644 --- a/lib/librte_eal/common/eal_common_thread.c +++ b/lib/librte_eal/common/eal_common_thread.c @@ -175,7 +175,7 @@ rte_ctrl_thread_create(pthread_t *thread, const char *name, params = malloc(sizeof(*params)); if (!params) - return -1; + return -ENOMEM; params->start_routine = start_routine; params->arg = arg; @@ -185,13 +185,14 @@ rte_ctrl_thread_create(pthread_t *thread, const char *name, ret = pthread_create(thread, attr, rte_thread_init, (void *)params); if (ret != 0) { free(params); - return ret; + return -ret; } if (name != NULL) { ret = rte_thread_setname(*thread, name); if (ret < 0) - goto fail; + RTE_LOG(DEBUG, EAL, + "Cannot set name for ctrl thread\n"); } cpu_found = 0; @@ -227,5 +228,5 @@ fail: } pthread_cancel(*thread); pthread_join(*thread, NULL); - return ret; + return -ret; } diff --git a/lib/librte_eal/common/eal_common_uuid.c b/lib/librte_eal/common/eal_common_uuid.c new file mode 100644 index 00000000..1b93c5b3 --- /dev/null +++ b/lib/librte_eal/common/eal_common_uuid.c @@ -0,0 +1,193 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (C) 1996, 1997 Theodore Ts'o. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
+ */ + +#include <stdio.h> +#include <string.h> +#include <stdint.h> +#include <stdlib.h> +#include <ctype.h> + +#include <rte_uuid.h> + +/* UUID packed form */ +struct uuid { + uint32_t time_low; + uint16_t time_mid; + uint16_t time_hi_and_version; + uint16_t clock_seq; + uint8_t node[6]; +}; + +static void uuid_pack(const struct uuid *uu, rte_uuid_t ptr) +{ + uint32_t tmp; + uint8_t *out = ptr; + + tmp = uu->time_low; + out[3] = (uint8_t) tmp; + tmp >>= 8; + out[2] = (uint8_t) tmp; + tmp >>= 8; + out[1] = (uint8_t) tmp; + tmp >>= 8; + out[0] = (uint8_t) tmp; + + tmp = uu->time_mid; + out[5] = (uint8_t) tmp; + tmp >>= 8; + out[4] = (uint8_t) tmp; + + tmp = uu->time_hi_and_version; + out[7] = (uint8_t) tmp; + tmp >>= 8; + out[6] = (uint8_t) tmp; + + tmp = uu->clock_seq; + out[9] = (uint8_t) tmp; + tmp >>= 8; + out[8] = (uint8_t) tmp; + + memcpy(out+10, uu->node, 6); +} + +static void uuid_unpack(const rte_uuid_t in, struct uuid *uu) +{ + const uint8_t *ptr = in; + uint32_t tmp; + + tmp = *ptr++; + tmp = (tmp << 8) | *ptr++; + tmp = (tmp << 8) | *ptr++; + tmp = (tmp << 8) | *ptr++; + uu->time_low = tmp; + + tmp = *ptr++; + tmp = (tmp << 8) | *ptr++; + uu->time_mid = tmp; + + tmp = *ptr++; + tmp = (tmp << 8) | *ptr++; + uu->time_hi_and_version = tmp; + + tmp = *ptr++; + tmp = (tmp << 8) | *ptr++; + uu->clock_seq = tmp; + + memcpy(uu->node, ptr, 6); +} + +bool rte_uuid_is_null(const rte_uuid_t uu) +{ + const uint8_t *cp = uu; + int i; + + for (i = 0; i < 16; i++) + if (*cp++) + return false; + return true; +} + +/* + * rte_uuid_compare() - compare two UUIDs. + */ +int rte_uuid_compare(const rte_uuid_t uu1, const rte_uuid_t uu2) +{ + struct uuid uuid1, uuid2; + + uuid_unpack(uu1, &uuid1); + uuid_unpack(uu2, &uuid2); + +#define UUCMP(u1, u2) \ + do { if (u1 != u2) return (u1 < u2) ? -1 : 1; } while (0) + + UUCMP(uuid1.time_low, uuid2.time_low); + UUCMP(uuid1.time_mid, uuid2.time_mid); + UUCMP(uuid1.time_hi_and_version, uuid2.time_hi_and_version); + UUCMP(uuid1.clock_seq, uuid2.clock_seq); +#undef UUCMP + + return memcmp(uuid1.node, uuid2.node, 6); +} + +int rte_uuid_parse(const char *in, rte_uuid_t uu) +{ + struct uuid uuid; + int i; + const char *cp; + char buf[3]; + + if (strlen(in) != 36) + return -1; + + for (i = 0, cp = in; i <= 36; i++, cp++) { + if ((i == 8) || (i == 13) || (i == 18) || + (i == 23)) { + if (*cp == '-') + continue; + else + return -1; + } + if (i == 36) + if (*cp == 0) + continue; + if (!isxdigit(*cp)) + return -1; + } + + uuid.time_low = strtoul(in, NULL, 16); + uuid.time_mid = strtoul(in+9, NULL, 16); + uuid.time_hi_and_version = strtoul(in+14, NULL, 16); + uuid.clock_seq = strtoul(in+19, NULL, 16); + cp = in+24; + buf[2] = 0; + + for (i = 0; i < 6; i++) { + buf[0] = *cp++; + buf[1] = *cp++; + uuid.node[i] = strtoul(buf, NULL, 16); + } + + uuid_pack(&uuid, uu); + return 0; +} + +void rte_uuid_unparse(const rte_uuid_t uu, char *out, size_t len) +{ + struct uuid uuid; + + uuid_unpack(uu, &uuid); + + snprintf(out, len, + "%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x", + uuid.time_low, uuid.time_mid, uuid.time_hi_and_version, + uuid.clock_seq >> 8, uuid.clock_seq & 0xFF, + uuid.node[0], uuid.node[1], uuid.node[2], + uuid.node[3], uuid.node[4], uuid.node[5]); +} diff --git a/lib/librte_eal/common/eal_filesystem.h b/lib/librte_eal/common/eal_filesystem.h index 364f38d1..de05febf 100644 --- a/lib/librte_eal/common/eal_filesystem.h +++ b/lib/librte_eal/common/eal_filesystem.h @@ -12,7 +12,6 @@ #define EAL_FILESYSTEM_H /** Path of rte config file. 
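 * (Editor's note: after this change the path is built from the runtime
 * directory plus RUNTIME_CONFIG_FNAME below; a concrete result such as
 * /var/run/dpdk/rte/config for the default prefix is an illustrative
 * assumption, not part of the patch.)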
*/ -#define RUNTIME_CONFIG_FMT "%s/.%s_config" #include <stdint.h> #include <limits.h> @@ -30,17 +29,14 @@ eal_create_runtime_dir(void); const char * eal_get_runtime_dir(void); +#define RUNTIME_CONFIG_FNAME "config" static inline const char * eal_runtime_config_path(void) { static char buffer[PATH_MAX]; /* static so auto-zeroed */ - const char *directory = "/var/run"; - const char *home_dir = getenv("HOME"); - if (getuid() != 0 && home_dir != NULL) - directory = home_dir; - snprintf(buffer, sizeof(buffer) - 1, RUNTIME_CONFIG_FMT, directory, - internal_config.hugefile_prefix); + snprintf(buffer, sizeof(buffer) - 1, "%s/%s", eal_get_runtime_dir(), + RUNTIME_CONFIG_FNAME); return buffer; } diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h index c4cbf3ac..00ee6e06 100644 --- a/lib/librte_eal/common/eal_internal_cfg.h +++ b/lib/librte_eal/common/eal_internal_cfg.h @@ -41,11 +41,17 @@ struct internal_config { volatile unsigned vmware_tsc_map; /**< true to use VMware TSC mapping * instead of native TSC */ volatile unsigned no_shconf; /**< true if there is no shared config */ + volatile unsigned in_memory; + /**< true if DPDK should operate entirely in-memory and not create any + * shared files or runtime data. + */ volatile unsigned create_uio_dev; /**< true to create /dev/uioX devices */ volatile enum rte_proc_type_t process_type; /**< multi-process proc type */ /** true to try allocating memory on specific sockets */ volatile unsigned force_sockets; volatile uint64_t socket_mem[RTE_MAX_NUMA_NODES]; /**< amount of memory per socket */ + volatile unsigned force_socket_limits; + volatile uint64_t socket_limit[RTE_MAX_NUMA_NODES]; /**< limit amount of memory per socket */ uintptr_t base_virtaddr; /**< base address to try and reserve memory from */ volatile unsigned legacy_mem; /**< true to enable legacy memory behavior (no dynamic allocation, diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h index 211ae06a..96e16678 100644 --- a/lib/librte_eal/common/eal_options.h +++ b/lib/librte_eal/common/eal_options.h @@ -45,8 +45,12 @@ enum { OPT_NO_PCI_NUM, #define OPT_NO_SHCONF "no-shconf" OPT_NO_SHCONF_NUM, +#define OPT_IN_MEMORY "in-memory" + OPT_IN_MEMORY_NUM, #define OPT_SOCKET_MEM "socket-mem" OPT_SOCKET_MEM_NUM, +#define OPT_SOCKET_LIMIT "socket-limit" + OPT_SOCKET_LIMIT_NUM, #define OPT_SYSLOG "syslog" OPT_SYSLOG_NUM, #define OPT_VDEV "vdev" diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h index bdadc4d5..4f809a83 100644 --- a/lib/librte_eal/common/eal_private.h +++ b/lib/librte_eal/common/eal_private.h @@ -47,6 +47,18 @@ void eal_log_set_default(FILE *default_log); int rte_eal_cpu_init(void); /** + * Create memseg lists + * + * This function is private to EAL. + * + * Preallocate virtual memory. + * + * @return + * 0 on success, negative on error + */ +int rte_eal_memseg_init(void); + +/** * Map memory * * This function is private to EAL. @@ -258,4 +270,38 @@ int rte_mp_channel_init(void); */ void dev_callback_process(char *device_name, enum rte_dev_event_type event); +/** + * @internal + * Parse a device string and store its information in an + * rte_devargs structure. + * + * A device description is split by layers of abstraction of the device: + * bus, class and driver. Each layer will offer a set of properties that + * can be applied either to configure or recognize a device. 
+ * + * This function will parse those properties and prepare the rte_devargs + * to be given to each layer for processing. + * + * Note: if the "data" field of the devargs points to devstr, + * then no dynamic allocation is performed and the rte_devargs + * can be safely discarded. + * + * Otherwise ``data`` will hold a workable copy of devstr, that will be + * used by layer descriptors within rte_devargs. In this case, + * any rte_devargs should be cleaned up before being freed. + * + * @param da + * rte_devargs structure to fill. + * + * @param devstr + * Device string. + * + * @return + * 0 on success. + * Negative errno values on error (rte_errno is set). + */ +int +rte_devargs_layers_parse(struct rte_devargs *devargs, + const char *devstr); + #endif /* _EAL_PRIVATE_H_ */ diff --git a/lib/librte_eal/common/include/rte_bitmap.h b/lib/librte_eal/common/include/rte_bitmap.h index 7d4935fc..d9facc64 100644 --- a/lib/librte_eal/common/include/rte_bitmap.h +++ b/lib/librte_eal/common/include/rte_bitmap.h @@ -198,12 +198,12 @@ rte_bitmap_get_memory_footprint(uint32_t n_bits) { /** * Bitmap initialization * - * @param mem_size - * Minimum expected size of bitmap. + * @param n_bits + * Number of pre-allocated bits in array2. * @param mem * Base address of array1 and array2. - * @param n_bits - * Number of pre-allocated bits in array2. Must be non-zero and multiple of 512. + * @param mem_size + * Minimum expected size of bitmap. * @return * Handle to bitmap instance. */ diff --git a/lib/librte_eal/common/include/rte_bus.h b/lib/librte_eal/common/include/rte_bus.h index eb9eded4..b7b5b084 100644 --- a/lib/librte_eal/common/include/rte_bus.h +++ b/lib/librte_eal/common/include/rte_bus.h @@ -211,6 +211,7 @@ struct rte_bus { rte_bus_parse_t parse; /**< Parse a device name */ struct rte_bus_conf conf; /**< Bus configuration */ rte_bus_get_iommu_class_t get_iommu_class; /**< Get iommu class */ + rte_dev_iterate_t dev_iterate; /**< Device iterator. */ }; /** @@ -325,8 +326,7 @@ enum rte_iova_mode rte_bus_get_iommu_class(void); * The constructor has higher priority than PMD constructors. */ #define RTE_REGISTER_BUS(nm, bus) \ -RTE_INIT_PRIO(businitfn_ ##nm, BUS); \ -static void businitfn_ ##nm(void) \ +RTE_INIT_PRIO(businitfn_ ##nm, BUS) \ {\ (bus).name = RTE_STR(nm);\ rte_bus_register(&bus); \ diff --git a/lib/librte_eal/common/include/rte_class.h b/lib/librte_eal/common/include/rte_class.h new file mode 100644 index 00000000..276c91e9 --- /dev/null +++ b/lib/librte_eal/common/include/rte_class.h @@ -0,0 +1,134 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 Gaëtan Rivet + */ + +#ifndef _RTE_CLASS_H_ +#define _RTE_CLASS_H_ + +/** + * @file + * + * DPDK device class interface. + * + * This file describes the interface of the device class + * abstraction layer. + * + * A device class defines the type of function a device + * will be used for e.g.: Ethernet adapter (eth), + * cryptographic coprocessor (crypto), etc. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/queue.h> + +#include <rte_dev.h> + +/** Doubly-linked list of classes */ +TAILQ_HEAD(rte_class_list, rte_class); + +/** + * A structure describing a generic device class. + */ +struct rte_class { + TAILQ_ENTRY(rte_class) next; /**< Next device class in linked list */ + const char *name; /**< Name of the class */ + rte_dev_iterate_t dev_iterate; /**< Device iterator. */ +}; + +/** + * Class comparison function. + * + * @param cls + * Class under test. + * + * @param data + * Data to compare against.
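
A usage sketch (not part of the patch) matching the corrected rte_bitmap_init() parameter order documented above; the allocation name "bmp_mem" is arbitrary, and n_bits must be a non-zero multiple of 512:

    #include <rte_bitmap.h>
    #include <rte_malloc.h>

    static struct rte_bitmap *
    make_bitmap(uint32_t n_bits) /* n_bits: non-zero multiple of 512 */
    {
            uint32_t sz = rte_bitmap_get_memory_footprint(n_bits);
            uint8_t *mem = rte_zmalloc("bmp_mem", sz, RTE_CACHE_LINE_SIZE);

            if (mem == NULL)
                    return NULL;
            /* argument order per the fixed doc: n_bits, mem, mem_size */
            return rte_bitmap_init(n_bits, mem, sz);
    }
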
+ * + * @return + * 0 if the class matches the data. + * !0 if the class does not match. + * <0 if ordering is possible and the class is lower than the data. + * >0 if ordering is possible and the class is greater than the data. + */ +typedef int (*rte_class_cmp_t)(const struct rte_class *cls, const void *data); + +/** + * Class iterator to find a particular class. + * + * This function compares each registered class to find one that matches + * the data passed as parameter. + * + * If the comparison function returns zero this function will stop iterating + * over any more classes. To continue a search the class of a previous search + * can be passed via the start parameter. + * + * @param start + * Starting point for the iteration. + * + * @param cmp + * Comparison function. + * + * @param data + * Data to pass to comparison function. + * + * @return + * A pointer to a rte_class structure or NULL in case no class matches + */ +__rte_experimental +struct rte_class * +rte_class_find(const struct rte_class *start, rte_class_cmp_t cmp, + const void *data); + +/** + * Find the registered class for a given name. + */ +__rte_experimental +struct rte_class * +rte_class_find_by_name(const char *name); + +/** + * Register a Class handle. + * + * @param cls + * A pointer to a rte_class structure describing the class + * to be registered. + */ +__rte_experimental +void rte_class_register(struct rte_class *cls); + +/** + * Unregister a Class handle. + * + * @param cls + * A pointer to a rte_class structure describing the class + * to be unregistered. + */ +__rte_experimental +void rte_class_unregister(struct rte_class *cls); + +/** + * Helper for Class registration. + * The constructor has lower priority than Bus constructors. + * The constructor has higher priority than PMD constructors. + */ +#define RTE_REGISTER_CLASS(nm, cls) \ +RTE_INIT_PRIO(classinitfn_ ##nm, CLASS) \ +{\ + (cls).name = RTE_STR(nm); \ + rte_class_register(&cls); \ +} + +#define RTE_UNREGISTER_CLASS(nm, cls) \ +RTE_FINI_PRIO(classfinifn_ ##nm, CLASS) \ +{ \ + rte_class_unregister(&cls); \ +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_CLASS_H_ */ diff --git a/lib/librte_eal/common/include/rte_common.h b/lib/librte_eal/common/include/rte_common.h index 434adfd4..069c13ec 100644 --- a/lib/librte_eal/common/include/rte_common.h +++ b/lib/librte_eal/common/include/rte_common.h @@ -83,6 +83,7 @@ typedef uint16_t unaligned_uint16_t; #define RTE_PRIORITY_LOG 101 #define RTE_PRIORITY_BUS 110 +#define RTE_PRIORITY_CLASS 120 #define RTE_PRIORITY_LAST 65535 #define RTE_PRIO(prio) \ @@ -112,6 +113,29 @@ static void __attribute__((constructor(RTE_PRIO(prio)), used)) func(void) RTE_INIT_PRIO(func, LAST) /** + * Run after main() with low priority. + * + * @param func + * Destructor function name. + * @param prio + * Priority number must be above 100. + * Lowest number is the last to run. + */ +#define RTE_FINI_PRIO(func, prio) \ +static void __attribute__((destructor(RTE_PRIO(prio)), used)) func(void) + +/** + * Run after main() with high priority. + * + * The destructor will be run *before* prioritized destructors. + * + * @param func + * Destructor function name. 
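
To illustrate the registration helper above (again, not part of the patch): a minimal, hypothetical "test" class whose iterator reports no devices. RTE_REGISTER_CLASS() fills in the name and registers the class at constructor time, after buses but before PMDs:

    #include <rte_class.h>
    #include <rte_common.h>

    /* hypothetical iterator: this toy class never matches a device */
    static void *
    test_dev_iterate(const void *start, const char *devstr,
                    const struct rte_dev_iterator *it)
    {
            RTE_SET_USED(start);
            RTE_SET_USED(devstr);
            RTE_SET_USED(it);
            return NULL;
    }

    static struct rte_class rte_class_test = {
            .dev_iterate = test_dev_iterate,
    };

    RTE_REGISTER_CLASS(test, rte_class_test)
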
+ */ +#define RTE_FINI(func) \ + RTE_FINI_PRIO(func, LAST) + +/** * Force a function to be inlined */ #define __rte_always_inline inline __attribute__((always_inline)) @@ -294,6 +318,11 @@ rte_combine64ms1b(register uint64_t v) /*********** Macros to work with powers of 2 ********/ /** + * Macro to return 1 if n is a power of 2, 0 otherwise + */ +#define RTE_IS_POWER_OF_2(n) ((n) && !(((n) - 1) & (n))) + +/** * Returns true if n is a power of 2 * @param n * Number to check diff --git a/lib/librte_eal/common/include/rte_dev.h b/lib/librte_eal/common/include/rte_dev.h index 3879ff3c..b80a8059 100644 --- a/lib/librte_eal/common/include/rte_dev.h +++ b/lib/librte_eal/common/include/rte_dev.h @@ -69,9 +69,7 @@ rte_pmd_debug_trace(const char *func_name, const char *fmt, ...) * Enable RTE_PMD_DEBUG_TRACE() when at least one component relying on the * RTE_*_RET() macros defined below is compiled in debug mode. */ -#if defined(RTE_LIBRTE_ETHDEV_DEBUG) || \ - defined(RTE_LIBRTE_CRYPTODEV_DEBUG) || \ - defined(RTE_LIBRTE_EVENTDEV_DEBUG) +#if defined(RTE_LIBRTE_EVENTDEV_DEBUG) #define RTE_PMD_DEBUG_TRACE(...) \ rte_pmd_debug_trace(__func__, __VA_ARGS__) #else @@ -176,6 +174,7 @@ struct rte_device { * @return * 0 on success, negative on error. */ +__rte_deprecated int rte_eal_dev_attach(const char *name, const char *devargs); /** @@ -186,6 +185,7 @@ int rte_eal_dev_attach(const char *name, const char *devargs); * @return * 0 on success, negative on error. */ +__rte_deprecated int rte_eal_dev_detach(struct rte_device *dev); /** @@ -285,6 +285,103 @@ __attribute__((used)) = str static const char DRV_EXP_TAG(name, kmod_dep_export)[] \ __attribute__((used)) = str +/** + * Iteration context. + * + * This context carries over the current iteration state. + */ +struct rte_dev_iterator { + const char *dev_str; /**< device string. */ + const char *bus_str; /**< bus-related part of device string. */ + const char *cls_str; /**< class-related part of device string. */ + struct rte_bus *bus; /**< bus handle. */ + struct rte_class *cls; /**< class handle. */ + struct rte_device *device; /**< current position. */ + void *class_device; /**< additional specialized context. */ +}; + +/** + * Device iteration function. + * + * Find the next device matching properties passed in parameters. + * The function takes an additional ``start`` parameter, that is + * used as starting context when relevant. + * + * The function returns the current element in the iteration. + * This return value will potentially be used as a start parameter + * in subsequent calls to the function. + * + * The additional iterator parameter is only there if a specific + * implementation needs additional context. It must not be modified by + * the iteration function itself. + * + * @param start + * Starting iteration context. + * + * @param devstr + * Device description string. + * + * @param it + * Device iterator. + * + * @return + * The address of the current element matching the device description + * string. + */ +typedef void *(*rte_dev_iterate_t)(const void *start, + const char *devstr, + const struct rte_dev_iterator *it); + +/** + * Initializes a device iterator. + * + * This iterator allows accessing a list of devices matching a criteria. + * The device matching is made among all buses and classes currently registered, + * filtered by the device description given as parameter. + * + * This function will not allocate any memory. It is safe to stop the + * iteration at any moment and let the iterator go out of context. 
+ * + * @param it + * Device iterator handle. + * + * @param str + * Device description string. + * + * @return + * 0 on successful initialization. + * <0 on error. + */ +__rte_experimental +int +rte_dev_iterator_init(struct rte_dev_iterator *it, const char *str); + +/** + * Iterates on a device iterator. + * + * Generates a new rte_device handle corresponding to the next element + * in the list described in comprehension by the iterator. + * + * The next object is returned, and the iterator is updated. + * + * @param it + * Device iterator handle. + * + * @return + * An rte_device handle if found. + * NULL if an error occurred (rte_errno is set). + * NULL if no device could be found (rte_errno is not set). + */ +__rte_experimental +struct rte_device * +rte_dev_iterator_next(struct rte_dev_iterator *it); + +#define RTE_DEV_FOREACH(dev, devstr, it) \ + for (rte_dev_iterator_init(it, devstr), \ + dev = rte_dev_iterator_next(it); \ + dev != NULL; \ + dev = rte_dev_iterator_next(it)) + #ifdef __cplusplus } #endif diff --git a/lib/librte_eal/common/include/rte_devargs.h b/lib/librte_eal/common/include/rte_devargs.h index 58fbd90a..097a4ce7 100644 --- a/lib/librte_eal/common/include/rte_devargs.h +++ b/lib/librte_eal/common/include/rte_devargs.h @@ -51,12 +51,19 @@ struct rte_devargs { enum rte_devtype type; /** Device policy. */ enum rte_dev_policy policy; - /** Bus handle for the device. */ - struct rte_bus *bus; /** Name of the device. */ char name[RTE_DEV_NAME_MAX_LEN]; + RTE_STD_C11 + union { /** Arguments string as given by user or "" for no argument. */ - char *args; + char *args; + const char *drv_str; + }; + struct rte_bus *bus; /**< bus handle. */ + struct rte_class *cls; /**< class handle. */ + const char *bus_str; /**< bus-related part of device string. */ + const char *cls_str; /**< class-related part of device string. */ + const char *data; /**< Device string storage. */ }; /** @@ -96,6 +103,42 @@ int rte_eal_parse_devargs_str(const char *devargs_str, * in argument. Store which bus will handle the device, its name * and the eventual device parameters. * + * The syntax is: + * + * bus:device_identifier,arg1=val1,arg2=val2 + * + * where "bus:" is the bus name followed by any character separator. + * The bus name is optional. If no bus name is specified, each bus + * will attempt to recognize the device identifier. The first one + * to succeed will be used. + * + * Examples: + * + * pci:0000:05.00.0,arg=val + * 05.00.0,arg=val + * vdev:net_ring0 + * + * @param da + * The devargs structure holding the device information. + * + * @param dev + * String describing a device. + * + * @return + * - 0 on success. + * - Negative errno on error. + */ +__rte_experimental +int +rte_devargs_parse(struct rte_devargs *da, const char *dev); + +/** + * Parse a device string. + * + * Verify that a bus is capable of handling the device passed + * in argument. Store which bus will handle the device, its name + * and the eventual device parameters. + * * The device string is built with a printf-like syntax. * * The syntax is: @@ -124,8 +167,8 @@ int rte_eal_parse_devargs_str(const char *devargs_str, */ __rte_experimental int -rte_devargs_parse(struct rte_devargs *da, - const char *format, ...) +rte_devargs_parsef(struct rte_devargs *da, + const char *format, ...) 
__attribute__((format(printf, 2, 0))); /** diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h index 8de5d69e..e114dcbd 100644 --- a/lib/librte_eal/common/include/rte_eal.h +++ b/lib/librte_eal/common/include/rte_eal.h @@ -490,27 +490,13 @@ static inline int rte_gettid(void) enum rte_iova_mode rte_eal_iova_mode(void); /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Get user provided pool ops name for mbuf * * @return * returns user provided pool ops name. */ -const char * __rte_experimental -rte_eal_mbuf_user_pool_ops(void); - -/** - * @deprecated - * Get default pool ops name for mbuf - * - * @return - * returns default pool ops name. - */ -__rte_deprecated const char * -rte_eal_mbuf_default_mempool_ops(void); +rte_eal_mbuf_user_pool_ops(void); #ifdef __cplusplus } diff --git a/lib/librte_eal/common/include/rte_fbarray.h b/lib/librte_eal/common/include/rte_fbarray.h index 3e61fffe..5d880551 100644 --- a/lib/librte_eal/common/include/rte_fbarray.h +++ b/lib/librte_eal/common/include/rte_fbarray.h @@ -336,6 +336,120 @@ rte_fbarray_find_contig_free(struct rte_fbarray *arr, int __rte_experimental rte_fbarray_find_contig_used(struct rte_fbarray *arr, unsigned int start); +/** + * Find index of previous free element, starting at specified index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_find_prev_free(struct rte_fbarray *arr, unsigned int start); + + +/** + * Find index of previous used element, starting at specified index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_find_prev_used(struct rte_fbarray *arr, unsigned int start); + + +/** + * Find lowest start index of chunk of ``n`` free elements, down from specified + * index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @param n + * Number of free elements to look for. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_find_prev_n_free(struct rte_fbarray *arr, unsigned int start, + unsigned int n); + + +/** + * Find lowest start index of chunk of ``n`` used elements, down from specified + * index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @param n + * Number of used elements to look for. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_find_prev_n_used(struct rte_fbarray *arr, unsigned int start, + unsigned int n); + + +/** + * Find how many more free entries there are before specified index (like + * ``rte_fbarray_find_contig_free`` but going in reverse). 
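
A sketch of the device iterator introduced above (rte_dev_iterator_init(), rte_dev_iterator_next() and the RTE_DEV_FOREACH() convenience macro). The filter string is illustrative only, since the matching syntax is interpreted by the registered bus and class parsers:

    #include <stdio.h>
    #include <rte_dev.h>

    static void
    dump_matching_devices(const char *devstr)
    {
            struct rte_dev_iterator it;
            struct rte_device *dev;

            /* on error or exhaustion, dev is NULL and the loop ends */
            RTE_DEV_FOREACH(dev, devstr, &it)
                    printf("matched device: %s\n", dev->name);
    }
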
+ * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_find_rev_contig_free(struct rte_fbarray *arr, + unsigned int start); + + +/** + * Find how many more used entries there are before specified index (like + * ``rte_fbarray_find_contig_used`` but going in reverse). + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_find_rev_contig_used(struct rte_fbarray *arr, unsigned int start); + /** * Dump ``rte_fbarray`` metadata. diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h index aab9f6fe..c4b7f4cf 100644 --- a/lib/librte_eal/common/include/rte_memory.h +++ b/lib/librte_eal/common/include/rte_memory.h @@ -264,6 +264,60 @@ int __rte_experimental rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg); /** + * Walk list of all memsegs without performing any locking. + * + * @note This function does not perform any locking, and is only safe to call + * from within memory-related callback functions. + * + * @param func + * Iterator function + * @param arg + * Argument passed to iterator + * @return + * 0 if walked over the entire list + * 1 if stopped by the user + * -1 if user function reported error + */ +int __rte_experimental +rte_memseg_walk_thread_unsafe(rte_memseg_walk_t func, void *arg); + +/** + * Walk each VA-contiguous area without performing any locking. + * + * @note This function does not perform any locking, and is only safe to call + * from within memory-related callback functions. + * + * @param func + * Iterator function + * @param arg + * Argument passed to iterator + * @return + * 0 if walked over the entire list + * 1 if stopped by the user + * -1 if user function reported error + */ +int __rte_experimental +rte_memseg_contig_walk_thread_unsafe(rte_memseg_contig_walk_t func, void *arg); + +/** + * Walk each allocated memseg list without performing any locking. + * + * @note This function does not perform any locking, and is only safe to call + * from within memory-related callback functions. + * + * @param func + * Iterator function + * @param arg + * Argument passed to iterator + * @return + * 0 if walked over the entire list + * 1 if stopped by the user + * -1 if user function reported error + */ +int __rte_experimental +rte_memseg_list_walk_thread_unsafe(rte_memseg_list_walk_t func, void *arg); + +/** * Dump the physical memory layout to a file. * * @note This function read-locks the memory hotplug subsystem, and thus cannot diff --git a/lib/librte_eal/common/include/rte_memzone.h b/lib/librte_eal/common/include/rte_memzone.h index ef370fa6..f478fa9e 100644 --- a/lib/librte_eal/common/include/rte_memzone.h +++ b/lib/librte_eal/common/include/rte_memzone.h @@ -81,8 +81,12 @@ struct rte_memzone { * memzones from memory that is already available. It will not trigger any * new allocations. * - * @note Reserving IOVA-contiguous memzones with len set to 0 is not currently - * supported. 
+ * @note When reserving memzones with len set to 0, it is preferable to also + * set a valid socket_id. Setting socket_id to SOCKET_ID_ANY is supported, but + * will likely not yield expected results. Specifically, the resulting memzone + * may not necessarily be the biggest memzone available, but rather the biggest + * memzone available on the socket id corresponding to the lcore from which the + * reservation was called. * * @param name * The name of the memzone. If it already exists, the function will @@ -141,8 +145,12 @@ const struct rte_memzone *rte_memzone_reserve(const char *name, * memzones from memory that is already available. It will not trigger any * new allocations. * - * @note Reserving IOVA-contiguous memzones with len set to 0 is not currently - * supported. + * @note When reserving memzones with len set to 0, it is preferable to also + * set a valid socket_id. Setting socket_id to SOCKET_ID_ANY is supported, but + * will likely not yield expected results. Specifically, the resulting memzone + * may not necessarily be the biggest memzone available, but rather the biggest + * memzone available on the socket id corresponding to the lcore from which the + * reservation was called. * * @param name * The name of the memzone. If it already exists, the function will @@ -206,8 +214,12 @@ const struct rte_memzone *rte_memzone_reserve_aligned(const char *name, * memzones from memory that is already available. It will not trigger any * new allocations. * - * @note Reserving IOVA-contiguous memzones with len set to 0 is not currently - * supported. + * @note When reserving memzones with len set to 0, it is preferable to also + * set a valid socket_id. Setting socket_id to SOCKET_ID_ANY is supported, but + * will likely not yield expected results. Specifically, the resulting memzone + * may not necessarily be the biggest memzone available, but rather the biggest + * memzone available on the socket id corresponding to the lcore from which the + * reservation was called. * * @param name * The name of the memzone. If it already exists, the function will diff --git a/lib/librte_eal/common/include/rte_service.h b/lib/librte_eal/common/include/rte_service.h index aea4d91b..34b41aff 100644 --- a/lib/librte_eal/common/include/rte_service.h +++ b/lib/librte_eal/common/include/rte_service.h @@ -162,6 +162,26 @@ int32_t rte_service_runstate_set(uint32_t id, uint32_t runstate); int32_t rte_service_runstate_get(uint32_t id); /** + * @warning + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice + * + * This function returns whether the service may be currently executing on + * at least one lcore, or definitely is not. This function can be used to + * determine if, after setting the service runstate to stopped, the service + * is still executing on a service lcore. + * + * Care must be taken if calling this function when the service runstate is + * running, since the result of this function may be incorrect by the time the + * function returns due to service cores running in parallel. + * + * @retval 1 Service may be running on one or more lcores + * @retval 0 Service is not running on any lcore + * @retval -EINVAL Invalid service id + */ +int32_t __rte_experimental +rte_service_may_be_active(uint32_t id); + +/** * Enable or disable the check for a service-core being mapped to the service. * An application can disable the check when it takes the responsibility to run a * service itself using *rte_service_run_iter_on_app_lcore*.
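
A sketch of the intended use of rte_service_may_be_active() documented above: after setting the runstate to stopped, poll until no service core can still be executing the service. The service id sid is assumed to be valid:

    #include <rte_service.h>
    #include <rte_pause.h>

    static void
    quiesce_service(uint32_t sid)
    {
            /* ask the service to stop... */
            rte_service_runstate_set(sid, 0);
            /* ...then wait until no service core may still be running it */
            while (rte_service_may_be_active(sid) == 1)
                    rte_pause();
    }
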
@@ -363,6 +383,42 @@ int32_t rte_service_attr_get(uint32_t id, uint32_t attr_id, */ int32_t rte_service_attr_reset_all(uint32_t id); +/** + * Returns the number of times the service runner has looped. + */ +#define RTE_SERVICE_LCORE_ATTR_LOOPS 0 + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Get an attribute from a service core. + * + * @param lcore Id of the service core. + * @param attr_id Id of the attribute to be retrieved. + * @param [out] attr_value Pointer to storage in which to write retrieved value. + * @retval 0 Success, the attribute value has been written to *attr_value*. + * -EINVAL Invalid lcore, attr_id or attr_value was NULL. + * -ENOTSUP lcore is not a service core. + */ +int32_t __rte_experimental +rte_service_lcore_attr_get(uint32_t lcore, uint32_t attr_id, + uint64_t *attr_value); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Reset all attribute values of a service core. + * + * @param lcore The service core to reset all the statistics of + * @retval 0 Successfully reset attributes + * -EINVAL Invalid service id provided + * -ENOTSUP lcore is not a service core. + */ +int32_t __rte_experimental +rte_service_lcore_attr_reset_all(uint32_t lcore); + #ifdef __cplusplus } #endif diff --git a/lib/librte_eal/common/include/rte_tailq.h b/lib/librte_eal/common/include/rte_tailq.h index 8dccaefc..9b01abb2 100644 --- a/lib/librte_eal/common/include/rte_tailq.h +++ b/lib/librte_eal/common/include/rte_tailq.h @@ -119,8 +119,7 @@ struct rte_tailq_head *rte_eal_tailq_lookup(const char *name); int rte_eal_tailq_register(struct rte_tailq_elem *t); #define EAL_REGISTER_TAILQ(t) \ -RTE_INIT(tailqinitfn_ ##t); \ -static void tailqinitfn_ ##t(void) \ +RTE_INIT(tailqinitfn_ ##t) \ { \ if (rte_eal_tailq_register(&t) < 0) \ rte_panic("Cannot initialize tailq: %s\n", t.name); \ diff --git a/lib/librte_eal/common/include/rte_uuid.h b/lib/librte_eal/common/include/rte_uuid.h new file mode 100644 index 00000000..2c846b5f --- /dev/null +++ b/lib/librte_eal/common/include/rte_uuid.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (C) 1996, 1997, 1998 Theodore Ts'o. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. 
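
A sketch of reading the new per-core loop counter through the rte_service_lcore_attr_get() API above; the lcore passed in is assumed to be a service core:

    #include <inttypes.h>
    #include <stdio.h>
    #include <rte_service.h>

    static void
    print_service_core_loops(uint32_t lcore) /* must be a service core */
    {
            uint64_t loops;

            if (rte_service_lcore_attr_get(lcore,
                            RTE_SERVICE_LCORE_ATTR_LOOPS, &loops) == 0)
                    printf("lcore %u: %" PRIu64 " service loops\n",
                            lcore, loops);
    }
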
IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ +/** + * @file + * + * UUID related functions originally from libuuid + */ + +#ifndef _RTE_UUID_H_ +#define _RTE_UUID_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdbool.h> + +/** + * Struct describing a Universally Unique Identifier + */ +typedef unsigned char rte_uuid_t[16]; + +/** + * Helper for defining UUID values for id tables. + */ +#define RTE_UUID_INIT(a, b, c, d, e) { \ + ((a) >> 24) & 0xff, ((a) >> 16) & 0xff, \ + ((a) >> 8) & 0xff, (a) & 0xff, \ + ((b) >> 8) & 0xff, (b) & 0xff, \ + ((c) >> 8) & 0xff, (c) & 0xff, \ + ((d) >> 8) & 0xff, (d) & 0xff, \ + ((e) >> 40) & 0xff, ((e) >> 32) & 0xff, \ + ((e) >> 24) & 0xff, ((e) >> 16) & 0xff, \ + ((e) >> 8) & 0xff, (e) & 0xff \ +} + +/** + * Test if UUID is all zeros. + * + * @param uu + * The uuid to check. + * @return + * true if the UUID is the null value, false otherwise + */ +bool rte_uuid_is_null(const rte_uuid_t uu); + +/** + * Copy uuid. + * + * @param dst + * Destination uuid + * @param src + * Source uuid + */ +static inline void rte_uuid_copy(rte_uuid_t dst, const rte_uuid_t src) +{ + memcpy(dst, src, sizeof(rte_uuid_t)); +} + +/** + * Compare two UUIDs + * + * @param a + * A UUID to compare + * @param b + * A UUID to compare + * @return + * returns an integer less than, equal to, or greater than zero if UUID a + * is less than, equal to, or greater than UUID b. + */ +int rte_uuid_compare(const rte_uuid_t a, const rte_uuid_t b); + +/** + * Extract UUID from string + * + * @param in + * Pointer to string of characters to convert + * @param uu + * Destination UUID + * @return + * Returns 0 on success, and -1 if string is not a valid UUID. + */ +int rte_uuid_parse(const char *in, rte_uuid_t uu); + +/** + * Convert UUID to string + * + * @param uu + * UUID to format + * @param out + * Resulting string buffer + * @param len + * Size of the available string buffer + */ +#define RTE_UUID_STRLEN (36 + 1) +void rte_uuid_unparse(const rte_uuid_t uu, char *out, size_t len); + +#ifdef __cplusplus +} +#endif + +#endif /* RTE_UUID_H */ diff --git a/lib/librte_eal/common/include/rte_version.h b/lib/librte_eal/common/include/rte_version.h index 6e2a2362..7c6714a2 100644 --- a/lib/librte_eal/common/include/rte_version.h +++ b/lib/librte_eal/common/include/rte_version.h @@ -32,7 +32,7 @@ extern "C" { /** * Minor version/month number i.e. the mm in yy.mm.z */ -#define RTE_VER_MONTH 05 +#define RTE_VER_MONTH 8 /** * Patch level number i.e. the z in yy.mm.z diff --git a/lib/librte_eal/common/include/rte_vfio.h b/lib/librte_eal/common/include/rte_vfio.h index f90972fa..5ca13fcc 100644 --- a/lib/librte_eal/common/include/rte_vfio.h +++ b/lib/librte_eal/common/include/rte_vfio.h @@ -179,7 +179,7 @@ rte_vfio_clear_group(int vfio_group_fd); * 0 if success. * -1 on error. */ -int __rte_experimental +int rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len); @@ -200,7 +200,7 @@ rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len); * -1 on error.
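
A round-trip sketch for the rte_uuid API above; the UUID string is an arbitrary RFC 4122-style example:

    #include <stdio.h>
    #include <rte_uuid.h>

    static void
    uuid_roundtrip(void)
    {
            rte_uuid_t uu;
            char out[RTE_UUID_STRLEN];

            if (rte_uuid_parse("123e4567-e89b-12d3-a456-426655440000", uu) != 0)
                    return;
            rte_uuid_unparse(uu, out, sizeof(out));
            /* out now holds the same 36-character string */
            printf("parsed and re-formatted: %s\n", out);
    }
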
*/ -int __rte_experimental +int rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len); /** * Parse IOMMU group number for a device @@ -222,7 +222,7 @@ rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len); * 0 for non-existent group or VFIO * <0 for errors */ -int __rte_experimental +int rte_vfio_get_group_num(const char *sysfs_base, const char *dev_addr, int *iommu_group_num); @@ -236,7 +236,7 @@ rte_vfio_get_group_num(const char *sysfs_base, * > 0 container fd * < 0 for errors */ -int __rte_experimental +int rte_vfio_get_container_fd(void); /** @@ -252,13 +252,10 @@ rte_vfio_get_container_fd(void); * > 0 group fd * < 0 for errors */ -int __rte_experimental +int rte_vfio_get_group_fd(int iommu_group_num); /** - * @warning - * @b EXPERIMENTAL: this API may change, or be removed, without prior notice - * * Create a new container for device binding. * * @note Any newly allocated DPDK memory will not be mapped into these @@ -269,13 +266,10 @@ rte_vfio_get_group_fd(int iommu_group_num); * the container fd if successful * <0 if failed */ -int __rte_experimental +int rte_vfio_container_create(void); /** - * @warning - * @b EXPERIMENTAL: this API may change, or be removed, without prior notice - * * Destroy the container, unbind all vfio groups within it. * * @param container_fd @@ -285,13 +279,10 @@ rte_vfio_container_create(void); * 0 if successful * <0 if failed */ -int __rte_experimental +int rte_vfio_container_destroy(int container_fd); /** - * @warning - * @b EXPERIMENTAL: this API may change, or be removed, without prior notice - * * Bind an IOMMU group to a container. * * @param container_fd @@ -304,13 +295,10 @@ rte_vfio_container_destroy(int container_fd); * group fd if successful * <0 if failed */ -int __rte_experimental +int rte_vfio_container_group_bind(int container_fd, int iommu_group_num); /** - * @warning - * @b EXPERIMENTAL: this API may change, or be removed, without prior notice - * * Unbind an IOMMU group from a container. * * @param container_fd @@ -323,13 +311,10 @@ rte_vfio_container_group_bind(int container_fd, int iommu_group_num); * 0 if successful * <0 if failed */ -int __rte_experimental +int rte_vfio_container_group_unbind(int container_fd, int iommu_group_num); /** - * @warning - * @b EXPERIMENTAL: this API may change, or be removed, without prior notice - * * Perform DMA mapping for devices in a container. * * @param container_fd @@ -348,14 +333,11 @@ rte_vfio_container_group_unbind(int container_fd, int iommu_group_num); * 0 if successful * <0 if failed */ -int __rte_experimental +int rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova, uint64_t len); /** - * @warning - * @b EXPERIMENTAL: this API may change, or be removed, without prior notice - * * Perform DMA unmapping for devices in a container.
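
A sketch chaining the now-stable container API above; the IOMMU group number and the mapping addresses are placeholders supplied by the caller:

    #include <rte_vfio.h>

    static int
    map_into_new_container(int iommu_group_num,
                    uint64_t vaddr, uint64_t iova, uint64_t len)
    {
            int cfd = rte_vfio_container_create();

            if (cfd < 0)
                    return -1;
            if (rte_vfio_container_group_bind(cfd, iommu_group_num) < 0 ||
                            rte_vfio_container_dma_map(cfd, vaddr, iova,
                                    len) < 0) {
                    rte_vfio_container_destroy(cfd);
                    return -1;
            }
            return cfd; /* caller later unmaps and destroys */
    }
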
* * @param container_fd @@ -374,7 +356,7 @@ rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, * 0 if successful * <0 if failed */ -int __rte_experimental +int rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova, uint64_t len); diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c index 9bfe9b9b..e0a8ed15 100644 --- a/lib/librte_eal/common/malloc_elem.c +++ b/lib/librte_eal/common/malloc_elem.c @@ -18,10 +18,89 @@ #include <rte_common.h> #include <rte_spinlock.h> +#include "eal_internal_cfg.h" #include "eal_memalloc.h" #include "malloc_elem.h" #include "malloc_heap.h" +size_t +malloc_elem_find_max_iova_contig(struct malloc_elem *elem, size_t align) +{ + void *cur_page, *contig_seg_start, *page_end, *cur_seg_end; + void *data_start, *data_end; + rte_iova_t expected_iova; + struct rte_memseg *ms; + size_t page_sz, cur, max; + + page_sz = (size_t)elem->msl->page_sz; + data_start = RTE_PTR_ADD(elem, MALLOC_ELEM_HEADER_LEN); + data_end = RTE_PTR_ADD(elem, elem->size - MALLOC_ELEM_TRAILER_LEN); + /* segment must start after header and with specified alignment */ + contig_seg_start = RTE_PTR_ALIGN_CEIL(data_start, align); + + /* if we're in IOVA as VA mode, or if we're in legacy mode with + * hugepages, all elements are IOVA-contiguous. + */ + if (rte_eal_iova_mode() == RTE_IOVA_VA || + (internal_config.legacy_mem && rte_eal_has_hugepages())) + return RTE_PTR_DIFF(data_end, contig_seg_start); + + cur_page = RTE_PTR_ALIGN_FLOOR(contig_seg_start, page_sz); + ms = rte_mem_virt2memseg(cur_page, elem->msl); + + /* do first iteration outside the loop */ + page_end = RTE_PTR_ADD(cur_page, page_sz); + cur_seg_end = RTE_MIN(page_end, data_end); + cur = RTE_PTR_DIFF(cur_seg_end, contig_seg_start) - + MALLOC_ELEM_TRAILER_LEN; + max = cur; + expected_iova = ms->iova + page_sz; + /* memsegs are contiguous in memory */ + ms++; + + cur_page = RTE_PTR_ADD(cur_page, page_sz); + + while (cur_page < data_end) { + page_end = RTE_PTR_ADD(cur_page, page_sz); + cur_seg_end = RTE_MIN(page_end, data_end); + + /* reset start of contiguous segment if unexpected iova */ + if (ms->iova != expected_iova) { + /* next contiguous segment must start at specified + * alignment. + */ + contig_seg_start = RTE_PTR_ALIGN(cur_page, align); + /* new segment start may be on a different page, so find + * the page and skip to next iteration to make sure + * we're not blowing past data end. + */ + ms = rte_mem_virt2memseg(contig_seg_start, elem->msl); + cur_page = ms->addr; + /* don't trigger another recalculation */ + expected_iova = ms->iova; + continue; + } + /* cur_seg_end ends on a page boundary or on data end. if we're + * looking at data end, then malloc trailer is already included + * in the calculations. if we're looking at page end, then we + * know there's more data past this page and thus there's space + * for malloc element trailer, so don't count it here. 
+ */ + cur = RTE_PTR_DIFF(cur_seg_end, contig_seg_start); + /* update max if cur value is bigger */ + if (cur > max) + max = cur; + + /* move to next page */ + cur_page = page_end; + expected_iova = ms->iova + page_sz; + /* memsegs are contiguous in memory */ + ms++; + } + + return max; +} + /* * Initialize a general malloc_elem header structure */ @@ -386,16 +465,18 @@ malloc_elem_join_adjacent_free(struct malloc_elem *elem) if (elem->next != NULL && elem->next->state == ELEM_FREE && next_elem_is_adjacent(elem)) { void *erase; + size_t erase_len; /* we will want to erase the trailer and header */ erase = RTE_PTR_SUB(elem->next, MALLOC_ELEM_TRAILER_LEN); + erase_len = MALLOC_ELEM_OVERHEAD + elem->next->pad; /* remove from free list, join to this one */ malloc_elem_free_list_remove(elem->next); join_elem(elem, elem->next); - /* erase header and trailer */ - memset(erase, 0, MALLOC_ELEM_OVERHEAD); + /* erase header, trailer and pad */ + memset(erase, 0, erase_len); } /* @@ -406,9 +487,11 @@ malloc_elem_join_adjacent_free(struct malloc_elem *elem) prev_elem_is_adjacent(elem)) { struct malloc_elem *new_elem; void *erase; + size_t erase_len; /* we will want to erase trailer and header */ erase = RTE_PTR_SUB(elem, MALLOC_ELEM_TRAILER_LEN); + erase_len = MALLOC_ELEM_OVERHEAD + elem->pad; /* remove from free list, join to this one */ malloc_elem_free_list_remove(elem->prev); @@ -416,8 +499,8 @@ malloc_elem_join_adjacent_free(struct malloc_elem *elem) new_elem = elem->prev; join_elem(new_elem, elem); - /* erase header and trailer */ - memset(erase, 0, MALLOC_ELEM_OVERHEAD); + /* erase header, trailer and pad */ + memset(erase, 0, erase_len); elem = new_elem; } @@ -436,7 +519,7 @@ malloc_elem_free(struct malloc_elem *elem) void *ptr; size_t data_len; - ptr = RTE_PTR_ADD(elem, sizeof(*elem)); + ptr = RTE_PTR_ADD(elem, MALLOC_ELEM_HEADER_LEN); data_len = elem->size - MALLOC_ELEM_OVERHEAD; elem = malloc_elem_join_adjacent_free(elem); diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h index 7331af9c..e2bda4c0 100644 --- a/lib/librte_eal/common/malloc_elem.h +++ b/lib/librte_eal/common/malloc_elem.h @@ -179,4 +179,10 @@ malloc_elem_free_list_index(size_t size); void malloc_elem_free_list_insert(struct malloc_elem *elem); +/* + * Find biggest IOVA-contiguous zone within an element with specified alignment. + */ +size_t +malloc_elem_find_max_iova_contig(struct malloc_elem *elem, size_t align); + #endif /* MALLOC_ELEM_H_ */ diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c index d6cf3af8..12aaf2d7 100644 --- a/lib/librte_eal/common/malloc_heap.c +++ b/lib/librte_eal/common/malloc_heap.c @@ -149,6 +149,52 @@ find_suitable_element(struct malloc_heap *heap, size_t size, } /* + * Iterates through the freelist for a heap to find a free element with the + * biggest size and requested alignment. Will also set size to whatever element + * size that was found. + * Returns null on failure, or pointer to element on success. 
+ */ +static struct malloc_elem * +find_biggest_element(struct malloc_heap *heap, size_t *size, + unsigned int flags, size_t align, bool contig) +{ + struct malloc_elem *elem, *max_elem = NULL; + size_t idx, max_size = 0; + + for (idx = 0; idx < RTE_HEAP_NUM_FREELISTS; idx++) { + for (elem = LIST_FIRST(&heap->free_head[idx]); + !!elem; elem = LIST_NEXT(elem, free_list)) { + size_t cur_size; + if (!check_hugepage_sz(flags, elem->msl->page_sz)) + continue; + if (contig) { + cur_size = + malloc_elem_find_max_iova_contig(elem, + align); + } else { + void *data_start = RTE_PTR_ADD(elem, + MALLOC_ELEM_HEADER_LEN); + void *data_end = RTE_PTR_ADD(elem, elem->size - + MALLOC_ELEM_TRAILER_LEN); + void *aligned = RTE_PTR_ALIGN_CEIL(data_start, + align); + /* check if aligned data start is beyond end */ + if (aligned >= data_end) + continue; + cur_size = RTE_PTR_DIFF(data_end, aligned); + } + if (cur_size > max_size) { + max_size = cur_size; + max_elem = elem; + } + } + } + + *size = max_size; + return max_elem; +} + +/* * Main function to allocate a block of memory from the heap. * It locks the free list, scans it, and adds a new memseg if the * scan fails. Once the new memseg is added, it re-scans and should return @@ -174,6 +220,26 @@ heap_alloc(struct malloc_heap *heap, const char *type __rte_unused, size_t size, return elem == NULL ? NULL : (void *)(&elem[1]); } +static void * +heap_alloc_biggest(struct malloc_heap *heap, const char *type __rte_unused, + unsigned int flags, size_t align, bool contig) +{ + struct malloc_elem *elem; + size_t size; + + align = RTE_CACHE_LINE_ROUNDUP(align); + + elem = find_biggest_element(heap, &size, flags, align, contig); + if (elem != NULL) { + elem = malloc_elem_alloc(elem, size, align, 0, contig); + + /* increase heap's count of allocated elements */ + heap->alloc_count++; + } + + return elem == NULL ? NULL : (void *)(&elem[1]); +} + /* this function is exposed in malloc_mp.h */ void rollback_expand_heap(struct rte_memseg **ms, int n_segs, @@ -575,6 +641,66 @@ malloc_heap_alloc(const char *type, size_t size, int socket_arg, return NULL; } +static void * +heap_alloc_biggest_on_socket(const char *type, int socket, unsigned int flags, + size_t align, bool contig) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct malloc_heap *heap = &mcfg->malloc_heaps[socket]; + void *ret; + + rte_spinlock_lock(&(heap->lock)); + + align = align == 0 ? 
1 : align; + + ret = heap_alloc_biggest(heap, type, flags, align, contig); + + rte_spinlock_unlock(&(heap->lock)); + + return ret; +} + +void * +malloc_heap_alloc_biggest(const char *type, int socket_arg, unsigned int flags, + size_t align, bool contig) +{ + int socket, i, cur_socket; + void *ret; + + /* return NULL if align is not power-of-2 */ + if ((align && !rte_is_power_of_2(align))) + return NULL; + + if (!rte_eal_has_hugepages()) + socket_arg = SOCKET_ID_ANY; + + if (socket_arg == SOCKET_ID_ANY) + socket = malloc_get_numa_socket(); + else + socket = socket_arg; + + /* Check socket parameter */ + if (socket >= RTE_MAX_NUMA_NODES) + return NULL; + + ret = heap_alloc_biggest_on_socket(type, socket, flags, align, + contig); + if (ret != NULL || socket_arg != SOCKET_ID_ANY) + return ret; + + /* try other heaps */ + for (i = 0; i < (int) rte_socket_count(); i++) { + cur_socket = rte_socket_id_by_idx(i); + if (cur_socket == socket) + continue; + ret = heap_alloc_biggest_on_socket(type, cur_socket, flags, + align, contig); + if (ret != NULL) + return ret; + } + return NULL; +} + /* this function is exposed in malloc_mp.h */ int malloc_heap_free_pages(void *aligned_start, size_t aligned_len) diff --git a/lib/librte_eal/common/malloc_heap.h b/lib/librte_eal/common/malloc_heap.h index 03b80141..f52cb555 100644 --- a/lib/librte_eal/common/malloc_heap.h +++ b/lib/librte_eal/common/malloc_heap.h @@ -29,6 +29,10 @@ void * malloc_heap_alloc(const char *type, size_t size, int socket, unsigned int flags, size_t align, size_t bound, bool contig); +void * +malloc_heap_alloc_biggest(const char *type, int socket, unsigned int flags, + size_t align, bool contig); + int malloc_heap_free(struct malloc_elem *elem); diff --git a/lib/librte_eal/common/meson.build b/lib/librte_eal/common/meson.build index 8a3dcfee..56005bea 100644 --- a/lib/librte_eal/common/meson.build +++ b/lib/librte_eal/common/meson.build @@ -8,6 +8,7 @@ common_objs = [] common_sources = files( 'eal_common_bus.c', 'eal_common_cpuflags.c', + 'eal_common_class.c', 'eal_common_devargs.c', 'eal_common_dev.c', 'eal_common_errno.c', @@ -25,6 +26,7 @@ common_sources = files( 'eal_common_tailqs.c', 'eal_common_thread.c', 'eal_common_timer.c', + 'eal_common_uuid.c', 'malloc_elem.c', 'malloc_heap.c', 'malloc_mp.c', @@ -46,6 +48,7 @@ common_headers = files( 'include/rte_branch_prediction.h', 'include/rte_bus.h', 'include/rte_bitmap.h', + 'include/rte_class.h', 'include/rte_common.h', 'include/rte_debug.h', 'include/rte_devargs.h', @@ -75,6 +78,7 @@ common_headers = files( 'include/rte_string_fns.h', 'include/rte_tailq.h', 'include/rte_time.h', + 'include/rte_uuid.h', 'include/rte_version.h') # special case install the generic headers, since they go in a subdir diff --git a/lib/librte_eal/common/rte_service.c b/lib/librte_eal/common/rte_service.c index 73507aac..8767c722 100644 --- a/lib/librte_eal/common/rte_service.c +++ b/lib/librte_eal/common/rte_service.c @@ -52,6 +52,7 @@ struct rte_service_spec_impl { rte_atomic32_t num_mapped_cores; uint64_t calls; uint64_t cycles_spent; + uint8_t active_on_lcore[RTE_MAX_LCORE]; } __rte_cache_aligned; /* the internal values of a service core */ @@ -61,7 +62,7 @@ struct core_state { uint8_t runstate; /* running or stopped */ uint8_t is_service_core; /* set if core is currently a service core */ - /* extreme statistics */ + uint64_t loops; uint64_t calls_per_service[RTE_SERVICE_NUM_MAX]; } __rte_cache_aligned; @@ -347,15 +348,19 @@ rte_service_runner_do_callback(struct rte_service_spec_impl *s, static inline 
int32_t -service_run(uint32_t i, struct core_state *cs, uint64_t service_mask) +service_run(uint32_t i, int lcore, struct core_state *cs, uint64_t service_mask) { if (!service_valid(i)) return -EINVAL; struct rte_service_spec_impl *s = &rte_services[i]; if (s->comp_runstate != RUNSTATE_RUNNING || s->app_runstate != RUNSTATE_RUNNING || - !(service_mask & (UINT64_C(1) << i))) + !(service_mask & (UINT64_C(1) << i))) { + s->active_on_lcore[lcore] = 0; return -ENOEXEC; + } + + s->active_on_lcore[lcore] = 1; /* check do we need cmpset, if MT safe or <= 1 core * mapped, atomic ops are not required. @@ -374,6 +379,25 @@ service_run(uint32_t i, struct core_state *cs, uint64_t service_mask) return 0; } +int32_t __rte_experimental +rte_service_may_be_active(uint32_t id) +{ + uint32_t ids[RTE_MAX_LCORE] = {0}; + struct rte_service_spec_impl *s = &rte_services[id]; + int32_t lcore_count = rte_service_lcore_list(ids, RTE_MAX_LCORE); + int i; + + if (!service_valid(id)) + return -EINVAL; + + for (i = 0; i < lcore_count; i++) { + if (s->active_on_lcore[ids[i]]) + return 1; + } + + return 0; +} + int32_t rte_service_run_iter_on_app_lcore(uint32_t id, uint32_t serialize_mt_unsafe) { @@ -398,7 +422,7 @@ int32_t rte_service_run_iter_on_app_lcore(uint32_t id, return -EBUSY; } - int ret = service_run(id, cs, UINT64_MAX); + int ret = service_run(id, rte_lcore_id(), cs, UINT64_MAX); if (serialize_mt_unsafe) rte_atomic32_dec(&s->num_mapped_cores); @@ -419,9 +443,11 @@ rte_service_runner_func(void *arg) for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) { /* return value ignored as no change to code flow */ - service_run(i, cs, service_mask); + service_run(i, lcore, cs, service_mask); } + cs->loops++; + rte_smp_rmb(); } @@ -729,6 +755,28 @@ rte_service_attr_get(uint32_t id, uint32_t attr_id, uint32_t *attr_value) } } +int32_t __rte_experimental +rte_service_lcore_attr_get(uint32_t lcore, uint32_t attr_id, + uint64_t *attr_value) +{ + struct core_state *cs; + + if (lcore >= RTE_MAX_LCORE || !attr_value) + return -EINVAL; + + cs = &lcore_states[lcore]; + if (!cs->is_service_core) + return -ENOTSUP; + + switch (attr_id) { + case RTE_SERVICE_LCORE_ATTR_LOOPS: + *attr_value = cs->loops; + return 0; + default: + return -EINVAL; + } +} + static void rte_service_dump_one(FILE *f, struct rte_service_spec_impl *s, uint64_t all_cycles, uint32_t reset) @@ -764,6 +812,23 @@ rte_service_attr_reset_all(uint32_t id) return 0; } +int32_t __rte_experimental +rte_service_lcore_attr_reset_all(uint32_t lcore) +{ + struct core_state *cs; + + if (lcore >= RTE_MAX_LCORE) + return -EINVAL; + + cs = &lcore_states[lcore]; + if (!cs->is_service_core) + return -ENOTSUP; + + cs->loops = 0; + + return 0; +} + static void service_dump_calls_per_lcore(FILE *f, uint32_t lcore, uint32_t reset) { diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile index 3719ec9d..fd92c75c 100644 --- a/lib/librte_eal/linuxapp/eal/Makefile +++ b/lib/librte_eal/linuxapp/eal/Makefile @@ -10,7 +10,7 @@ ARCH_DIR ?= $(RTE_ARCH) EXPORT_MAP := ../../rte_eal_version.map VPATH += $(RTE_SDK)/lib/librte_eal/common/arch/$(ARCH_DIR) -LIBABIVER := 7 +LIBABIVER := 8 VPATH += $(RTE_SDK)/lib/librte_eal/common @@ -24,6 +24,7 @@ LDLIBS += -ldl LDLIBS += -lpthread LDLIBS += -lgcc_s LDLIBS += -lrt +LDLIBS += -lrte_kvargs ifeq ($(CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES),y) LDLIBS += -lnuma endif @@ -60,12 +61,14 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_hypervisor.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_string_fns.c 
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_hexdump.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_devargs.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_class.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_bus.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_dev.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_options.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_thread.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_proc.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_fbarray.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_uuid.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_malloc.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_elem.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_heap.c diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c index 8655b869..e59ac657 100644 --- a/lib/librte_eal/linuxapp/eal/eal.c +++ b/lib/librte_eal/linuxapp/eal/eal.c @@ -155,22 +155,12 @@ eal_get_runtime_dir(void) } /* Return user provided mbuf pool ops name */ -const char * __rte_experimental +const char * rte_eal_mbuf_user_pool_ops(void) { return internal_config.user_mbuf_pool_ops_name; } -/* Return mbuf pool ops name */ -const char * -rte_eal_mbuf_default_mempool_ops(void) -{ - if (internal_config.user_mbuf_pool_ops_name == NULL) - return RTE_MBUF_DEFAULT_MEMPOOL_OPS; - - return internal_config.user_mbuf_pool_ops_name; -} - /* Return a pointer to the configuration structure */ struct rte_config * rte_eal_get_configuration(void) @@ -344,12 +334,17 @@ eal_proc_type_detect(void) enum rte_proc_type_t ptype = RTE_PROC_PRIMARY; const char *pathname = eal_runtime_config_path(); - /* if we can open the file but not get a write-lock we are a secondary - * process. NOTE: if we get a file handle back, we keep that open - * and don't close it to prevent a race condition between multiple opens */ - if (((mem_cfg_fd = open(pathname, O_RDWR)) >= 0) && - (fcntl(mem_cfg_fd, F_SETLK, &wr_lock) < 0)) - ptype = RTE_PROC_SECONDARY; + /* if there is no shared config, there can be no secondary processes */ + if (!internal_config.no_shconf) { + /* if we can open the file but not get a write-lock we are a + * secondary process. NOTE: if we get a file handle back, we + * keep that open and don't close it to prevent a race condition + * between multiple opens. + */ + if (((mem_cfg_fd = open(pathname, O_RDWR)) >= 0) && + (fcntl(mem_cfg_fd, F_SETLK, &wr_lock) < 0)) + ptype = RTE_PROC_SECONDARY; + } RTE_LOG(INFO, EAL, "Auto-detected process type: %s\n", ptype == RTE_PROC_PRIMARY ?
"PRIMARY" : "SECONDARY"); @@ -405,6 +400,7 @@ eal_usage(const char *prgname) eal_common_usage(); printf("EAL Linux options:\n" " --"OPT_SOCKET_MEM" Memory to allocate on sockets (comma separated values)\n" + " --"OPT_SOCKET_LIMIT" Limit memory allocation on sockets (comma separated values)\n" " --"OPT_HUGE_DIR" Directory where hugetlbfs is mounted\n" " --"OPT_FILE_PREFIX" Prefix for hugepage filenames\n" " --"OPT_BASE_VIRTADDR" Base virtual address\n" @@ -434,46 +430,45 @@ rte_set_application_usage_hook( rte_usage_hook_t usage_func ) } static int -eal_parse_socket_mem(char *socket_mem) +eal_parse_socket_arg(char *strval, volatile uint64_t *socket_arg) { char * arg[RTE_MAX_NUMA_NODES]; char *end; int arg_num, i, len; uint64_t total_mem = 0; - len = strnlen(socket_mem, SOCKET_MEM_STRLEN); + len = strnlen(strval, SOCKET_MEM_STRLEN); if (len == SOCKET_MEM_STRLEN) { RTE_LOG(ERR, EAL, "--socket-mem is too long\n"); return -1; } /* all other error cases will be caught later */ - if (!isdigit(socket_mem[len-1])) + if (!isdigit(strval[len-1])) return -1; /* split the optarg into separate socket values */ - arg_num = rte_strsplit(socket_mem, len, + arg_num = rte_strsplit(strval, len, arg, RTE_MAX_NUMA_NODES, ','); /* if split failed, or 0 arguments */ if (arg_num <= 0) return -1; - internal_config.force_sockets = 1; - /* parse each defined socket option */ errno = 0; for (i = 0; i < arg_num; i++) { + uint64_t val; end = NULL; - internal_config.socket_mem[i] = strtoull(arg[i], &end, 10); + val = strtoull(arg[i], &end, 10); /* check for invalid input */ if ((errno != 0) || (arg[i][0] == '\0') || (end == NULL) || (*end != '\0')) return -1; - internal_config.socket_mem[i] *= 1024ULL; - internal_config.socket_mem[i] *= 1024ULL; - total_mem += internal_config.socket_mem[i]; + val <<= 20; + total_mem += val; + socket_arg[i] = val; } /* check if we have a positive amount of total memory */ @@ -621,13 +616,27 @@ eal_parse_args(int argc, char **argv) break; case OPT_SOCKET_MEM_NUM: - if (eal_parse_socket_mem(optarg) < 0) { + if (eal_parse_socket_arg(optarg, + internal_config.socket_mem) < 0) { RTE_LOG(ERR, EAL, "invalid parameters for --" OPT_SOCKET_MEM "\n"); eal_usage(prgname); ret = -1; goto out; } + internal_config.force_sockets = 1; + break; + + case OPT_SOCKET_LIMIT_NUM: + if (eal_parse_socket_arg(optarg, + internal_config.socket_limit) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_SOCKET_LIMIT "\n"); + eal_usage(prgname); + ret = -1; + goto out; + } + internal_config.force_socket_limits = 1; break; case OPT_BASE_VIRTADDR_NUM: @@ -678,6 +687,14 @@ eal_parse_args(int argc, char **argv) } } + /* create runtime data directory */ + if (internal_config.no_shconf == 0 && + eal_create_runtime_dir() < 0) { + RTE_LOG(ERR, EAL, "Cannot create runtime directory\n"); + ret = -1; + goto out; + } + if (eal_adjust_config(&internal_config) != 0) { ret = -1; goto out; @@ -817,13 +834,6 @@ rte_eal_init(int argc, char **argv) return -1; } - /* create runtime data directory */ - if (eal_create_runtime_dir() < 0) { - rte_eal_init_alert("Cannot create runtime directory\n"); - rte_errno = EACCES; - return -1; - } - if (eal_plugins_init() < 0) { rte_eal_init_alert("Cannot init plugins\n"); rte_errno = EINVAL; @@ -839,6 +849,11 @@ rte_eal_init(int argc, char **argv) rte_config_init(); + if (rte_eal_intr_init() < 0) { + rte_eal_init_alert("Cannot init interrupt-handling thread\n"); + return -1; + } + /* Put mp channel init before bus scan so that we can init the vdev * bus through mp channel in the secondary 
process before the bus scan. */ @@ -968,11 +983,6 @@ rte_eal_init(int argc, char **argv) rte_config.master_lcore, (int)thread_id, cpuset, ret == 0 ? "" : "..."); - if (rte_eal_intr_init() < 0) { - rte_eal_init_alert("Cannot init interrupt-handling thread\n"); - return -1; - } - RTE_LCORE_FOREACH_SLAVE(i) { /* @@ -1044,9 +1054,26 @@ rte_eal_init(int argc, char **argv) return fctret; } +static int +mark_freeable(const struct rte_memseg_list *msl, const struct rte_memseg *ms, + void *arg __rte_unused) +{ + /* ms is const, so find this memseg */ + struct rte_memseg *found = rte_mem_virt2memseg(ms->addr, msl); + + found->flags &= ~RTE_MEMSEG_FLAG_DO_NOT_FREE; + + return 0; +} + int __rte_experimental rte_eal_cleanup(void) { + /* if we're in a primary process, we need to mark hugepages as freeable + * so that finalization can release them back to the system. + */ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) + rte_memseg_walk(mark_freeable, NULL); rte_service_finalize(); return 0; } diff --git a/lib/librte_eal/linuxapp/eal/eal_alarm.c b/lib/librte_eal/linuxapp/eal/eal_alarm.c index c115e823..391d2a65 100644 --- a/lib/librte_eal/linuxapp/eal/eal_alarm.c +++ b/lib/librte_eal/linuxapp/eal/eal_alarm.c @@ -19,7 +19,6 @@ #include <rte_launch.h> #include <rte_lcore.h> #include <rte_errno.h> -#include <rte_malloc.h> #include <rte_spinlock.h> #include <eal_private.h> @@ -91,7 +90,7 @@ eal_alarm_callback(void *arg __rte_unused) rte_spinlock_lock(&alarm_list_lk); LIST_REMOVE(ap, next); - rte_free(ap); + free(ap); } if (!LIST_EMPTY(&alarm_list)) { @@ -122,7 +121,7 @@ rte_eal_alarm_set(uint64_t us, rte_eal_alarm_callback cb_fn, void *cb_arg) if (us < 1 || us > (UINT64_MAX - US_PER_S) || cb_fn == NULL) return -EINVAL; - new_alarm = rte_zmalloc(NULL, sizeof(*new_alarm), 0); + new_alarm = calloc(1, sizeof(*new_alarm)); if (new_alarm == NULL) return -ENOMEM; @@ -196,7 +195,7 @@ rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn, void *cb_arg) if (ap->executing == 0) { LIST_REMOVE(ap, next); - rte_free(ap); + free(ap); count++; } else { /* If calling from other context, mark that alarm is executing @@ -220,7 +219,7 @@ rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn, void *cb_arg) if (ap->executing == 0) { LIST_REMOVE(ap, next); - rte_free(ap); + free(ap); count++; ap = ap_prev; } else if (pthread_equal(ap->executing_id, pthread_self()) == 0) diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c index 7eca711b..3a7d4b22 100644 --- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c +++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c @@ -18,6 +18,8 @@ #include <sys/queue.h> #include <sys/stat.h> +#include <linux/mman.h> /* for hugetlb-related flags */ + #include <rte_memory.h> #include <rte_eal.h> #include <rte_launch.h> @@ -313,11 +315,49 @@ compare_hpi(const void *a, const void *b) return hpi_b->hugepage_sz - hpi_a->hugepage_sz; } +static void +calc_num_pages(struct hugepage_info *hpi, struct dirent *dirent) +{ + uint64_t total_pages = 0; + unsigned int i; + + /* + * first, try to put all hugepages into relevant sockets, but + * if the first attempt fails, fall back to collecting all pages + * in one socket and sorting them later + */ + total_pages = 0; + /* we also don't want to do this for legacy init */ + if (!internal_config.legacy_mem) + for (i = 0; i < rte_socket_count(); i++) { + int socket = rte_socket_id_by_idx(i); + unsigned int num_pages = + get_num_hugepages_on_node( + dirent->d_name, socket); + hpi->num_pages[socket] = num_pages; +
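The rte_eal_cleanup() hunk above uses the rte_memseg_walk() callback API to visit every allocated segment. For readers unfamiliar with the walk convention, a small sketch (summing mapped memory; the callback return codes are the important part):

    #include <rte_common.h>
    #include <rte_memory.h>

    static int
    sum_seg_len(const struct rte_memseg_list *msl __rte_unused,
            const struct rte_memseg *ms, void *arg)
    {
        uint64_t *total = arg;

        *total += ms->len;
        return 0; /* 0 = keep walking, >0 = stop early, <0 = abort with error */
    }

    /* usage: uint64_t total = 0; rte_memseg_walk(sum_seg_len, &total); */

mark_freeable() follows the same pattern, except that it modifies each segment, which is why it has to look the non-const memseg back up with rte_mem_virt2memseg().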
total_pages += num_pages; + } + /* + * we failed to sort memory from the get go, so fall + * back to old way + */ + if (total_pages == 0) { + hpi->num_pages[0] = get_num_hugepages(dirent->d_name); + +#ifndef RTE_ARCH_64 + /* for 32-bit systems, limit number of hugepages to + * 1GB per page size */ + hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0], + RTE_PGSIZE_1G / hpi->hugepage_sz); +#endif + } +} + static int hugepage_info_init(void) { const char dirent_start_text[] = "hugepages-"; const size_t dirent_start_len = sizeof(dirent_start_text) - 1; - unsigned int i, total_pages, num_sizes = 0; + unsigned int i, num_sizes = 0; DIR *dir; struct dirent *dirent; @@ -355,6 +395,22 @@ hugepage_info_init(void) "%" PRIu64 " reserved, but no mounted " "hugetlbfs found for that size\n", num_pages, hpi->hugepage_sz); + /* if we have kernel support for reserving hugepages + * through mmap, and we're in in-memory mode, treat this + * page size as valid. we cannot be in legacy mode at + * this point because we've checked this earlier in the + * init process. + */ +#ifdef MAP_HUGE_SHIFT + if (internal_config.in_memory) { + RTE_LOG(DEBUG, EAL, "In-memory mode enabled, " + "hugepages of size %" PRIu64 " bytes " + "will be allocated anonymously\n", + hpi->hugepage_sz); + calc_num_pages(hpi, dirent); + num_sizes++; + } +#endif continue; } @@ -371,35 +427,7 @@ hugepage_info_init(void) if (clear_hugedir(hpi->hugedir) == -1) break; - /* - * first, try to put all hugepages into relevant sockets, but - * if first attempts fails, fall back to collecting all pages - * in one socket and sorting them later - */ - total_pages = 0; - /* we also don't want to do this for legacy init */ - if (!internal_config.legacy_mem) - for (i = 0; i < rte_socket_count(); i++) { - int socket = rte_socket_id_by_idx(i); - unsigned int num_pages = - get_num_hugepages_on_node( - dirent->d_name, socket); - hpi->num_pages[socket] = num_pages; - total_pages += num_pages; - } - /* - * we failed to sort memory from the get go, so fall - * back to old way - */ - if (total_pages == 0) - hpi->num_pages[0] = get_num_hugepages(dirent->d_name); - -#ifndef RTE_ARCH_64 - /* for 32-bit systems, limit number of hugepages to - * 1GB per page size */ - hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0], - RTE_PGSIZE_1G / hpi->hugepage_sz); -#endif + calc_num_pages(hpi, dirent); num_sizes++; } @@ -423,8 +451,7 @@ hugepage_info_init(void) for (j = 0; j < RTE_MAX_NUMA_NODES; j++) num_pages += hpi->num_pages[j]; - if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0 && - num_pages > 0) + if (num_pages > 0) return 0; } @@ -446,6 +473,10 @@ eal_hugepage_info_init(void) if (hugepage_info_init() < 0) return -1; + /* for no shared files mode, we're done */ + if (internal_config.no_shconf) + return 0; + hpi = &internal_config.hugepage_info[0]; tmp_hpi = create_shared_memory(eal_hugepage_info_path(), diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c index 056d41c1..4076c6d6 100644 --- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c +++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c @@ -30,7 +30,6 @@ #include <rte_branch_prediction.h> #include <rte_debug.h> #include <rte_log.h> -#include <rte_malloc.h> #include <rte_errno.h> #include <rte_spinlock.h> #include <rte_pause.h> @@ -405,8 +404,7 @@ rte_intr_callback_register(const struct rte_intr_handle *intr_handle, } /* allocate a new interrupt callback entity */ - callback = rte_zmalloc("interrupt callback list", - sizeof(*callback), 0); + callback = calloc(1, 
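As a worked example of the 32-bit cap in calc_num_pages() above: the fallback path limits each page size to RTE_PGSIZE_1G / hugepage_sz pages, so with 2 MiB hugepages that is (1 << 30) / (2 << 20) = 512 pages, i.e. at most 1 GiB per page size, and with 1 GiB hugepages the cap is a single page. This keeps total hugepage usage within the much smaller virtual address space of a 32-bit process.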
sizeof(*callback)); if (callback == NULL) { RTE_LOG(ERR, EAL, "Can not allocate memory\n"); return -ENOMEM; @@ -420,7 +418,7 @@ rte_intr_callback_register(const struct rte_intr_handle *intr_handle, TAILQ_FOREACH(src, &intr_sources, next) { if (src->intr_handle.fd == intr_handle->fd) { /* we had no interrupts for this */ - if TAILQ_EMPTY(&src->callbacks) + if (TAILQ_EMPTY(&src->callbacks)) wake_thread = 1; TAILQ_INSERT_TAIL(&(src->callbacks), callback, next); @@ -431,10 +429,10 @@ rte_intr_callback_register(const struct rte_intr_handle *intr_handle, /* no existing callbacks for this - add new source */ if (src == NULL) { - if ((src = rte_zmalloc("interrupt source list", - sizeof(*src), 0)) == NULL) { + src = calloc(1, sizeof(*src)); + if (src == NULL) { RTE_LOG(ERR, EAL, "Can not allocate memory\n"); - rte_free(callback); + free(callback); ret = -ENOMEM; } else { src->intr_handle = *intr_handle; @@ -501,7 +499,7 @@ rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle, if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 || cb->cb_arg == cb_arg)) { TAILQ_REMOVE(&src->callbacks, cb, next); - rte_free(cb); + free(cb); ret++; } } @@ -509,7 +507,7 @@ rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle, /* all callbacks for that source are removed. */ if (TAILQ_EMPTY(&src->callbacks)) { TAILQ_REMOVE(&intr_sources, src, next); - rte_free(src); + free(src); } } diff --git a/lib/librte_eal/linuxapp/eal/eal_log.c b/lib/librte_eal/linuxapp/eal/eal_log.c index ff145884..9d02dddb 100644 --- a/lib/librte_eal/linuxapp/eal/eal_log.c +++ b/lib/librte_eal/linuxapp/eal/eal_log.c @@ -25,25 +25,14 @@ static ssize_t console_log_write(__attribute__((unused)) void *c, const char *buf, size_t size) { - char copybuf[BUFSIZ + 1]; ssize_t ret; - uint32_t loglevel; /* write on stdout */ ret = fwrite(buf, 1, size, stdout); fflush(stdout); - /* truncate message if too big (should not happen) */ - if (size > BUFSIZ) - size = BUFSIZ; - /* Syslog error levels are from 0 to 7, so subtract 1 to convert */ - loglevel = rte_log_cur_msg_loglevel() - 1; - memcpy(copybuf, buf, size); - copybuf[size] = '\0'; - - /* write on syslog too */ - syslog(loglevel, "%s", copybuf); + syslog(rte_log_cur_msg_loglevel() - 1, "%.*s", (int)size, buf); return ret; } diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c index 8c11f98c..aa95551a 100644 --- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c +++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c @@ -28,6 +28,7 @@ #include <numaif.h> #endif #include <linux/falloc.h> +#include <linux/mman.h> /* for hugetlb-related mmap flags */ #include <rte_common.h> #include <rte_log.h> @@ -39,6 +40,16 @@ #include "eal_filesystem.h" #include "eal_internal_cfg.h" #include "eal_memalloc.h" +#include "eal_private.h" + +const int anonymous_hugepages_supported = +#ifdef MAP_HUGE_SHIFT + 1; +#define RTE_MAP_HUGE_SHIFT MAP_HUGE_SHIFT +#else + 0; +#define RTE_MAP_HUGE_SHIFT 26 +#endif /* * not all kernel version support fallocate on hugetlbfs, so fall back to @@ -171,32 +182,6 @@ get_file_size(int fd) return st.st_size; } -/* we cannot use rte_memseg_list_walk() here because we will be holding a - * write lock whenever we enter every function in this file, however copying - * the same iteration code everywhere is not ideal as well. so, use a lockless - * copy of memseg list walk here. 
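The eal_log.c hunk above replaces the copy-and-NUL-terminate dance with printf's precision specifier for strings, which bounds how many bytes are read, so the buffer no longer needs to be truncated or NUL-terminated before being handed to syslog(). A self-contained illustration of the idiom:

    #include <stdio.h>

    int main(void)
    {
        const char buf[] = { 'h', 'e', 'l', 'l', 'o', '!', '!' }; /* no NUL */
        size_t size = 5;

        /* "%.*s" reads at most 'size' bytes; prints "hello" */
        printf("%.*s\n", (int)size, buf);
        return 0;
    }

The cast to int matters: the '*' precision argument is an int, while the log write callback receives a size_t.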
- */ -static int -memseg_list_walk_thread_unsafe(rte_memseg_list_walk_t func, void *arg) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - int i, ret = 0; - - for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { - struct rte_memseg_list *msl = &mcfg->memsegs[i]; - - if (msl->base_va == NULL) - continue; - - ret = func(msl, arg); - if (ret < 0) - return -1; - if (ret > 0) - return 1; - } - return 0; -} - /* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */ static int lock(int fd, int type) { @@ -486,45 +471,84 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id, int cur_socket_id = 0; #endif uint64_t map_offset; + rte_iova_t iova; + void *va; char path[PATH_MAX]; int ret = 0; int fd; size_t alloc_sz; - - /* takes out a read lock on segment or segment list */ - fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx); - if (fd < 0) { - RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n"); - return -1; - } + int flags; + void *new_addr; alloc_sz = hi->hugepage_sz; - if (internal_config.single_file_segments) { - map_offset = seg_idx * alloc_sz; - ret = resize_hugefile(fd, path, list_idx, seg_idx, map_offset, - alloc_sz, true); - if (ret < 0) - goto resized; - } else { + if (!internal_config.single_file_segments && + internal_config.in_memory && + anonymous_hugepages_supported) { + int log2, flags; + + log2 = rte_log2_u32(alloc_sz); + /* as per mmap() manpage, all page sizes are log2 of page size + * shifted by MAP_HUGE_SHIFT + */ + flags = (log2 << RTE_MAP_HUGE_SHIFT) | MAP_HUGETLB | MAP_FIXED | + MAP_PRIVATE | MAP_ANONYMOUS; + fd = -1; + va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, flags, -1, 0); + + /* single-file segments codepath will never be active because + * in-memory mode is incompatible with it and it's stopped at + * EAL initialization stage, however the compiler doesn't know + * that and complains about map_offset being used uninitialized + * on failure codepaths while having in-memory mode enabled. so, + * assign a value here. + */ map_offset = 0; - if (ftruncate(fd, alloc_sz) < 0) { - RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n", - __func__, strerror(errno)); - goto resized; + } else { + /* takes out a read lock on segment or segment list */ + fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx); + if (fd < 0) { + RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n"); + return -1; } - } - /* - * map the segment, and populate page tables, the kernel fills this - * segment with zeros if it's a new page. - */ - void *va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, map_offset); + if (internal_config.single_file_segments) { + map_offset = seg_idx * alloc_sz; + ret = resize_hugefile(fd, path, list_idx, seg_idx, + map_offset, alloc_sz, true); + if (ret < 0) + goto resized; + } else { + map_offset = 0; + if (ftruncate(fd, alloc_sz) < 0) { + RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n", + __func__, strerror(errno)); + goto resized; + } + if (internal_config.hugepage_unlink) { + if (unlink(path)) { + RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n", + __func__, strerror(errno)); + goto resized; + } + } + } + + /* + * map the segment, and populate page tables, the kernel fills + * this segment with zeros if it's a new page. 
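The new in-memory branch in alloc_seg() above allocates hugepages anonymously by encoding the page size into the mmap() flags; per mmap(2), the log2 of the desired hugepage size goes in the flag bits starting at MAP_HUGE_SHIFT. A minimal sketch of that flag construction (2 MiB is chosen arbitrarily here; it requires a kernel with MAP_HUGE_SHIFT support and free hugepages of that size):

    #include <sys/mman.h>
    #include <linux/mman.h> /* MAP_HUGE_SHIFT, as in the hunk above */

    static void *
    map_anon_hugepage(void)
    {
        size_t sz = 2UL << 20; /* 2 MiB; the EAL derives this per page size */
        int log2_sz = 21;      /* computed with rte_log2_u32() in the EAL */
        int flags = (log2_sz << MAP_HUGE_SHIFT) |
                MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS;

        /* fd -1, offset 0: anonymous mapping, no hugetlbfs file involved */
        return mmap(NULL, sz, PROT_READ | PROT_WRITE, flags, -1, 0);
    }

Because no file backs the mapping, nothing ever appears in a hugetlbfs mount, which is the point of --in-memory mode.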
+ */ + va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, + map_offset); + } if (va == MAP_FAILED) { RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__, strerror(errno)); - goto resized; + /* mmap failed, but the previous region might have been + * unmapped anyway. try to remap it + */ + goto unmapped; } if (va != addr) { RTE_LOG(DEBUG, EAL, "%s(): wrong mmap() address\n", __func__); @@ -532,24 +556,6 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id, goto resized; } - rte_iova_t iova = rte_mem_virt2iova(addr); - if (iova == RTE_BAD_PHYS_ADDR) { - RTE_LOG(DEBUG, EAL, "%s(): can't get IOVA addr\n", - __func__); - goto mapped; - } - -#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES - move_pages(getpid(), 1, &addr, NULL, &cur_socket_id, 0); - - if (cur_socket_id != socket_id) { - RTE_LOG(DEBUG, EAL, - "%s(): allocation happened on wrong socket (wanted %d, got %d)\n", - __func__, socket_id, cur_socket_id); - goto mapped; - } -#endif - /* In linux, hugetlb limitations, like cgroup, are * enforced at fault time instead of mmap(), even * with the option of MAP_POPULATE. Kernel will send @@ -562,9 +568,6 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id, (unsigned int)(alloc_sz >> 20)); goto mapped; } - /* for non-single file segments, we can close fd here */ - if (!internal_config.single_file_segments) - close(fd); /* we need to trigger a write to the page to enforce page fault and * ensure that page is accessible to us, but we can't overwrite value @@ -573,6 +576,28 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id, */ *(volatile int *)addr = *(volatile int *)addr; + iova = rte_mem_virt2iova(addr); + if (iova == RTE_BAD_PHYS_ADDR) { + RTE_LOG(DEBUG, EAL, "%s(): can't get IOVA addr\n", + __func__); + goto mapped; + } + +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + move_pages(getpid(), 1, &addr, NULL, &cur_socket_id, 0); + + if (cur_socket_id != socket_id) { + RTE_LOG(DEBUG, EAL, + "%s(): allocation happened on wrong socket (wanted %d, got %d)\n", + __func__, socket_id, cur_socket_id); + goto mapped; + } +#endif + /* for non-single file segments that aren't in-memory, we can close fd + * here */ + if (!internal_config.single_file_segments && !internal_config.in_memory) + close(fd); + ms->addr = addr; ms->hugepage_sz = alloc_sz; ms->len = alloc_sz; @@ -585,14 +610,32 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id, mapped: munmap(addr, alloc_sz); +unmapped: + flags = MAP_FIXED; +#ifdef RTE_ARCH_PPC_64 + flags |= MAP_HUGETLB; +#endif + new_addr = eal_get_virtual_area(addr, &alloc_sz, alloc_sz, 0, flags); + if (new_addr != addr) { + if (new_addr != NULL) + munmap(new_addr, alloc_sz); + /* we're leaving a hole in our virtual address space. if + * somebody else maps this hole now, we could accidentally + * override it in the future. 
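The unmapped: label above exists because a failed mmap() may still have destroyed the previous reservation at that address, leaving a hole in the EAL's pre-reserved VA area; if other code maps into that hole, a later MAP_FIXED allocation would silently clobber it. The generic idiom for reserving (not committing) address space looks roughly like the sketch below; the real eal_get_virtual_area() is an internal helper with its own flags and bookkeeping, so treat this as an approximation of the technique only.

    #include <stddef.h>
    #include <sys/mman.h>

    /* re-reserve a just-unmapped range so nothing else can land in it */
    static int
    reserve_hole(void *addr, size_t len)
    {
        void *p = mmap(addr, len, PROT_NONE,
                MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
        return p == addr ? 0 : -1;
    }

If even this fails, the code above logs at CRIT level, since the VA layout can no longer be guaranteed.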
+ */ + RTE_LOG(CRIT, EAL, "Can't mmap holes in our virtual address space\n"); + } resized: + /* in-memory mode will never be single-file-segments mode */ if (internal_config.single_file_segments) { resize_hugefile(fd, path, list_idx, seg_idx, map_offset, alloc_sz, false); /* ignore failure, can't make it any worse */ } else { /* only remove file if we can take out a write lock */ - if (lock(fd, LOCK_EX) == 1) + if (internal_config.hugepage_unlink == 0 && + internal_config.in_memory == 0 && + lock(fd, LOCK_EX) == 1) unlink(path); close(fd); } @@ -617,6 +660,12 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi, return -1; } + /* if we've already unlinked the page, nothing needs to be done */ + if (internal_config.hugepage_unlink) { + memset(ms, 0, sizeof(*ms)); + return 0; + } + /* if we are not in single file segments mode, we're going to unmap the * segment and thus drop the lock on original fd, but hugepage dir is * now locked so we can take out another one without races. @@ -695,7 +744,7 @@ alloc_seg_walk(const struct rte_memseg_list *msl, void *arg) * during init, we already hold a write lock, so don't try to take out * another one. */ - if (wa->hi->lock_descriptor == -1) { + if (wa->hi->lock_descriptor == -1 && !internal_config.in_memory) { dir_fd = open(wa->hi->hugedir, O_RDONLY); if (dir_fd < 0) { RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", @@ -799,7 +848,7 @@ free_seg_walk(const struct rte_memseg_list *msl, void *arg) * during init, we already hold a write lock, so don't try to take out * another one. */ - if (wa->hi->lock_descriptor == -1) { + if (wa->hi->lock_descriptor == -1 && !internal_config.in_memory) { dir_fd = open(wa->hi->hugedir, O_RDONLY); if (dir_fd < 0) { RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", @@ -878,7 +927,8 @@ eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz, wa.socket = socket; wa.segs_allocated = 0; - ret = memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa); + /* memalloc is locked, so it's safe to use thread-unsafe version */ + ret = rte_memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa); if (ret == 0) { RTE_LOG(ERR, EAL, "%s(): couldn't find suitable memseg_list\n", __func__); @@ -943,7 +993,10 @@ eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs) wa.ms = cur; wa.hi = hi; - walk_res = memseg_list_walk_thread_unsafe(free_seg_walk, &wa); + /* memalloc is locked, so it's safe to use thread-unsafe version + */ + walk_res = rte_memseg_list_walk_thread_unsafe(free_seg_walk, + &wa); if (walk_res == 1) continue; if (walk_res == 0) @@ -1230,7 +1283,8 @@ eal_memalloc_sync_with_primary(void) if (rte_eal_process_type() == RTE_PROC_PRIMARY) return 0; - if (memseg_list_walk_thread_unsafe(sync_walk, NULL)) + /* memalloc is locked, so it's safe to call thread-unsafe version */ + if (rte_memseg_list_walk_thread_unsafe(sync_walk, NULL)) return -1; return 0; } diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c index c917de1c..dbf19499 100644 --- a/lib/librte_eal/linuxapp/eal/eal_memory.c +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c @@ -66,7 +66,7 @@ static bool phys_addrs_available = true; static void test_phys_addrs_available(void) { - uint64_t tmp; + uint64_t tmp = 0; phys_addr_t physaddr; if (!rte_eal_has_hugepages()) { @@ -521,7 +521,18 @@ static void * create_shared_memory(const char *filename, const size_t mem_size) { void *retval; - int fd = open(filename, O_CREAT | O_RDWR, 0666); + int fd; + + /* if no shared files mode is used, create anonymous memory 
instead */ + if (internal_config.no_shconf) { + retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (retval == MAP_FAILED) + return NULL; + return retval; + } + + fd = open(filename, O_CREAT | O_RDWR, 0666); if (fd < 0) return NULL; if (ftruncate(fd, mem_size) < 0) { @@ -767,6 +778,34 @@ remap_segment(struct hugepage_file *hugepages, int seg_start, int seg_end) return 0; } +static uint64_t +get_mem_amount(uint64_t page_sz, uint64_t max_mem) +{ + uint64_t area_sz, max_pages; + + /* limit to RTE_MAX_MEMSEG_PER_LIST pages or RTE_MAX_MEM_MB_PER_LIST */ + max_pages = RTE_MAX_MEMSEG_PER_LIST; + max_mem = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20, max_mem); + + area_sz = RTE_MIN(page_sz * max_pages, max_mem); + + /* make sure the list isn't smaller than the page size */ + area_sz = RTE_MAX(area_sz, page_sz); + + return RTE_ALIGN(area_sz, page_sz); +} + +static int +free_memseg_list(struct rte_memseg_list *msl) +{ + if (rte_fbarray_destroy(&msl->memseg_arr)) { + RTE_LOG(ERR, EAL, "Cannot destroy memseg list\n"); + return -1; + } + memset(msl, 0, sizeof(*msl)); + return 0; +} + #define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i" static int alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz, @@ -1049,8 +1088,7 @@ get_socket_mem_size(int socket) for (i = 0; i < internal_config.num_hugepage_sizes; i++){ struct hugepage_info *hpi = &internal_config.hugepage_info[i]; - if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0) - size += hpi->hugepage_sz * hpi->num_pages[socket]; + size += hpi->hugepage_sz * hpi->num_pages[socket]; } return size; @@ -1605,6 +1643,15 @@ hugepage_count_walk(const struct rte_memseg_list *msl, void *arg) } static int +limits_callback(int socket_id, size_t cur_limit, size_t new_len) +{ + RTE_SET_USED(socket_id); + RTE_SET_USED(cur_limit); + RTE_SET_USED(new_len); + return -1; +} + +static int eal_hugepage_init(void) { struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES]; @@ -1687,6 +1734,18 @@ eal_hugepage_init(void) free(pages); } } + /* if socket limits were specified, set them */ + if (internal_config.force_socket_limits) { + unsigned int i; + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) { + uint64_t limit = internal_config.socket_limit[i]; + if (limit == 0) + continue; + if (rte_mem_alloc_validator_register("socket-limit", + limits_callback, i, limit)) + RTE_LOG(ERR, EAL, "Failed to register socket limits validator callback\n"); + } + } return 0; } @@ -1840,3 +1899,316 @@ rte_eal_using_phys_addrs(void) { return phys_addrs_available; } + +static int __rte_unused +memseg_primary_init_32(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int active_sockets, hpi_idx, msl_idx = 0; + unsigned int socket_id, i; + struct rte_memseg_list *msl; + uint64_t extra_mem_per_socket, total_extra_mem, total_requested_mem; + uint64_t max_mem; + + /* no-huge does not need this at all */ + if (internal_config.no_hugetlbfs) + return 0; + + /* this is a giant hack, but desperate times call for desperate + * measures. in legacy 32-bit mode, we cannot preallocate VA space, + * because having upwards of 2 gigabytes of VA space already mapped will + * interfere with our ability to map and sort hugepages. + * + * therefore, in legacy 32-bit mode, we will be initializing memseg + * lists much later - in eal_memory.c, right after we unmap all the + * unneeded pages. this will not affect secondary processes, as those + * should be able to mmap the space without (too many) problems. 
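To make get_mem_amount() above concrete: a memseg list is sized as the smaller of page_sz * RTE_MAX_MEMSEG_PER_LIST and RTE_MAX_MEM_MB_PER_LIST (further clamped by max_mem and rounded to the page size). Assuming the stock build-config values of RTE_MAX_MEMSEG_PER_LIST = 8192 and RTE_MAX_MEM_MB_PER_LIST = 32768 (an assumption; both are configurable), a list of 2 MiB pages covers min(8192 * 2 MiB, 32 GiB) = 16 GiB, while a list of 1 GiB pages covers min(8192 * 1 GiB, 32 GiB) = 32 GiB.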
+ */ + if (internal_config.legacy_mem) + return 0; + + /* 32-bit mode is a very special case. we cannot know in advance where + * the user will want to allocate their memory, so we have to do some + * heuristics. + */ + active_sockets = 0; + total_requested_mem = 0; + if (internal_config.force_sockets) + for (i = 0; i < rte_socket_count(); i++) { + uint64_t mem; + + socket_id = rte_socket_id_by_idx(i); + mem = internal_config.socket_mem[socket_id]; + + if (mem == 0) + continue; + + active_sockets++; + total_requested_mem += mem; + } + else + total_requested_mem = internal_config.memory; + + max_mem = (uint64_t)RTE_MAX_MEM_MB << 20; + if (total_requested_mem > max_mem) { + RTE_LOG(ERR, EAL, "Invalid parameters: 32-bit process can at most use %uM of memory\n", + (unsigned int)(max_mem >> 20)); + return -1; + } + total_extra_mem = max_mem - total_requested_mem; + extra_mem_per_socket = active_sockets == 0 ? total_extra_mem : + total_extra_mem / active_sockets; + + /* the allocation logic is a little bit convoluted, but here's how it + * works, in a nutshell: + * - if user hasn't specified on which sockets to allocate memory via + * --socket-mem, we allocate all of our memory on master core socket. + * - if user has specified sockets to allocate memory on, there may be + * some "unused" memory left (e.g. if user has specified --socket-mem + * such that not all memory adds up to 2 gigabytes), so add it to all + * sockets that are in use equally. + * + * page sizes are sorted by size in descending order, so we can safely + * assume that we dispense with bigger page sizes first. + */ + + /* create memseg lists */ + for (i = 0; i < rte_socket_count(); i++) { + int hp_sizes = (int) internal_config.num_hugepage_sizes; + uint64_t max_socket_mem, cur_socket_mem; + unsigned int master_lcore_socket; + struct rte_config *cfg = rte_eal_get_configuration(); + bool skip; + + socket_id = rte_socket_id_by_idx(i); + +#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES + if (socket_id > 0) + break; +#endif + + /* if we didn't specifically request memory on this socket */ + skip = active_sockets != 0 && + internal_config.socket_mem[socket_id] == 0; + /* ...or if we didn't specifically request memory on *any* + * socket, and this is not master lcore + */ + master_lcore_socket = rte_lcore_to_socket_id(cfg->master_lcore); + skip |= active_sockets == 0 && socket_id != master_lcore_socket; + + if (skip) { + RTE_LOG(DEBUG, EAL, "Will not preallocate memory on socket %u\n", + socket_id); + continue; + } + + /* max amount of memory on this socket */ + max_socket_mem = (active_sockets != 0 ? 
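A worked example of the 32-bit heuristic above, assuming RTE_MAX_MEM_MB is 2048 in the 32-bit build config (an assumption for the arithmetic): with --socket-mem=256,256 the code computes total_requested_mem = 512 MiB, total_extra_mem = 2048 - 512 = 1536 MiB, and, with two active sockets, extra_mem_per_socket = 768 MiB, so each requesting socket gets up to 256 + 768 = 1024 MiB of VA preallocated. If no --socket-mem is given, all of the budget lands on the master lcore's socket instead.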
+ internal_config.socket_mem[socket_id] : + internal_config.memory) + + extra_mem_per_socket; + cur_socket_mem = 0; + + for (hpi_idx = 0; hpi_idx < hp_sizes; hpi_idx++) { + uint64_t max_pagesz_mem, cur_pagesz_mem = 0; + uint64_t hugepage_sz; + struct hugepage_info *hpi; + int type_msl_idx, max_segs, total_segs = 0; + + hpi = &internal_config.hugepage_info[hpi_idx]; + hugepage_sz = hpi->hugepage_sz; + + /* check if pages are actually available */ + if (hpi->num_pages[socket_id] == 0) + continue; + + max_segs = RTE_MAX_MEMSEG_PER_TYPE; + max_pagesz_mem = max_socket_mem - cur_socket_mem; + + /* make it multiple of page size */ + max_pagesz_mem = RTE_ALIGN_FLOOR(max_pagesz_mem, + hugepage_sz); + + RTE_LOG(DEBUG, EAL, "Attempting to preallocate " + "%" PRIu64 "M on socket %i\n", + max_pagesz_mem >> 20, socket_id); + + type_msl_idx = 0; + while (cur_pagesz_mem < max_pagesz_mem && + total_segs < max_segs) { + uint64_t cur_mem; + unsigned int n_segs; + + if (msl_idx >= RTE_MAX_MEMSEG_LISTS) { + RTE_LOG(ERR, EAL, + "No more space in memseg lists, please increase %s\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); + return -1; + } + + msl = &mcfg->memsegs[msl_idx]; + + cur_mem = get_mem_amount(hugepage_sz, + max_pagesz_mem); + n_segs = cur_mem / hugepage_sz; + + if (alloc_memseg_list(msl, hugepage_sz, n_segs, + socket_id, type_msl_idx)) { + /* failing to allocate a memseg list is + * a serious error. + */ + RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n"); + return -1; + } + + if (alloc_va_space(msl)) { + /* if we couldn't allocate VA space, we + * can try with smaller page sizes. + */ + RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list, retrying with different page size\n"); + /* deallocate memseg list */ + if (free_memseg_list(msl)) + return -1; + break; + } + + total_segs += msl->memseg_arr.len; + cur_pagesz_mem = total_segs * hugepage_sz; + type_msl_idx++; + msl_idx++; + } + cur_socket_mem += cur_pagesz_mem; + } + if (cur_socket_mem == 0) { + RTE_LOG(ERR, EAL, "Cannot allocate VA space on socket %u\n", + socket_id); + return -1; + } + } + + return 0; +} + +static int __rte_unused +memseg_primary_init(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int i, socket_id, hpi_idx, msl_idx = 0; + struct rte_memseg_list *msl; + uint64_t max_mem, total_mem; + + /* no-huge does not need this at all */ + if (internal_config.no_hugetlbfs) + return 0; + + max_mem = (uint64_t)RTE_MAX_MEM_MB << 20; + total_mem = 0; + + /* create memseg lists */ + for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes; + hpi_idx++) { + struct hugepage_info *hpi; + uint64_t hugepage_sz; + + hpi = &internal_config.hugepage_info[hpi_idx]; + hugepage_sz = hpi->hugepage_sz; + + for (i = 0; i < (int) rte_socket_count(); i++) { + uint64_t max_type_mem, total_type_mem = 0; + int type_msl_idx, max_segs, total_segs = 0; + + socket_id = rte_socket_id_by_idx(i); + +#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES + if (socket_id > 0) + break; +#endif + + if (total_mem >= max_mem) + break; + + max_type_mem = RTE_MIN(max_mem - total_mem, + (uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20); + max_segs = RTE_MAX_MEMSEG_PER_TYPE; + + type_msl_idx = 0; + while (total_type_mem < max_type_mem && + total_segs < max_segs) { + uint64_t cur_max_mem, cur_mem; + unsigned int n_segs; + + if (msl_idx >= RTE_MAX_MEMSEG_LISTS) { + RTE_LOG(ERR, EAL, + "No more space in memseg lists, please increase %s\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); + return -1; + } + + msl = &mcfg->memsegs[msl_idx++]; + + cur_max_mem = 
max_type_mem - total_type_mem; + + cur_mem = get_mem_amount(hugepage_sz, + cur_max_mem); + n_segs = cur_mem / hugepage_sz; + + if (alloc_memseg_list(msl, hugepage_sz, n_segs, + socket_id, type_msl_idx)) + return -1; + + total_segs += msl->memseg_arr.len; + total_type_mem = total_segs * hugepage_sz; + type_msl_idx++; + + if (alloc_va_space(msl)) { + RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n"); + return -1; + } + } + total_mem += total_type_mem; + } + } + return 0; +} + +static int +memseg_secondary_init(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int msl_idx = 0; + struct rte_memseg_list *msl; + + for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) { + + msl = &mcfg->memsegs[msl_idx]; + + /* skip empty memseg lists */ + if (msl->memseg_arr.len == 0) + continue; + + if (rte_fbarray_attach(&msl->memseg_arr)) { + RTE_LOG(ERR, EAL, "Cannot attach to primary process memseg lists\n"); + return -1; + } + + /* preallocate VA space */ + if (alloc_va_space(msl)) { + RTE_LOG(ERR, EAL, "Cannot preallocate VA space for hugepage memory\n"); + return -1; + } + } + + return 0; +} + +int +rte_eal_memseg_init(void) +{ + return rte_eal_process_type() == RTE_PROC_PRIMARY ? +#ifndef RTE_ARCH_64 + memseg_primary_init_32() : +#else + memseg_primary_init() : +#endif + memseg_secondary_init(); +} diff --git a/lib/librte_eal/linuxapp/eal/eal_thread.c b/lib/librte_eal/linuxapp/eal/eal_thread.c index f652ff98..b496fc71 100644 --- a/lib/librte_eal/linuxapp/eal/eal_thread.c +++ b/lib/librte_eal/linuxapp/eal/eal_thread.c @@ -176,7 +176,7 @@ int rte_sys_gettid(void) int rte_thread_setname(pthread_t id, const char *name) { - int ret = -1; + int ret = ENOSYS; #if defined(__GLIBC__) && defined(__GLIBC_PREREQ) #if __GLIBC_PREREQ(2, 12) ret = pthread_setname_np(id, name); @@ -184,5 +184,5 @@ int rte_thread_setname(pthread_t id, const char *name) #endif RTE_SET_USED(id); RTE_SET_USED(name); - return ret; + return -ret; } diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c index a2bbdfbf..c68dc38e 100644 --- a/lib/librte_eal/linuxapp/eal/eal_vfio.c +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c @@ -87,42 +87,6 @@ static const struct vfio_iommu_type iommu_types[] = { }, }; -/* for sPAPR IOMMU, we will need to walk memseg list, but we cannot use - * rte_memseg_walk() because by the time we enter callback we will be holding a - * write lock, so regular rte-memseg_walk will deadlock. copying the same - * iteration code everywhere is not ideal as well. so, use a lockless copy of - * memseg walk here. 
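The eal_thread.c hunk above also changes rte_thread_setname() to the negative-errno convention: 0 on success, and -ENOSYS (rather than a bare -1) when glibc predates 2.12 and pthread_setname_np() is unavailable. A short sketch of how a caller can now tell "unsupported" apart from a real failure:

    #include <errno.h>
    #include <pthread.h>
    #include <rte_lcore.h> /* declares rte_thread_setname() */

    static void
    name_worker_thread(void)
    {
        int ret = rte_thread_setname(pthread_self(), "worker");

        if (ret == -ENOSYS)
            return; /* platform cannot name threads; safe to ignore */
        if (ret < 0)
            return; /* pthread_setname_np() failed, e.g. name too long */
    }

pthread_setname_np() itself returns a positive errno value, which is why the function now negates its result before returning.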
- */ -static int -memseg_walk_thread_unsafe(rte_memseg_walk_t func, void *arg) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - int i, ms_idx, ret = 0; - - for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { - struct rte_memseg_list *msl = &mcfg->memsegs[i]; - const struct rte_memseg *ms; - struct rte_fbarray *arr; - - if (msl->memseg_arr.count == 0) - continue; - - arr = &msl->memseg_arr; - - ms_idx = rte_fbarray_find_next_used(arr, 0); - while (ms_idx >= 0) { - ms = rte_fbarray_get(arr, ms_idx); - ret = func(msl, ms, arg); - if (ret < 0) - return -1; - if (ret > 0) - return 1; - ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1); - } - } - return 0; -} - static int is_null_map(const struct user_mem_map *map) { @@ -575,10 +539,6 @@ int rte_vfio_clear_group(int vfio_group_fd) { int i; - struct rte_mp_msg mp_req, *mp_rep; - struct rte_mp_reply mp_reply; - struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; - struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; struct vfio_config *vfio_cfg; vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd); @@ -587,40 +547,15 @@ rte_vfio_clear_group(int vfio_group_fd) return -1; } - if (internal_config.process_type == RTE_PROC_PRIMARY) { - - i = get_vfio_group_idx(vfio_group_fd); - if (i < 0) - return -1; - vfio_cfg->vfio_groups[i].group_num = -1; - vfio_cfg->vfio_groups[i].fd = -1; - vfio_cfg->vfio_groups[i].devices = 0; - vfio_cfg->vfio_active_groups--; - return 0; - } - - p->req = SOCKET_CLR_GROUP; - p->group_num = vfio_group_fd; - strcpy(mp_req.name, EAL_VFIO_MP); - mp_req.len_param = sizeof(*p); - mp_req.num_fds = 0; - - if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && - mp_reply.nb_received == 1) { - mp_rep = &mp_reply.msgs[0]; - p = (struct vfio_mp_param *)mp_rep->param; - if (p->result == SOCKET_OK) { - free(mp_reply.msgs); - return 0; - } else if (p->result == SOCKET_NO_FD) - RTE_LOG(ERR, EAL, " BAD VFIO group fd!\n"); - else - RTE_LOG(ERR, EAL, " no such VFIO group fd!\n"); - - free(mp_reply.msgs); - } + i = get_vfio_group_idx(vfio_group_fd); + if (i < 0) + return -1; + vfio_cfg->vfio_groups[i].group_num = -1; + vfio_cfg->vfio_groups[i].fd = -1; + vfio_cfg->vfio_groups[i].devices = 0; + vfio_cfg->vfio_active_groups--; - return -1; + return 0; } int @@ -1357,7 +1292,8 @@ vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, /* check if window size needs to be adjusted */ memset(¶m, 0, sizeof(param)); - if (memseg_walk_thread_unsafe(vfio_spapr_window_size_walk, + /* we're inside a callback so use thread-unsafe version */ + if (rte_memseg_walk_thread_unsafe(vfio_spapr_window_size_walk, ¶m) < 0) { RTE_LOG(ERR, EAL, "Could not get window size\n"); ret = -1; @@ -1386,7 +1322,9 @@ vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, ret = -1; goto out; } - if (memseg_walk_thread_unsafe(vfio_spapr_map_walk, + /* we're inside a callback, so use thread-unsafe version + */ + if (rte_memseg_walk_thread_unsafe(vfio_spapr_map_walk, &vfio_container_fd) < 0) { RTE_LOG(ERR, EAL, "Could not recreate DMA maps\n"); ret = -1; @@ -1624,7 +1562,7 @@ out: return ret; } -int __rte_experimental +int rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len) { if (len == 0) { @@ -1635,7 +1573,7 @@ rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len) return container_dma_map(default_vfio_cfg, vaddr, iova, len); } -int __rte_experimental +int rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len) { if (len == 0) { @@ -1678,7 +1616,7 @@ rte_vfio_noiommu_is_enabled(void) 
return c == 'Y'; } -int __rte_experimental +int rte_vfio_container_create(void) { int i; @@ -1728,7 +1666,7 @@ rte_vfio_container_destroy(int container_fd) return 0; } -int __rte_experimental +int rte_vfio_container_group_bind(int container_fd, int iommu_group_num) { struct vfio_config *vfio_cfg; @@ -1774,11 +1712,11 @@ rte_vfio_container_group_bind(int container_fd, int iommu_group_num) return vfio_group_fd; } -int __rte_experimental +int rte_vfio_container_group_unbind(int container_fd, int iommu_group_num) { struct vfio_config *vfio_cfg; - struct vfio_group *cur_grp; + struct vfio_group *cur_grp = NULL; int i; vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); @@ -1795,7 +1733,7 @@ rte_vfio_container_group_unbind(int container_fd, int iommu_group_num) } /* This should not happen */ - if (i == VFIO_MAX_GROUPS) { + if (i == VFIO_MAX_GROUPS || cur_grp == NULL) { RTE_LOG(ERR, EAL, "Specified group number not found\n"); return -1; } @@ -1813,7 +1751,7 @@ rte_vfio_container_group_unbind(int container_fd, int iommu_group_num) return 0; } -int __rte_experimental +int rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova, uint64_t len) { @@ -1833,7 +1771,7 @@ rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova, return container_dma_map(vfio_cfg, vaddr, iova, len); } -int __rte_experimental +int rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova, uint64_t len) { @@ -1855,14 +1793,14 @@ rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova, #else -int __rte_experimental +int rte_vfio_dma_map(uint64_t __rte_unused vaddr, __rte_unused uint64_t iova, __rte_unused uint64_t len) { return -1; } -int __rte_experimental +int rte_vfio_dma_unmap(uint64_t __rte_unused vaddr, uint64_t __rte_unused iova, __rte_unused uint64_t len) { @@ -1909,7 +1847,7 @@ rte_vfio_clear_group(__rte_unused int vfio_group_fd) return -1; } -int __rte_experimental +int rte_vfio_get_group_num(__rte_unused const char *sysfs_base, __rte_unused const char *dev_addr, __rte_unused int *iommu_group_num) @@ -1917,45 +1855,45 @@ rte_vfio_get_group_num(__rte_unused const char *sysfs_base, return -1; } -int __rte_experimental +int rte_vfio_get_container_fd(void) { return -1; } -int __rte_experimental +int rte_vfio_get_group_fd(__rte_unused int iommu_group_num) { return -1; } -int __rte_experimental +int rte_vfio_container_create(void) { return -1; } -int __rte_experimental +int rte_vfio_container_destroy(__rte_unused int container_fd) { return -1; } -int __rte_experimental +int rte_vfio_container_group_bind(__rte_unused int container_fd, __rte_unused int iommu_group_num) { return -1; } -int __rte_experimental +int rte_vfio_container_group_unbind(__rte_unused int container_fd, __rte_unused int iommu_group_num) { return -1; } -int __rte_experimental +int rte_vfio_container_dma_map(__rte_unused int container_fd, __rte_unused uint64_t vaddr, __rte_unused uint64_t iova, @@ -1964,7 +1902,7 @@ rte_vfio_container_dma_map(__rte_unused int container_fd, return -1; } -int __rte_experimental +int rte_vfio_container_dma_unmap(__rte_unused int container_fd, __rte_unused uint64_t vaddr, __rte_unused uint64_t iova, diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h index e65b1037..68d4750a 100644 --- a/lib/librte_eal/linuxapp/eal/eal_vfio.h +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h @@ -129,7 +129,6 @@ int vfio_mp_sync_setup(void); #define SOCKET_REQ_CONTAINER 0x100 #define SOCKET_REQ_GROUP 0x200 -#define 
SOCKET_CLR_GROUP 0x300 #define SOCKET_OK 0x0 #define SOCKET_NO_FD 0x1 #define SOCKET_ERR 0xFF diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c index 9c202bb0..680a24aa 100644 --- a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c +++ b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c @@ -55,14 +55,6 @@ vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer) reply.fds[0] = fd; } break; - case SOCKET_CLR_GROUP: - r->req = SOCKET_CLR_GROUP; - r->group_num = m->group_num; - if (rte_vfio_clear_group(m->group_num) < 0) - r->result = SOCKET_NO_FD; - else - r->result = SOCKET_OK; - break; case SOCKET_REQ_CONTAINER: r->req = SOCKET_REQ_CONTAINER; fd = rte_vfio_get_container_fd(); diff --git a/lib/librte_eal/linuxapp/eal/meson.build b/lib/librte_eal/linuxapp/eal/meson.build index cce37712..6e31c2aa 100644 --- a/lib/librte_eal/linuxapp/eal/meson.build +++ b/lib/librte_eal/linuxapp/eal/meson.build @@ -23,6 +23,7 @@ env_sources = files('eal_alarm.c', 'eal_dev.c', ) +deps += ['kvargs'] if has_libnuma == 1 dpdk_conf.set10('RTE_EAL_NUMA_AWARE_HUGEPAGES', true) endif diff --git a/lib/librte_eal/meson.build b/lib/librte_eal/meson.build index 4aa63e3d..e1fde15d 100644 --- a/lib/librte_eal/meson.build +++ b/lib/librte_eal/meson.build @@ -18,12 +18,13 @@ elif host_machine.system() == 'freebsd' subdir('bsdapp/eal') else - error('unsupported system type @0@'.format(hostmachine.system())) + error('unsupported system type "@0@"'.format(host_machine.system())) endif -version = 7 # the version of the EAL API +version = 8 # the version of the EAL API allow_experimental_apis = true deps += 'compat' +deps += 'kvargs' cflags += '-D_GNU_SOURCE' sources = common_sources + env_sources objs = common_objs + env_objs diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map index f7dd0e7b..344a43d3 100644 --- a/lib/librte_eal/rte_eal_version.map +++ b/lib/librte_eal/rte_eal_version.map @@ -181,7 +181,6 @@ DPDK_17.11 { rte_bus_get_iommu_class; rte_eal_has_pci; rte_eal_iova_mode; - rte_eal_mbuf_default_mempool_ops; rte_eal_using_phys_addrs; rte_eal_vfio_intr_mode; rte_lcore_has_role; @@ -241,25 +240,53 @@ DPDK_18.05 { } DPDK_18.02; +DPDK_18.08 { + global: + + rte_eal_mbuf_user_pool_ops; + rte_uuid_compare; + rte_uuid_is_null; + rte_uuid_parse; + rte_uuid_unparse; + rte_vfio_container_create; + rte_vfio_container_destroy; + rte_vfio_container_dma_map; + rte_vfio_container_dma_unmap; + rte_vfio_container_group_bind; + rte_vfio_container_group_unbind; + rte_vfio_dma_map; + rte_vfio_dma_unmap; + rte_vfio_get_container_fd; + rte_vfio_get_group_fd; + rte_vfio_get_group_num; + +} DPDK_18.05; + EXPERIMENTAL { global: + rte_class_find; + rte_class_find_by_name; + rte_class_register; + rte_class_unregister; rte_ctrl_thread_create; rte_dev_event_callback_register; rte_dev_event_callback_unregister; rte_dev_event_monitor_start; rte_dev_event_monitor_stop; + rte_dev_iterator_init; + rte_dev_iterator_next; rte_devargs_add; rte_devargs_dump; rte_devargs_insert; rte_devargs_next; rte_devargs_parse; + rte_devargs_parsef; rte_devargs_remove; rte_devargs_type_count; rte_eal_cleanup; rte_eal_hotplug_add; rte_eal_hotplug_remove; - rte_eal_mbuf_user_pool_ops; rte_fbarray_attach; rte_fbarray_destroy; rte_fbarray_detach; @@ -269,8 +296,14 @@ EXPERIMENTAL { rte_fbarray_find_next_used; rte_fbarray_find_next_n_free; rte_fbarray_find_next_n_used; + rte_fbarray_find_prev_free; + rte_fbarray_find_prev_used; + rte_fbarray_find_prev_n_free; + 
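The map-file hunk above promotes the rte_vfio_* container API and rte_eal_mbuf_user_pool_ops() out of the EXPERIMENTAL section into the versioned DPDK_18.08 node, matching the removal of __rte_experimental from their definitions elsewhere in this patch. The practical consequence for applications, sketched below (the ALLOW_EXPERIMENTAL_API opt-in is DPDK's standard gate for experimental symbols):

    /* before this release, callers had to opt in to experimental APIs,
     * typically via -DALLOW_EXPERIMENTAL_API on the compiler command line */
    #define ALLOW_EXPERIMENTAL_API
    #include <rte_vfio.h>

    /* from 18.08 on, rte_vfio_dma_map() and the container calls are part
     * of the stable ABI, so the opt-in is no longer needed for them */

Symbols remaining in the EXPERIMENTAL block, such as the new rte_class_* and rte_fbarray_find_prev_* entries, still require the define.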
rte_fbarray_find_prev_n_used; rte_fbarray_find_contig_free; rte_fbarray_find_contig_used; + rte_fbarray_find_rev_contig_free; + rte_fbarray_find_rev_contig_used; rte_fbarray_get; rte_fbarray_init; rte_fbarray_is_used; @@ -286,25 +319,20 @@ EXPERIMENTAL { rte_mem_virt2memseg; rte_mem_virt2memseg_list; rte_memseg_contig_walk; + rte_memseg_contig_walk_thread_unsafe; rte_memseg_list_walk; + rte_memseg_list_walk_thread_unsafe; rte_memseg_walk; + rte_memseg_walk_thread_unsafe; rte_mp_action_register; rte_mp_action_unregister; rte_mp_reply; rte_mp_request_sync; rte_mp_request_async; rte_mp_sendmsg; + rte_service_lcore_attr_get; + rte_service_lcore_attr_reset_all; + rte_service_may_be_active; rte_socket_count; rte_socket_id_by_idx; - rte_vfio_dma_map; - rte_vfio_dma_unmap; - rte_vfio_get_container_fd; - rte_vfio_get_group_fd; - rte_vfio_get_group_num; - rte_vfio_container_create; - rte_vfio_container_destroy; - rte_vfio_container_dma_map; - rte_vfio_container_dma_unmap; - rte_vfio_container_group_bind; - rte_vfio_container_group_unbind; }; diff --git a/lib/librte_ethdev/Makefile b/lib/librte_ethdev/Makefile index c2f2f7d8..0935a275 100644 --- a/lib/librte_ethdev/Makefile +++ b/lib/librte_ethdev/Makefile @@ -16,7 +16,7 @@ LDLIBS += -lrte_mbuf EXPORT_MAP := rte_ethdev_version.map -LIBABIVER := 9 +LIBABIVER := 10 SRCS-y += rte_ethdev.c SRCS-y += rte_flow.c diff --git a/lib/librte_ethdev/meson.build b/lib/librte_ethdev/meson.build index aed5d226..596cd0f3 100644 --- a/lib/librte_ethdev/meson.build +++ b/lib/librte_ethdev/meson.build @@ -2,7 +2,7 @@ # Copyright(c) 2017 Intel Corporation name = 'ethdev' -version = 9 +version = 10 allow_experimental_apis = true sources = files('ethdev_profile.c', 'rte_ethdev.c', diff --git a/lib/librte_ethdev/rte_ethdev.c b/lib/librte_ethdev/rte_ethdev.c index cd4bfd3c..4c320250 100644 --- a/lib/librte_ethdev/rte_ethdev.c +++ b/lib/librte_ethdev/rte_ethdev.c @@ -42,10 +42,7 @@ #include "rte_ethdev_driver.h" #include "ethdev_profile.h" -static int ethdev_logtype; - -#define ethdev_log(level, fmt, ...) 
\ - rte_log(RTE_LOG_ ## level, ethdev_logtype, fmt "\n", ## __VA_ARGS__) +int rte_eth_dev_logtype; static const char *MZ_RTE_ETH_DEV_DATA = "rte_eth_dev_data"; struct rte_eth_dev rte_eth_devices[RTE_MAX_ETHPORTS]; @@ -129,6 +126,7 @@ static const struct { RTE_RX_OFFLOAD_BIT2STR(SCATTER), RTE_RX_OFFLOAD_BIT2STR(TIMESTAMP), RTE_RX_OFFLOAD_BIT2STR(SECURITY), + RTE_RX_OFFLOAD_BIT2STR(KEEP_CRC), }; #undef RTE_RX_OFFLOAD_BIT2STR @@ -303,14 +301,16 @@ rte_eth_dev_allocate(const char *name) rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock); if (_rte_eth_dev_allocated(name) != NULL) { - ethdev_log(ERR, "Ethernet device with name %s already allocated", - name); + RTE_ETHDEV_LOG(ERR, + "Ethernet device with name %s already allocated\n", + name); goto unlock; } port_id = rte_eth_dev_find_free_port(); if (port_id == RTE_MAX_ETHPORTS) { - ethdev_log(ERR, "Reached maximum number of Ethernet ports"); + RTE_ETHDEV_LOG(ERR, + "Reached maximum number of Ethernet ports\n"); goto unlock; } @@ -346,8 +346,8 @@ rte_eth_dev_attach_secondary(const char *name) break; } if (i == RTE_MAX_ETHPORTS) { - RTE_PMD_DEBUG_TRACE( - "device %s is not driven by the primary process\n", + RTE_ETHDEV_LOG(ERR, + "Device %s is not driven by the primary process\n", name); } else { eth_dev = eth_dev_get(i); @@ -394,7 +394,8 @@ rte_eth_is_valid_owner_id(uint64_t owner_id) { if (owner_id == RTE_ETH_DEV_NO_OWNER || rte_eth_dev_shared_data->next_owner_id <= owner_id) { - RTE_PMD_DEBUG_TRACE("Invalid owner_id=%016"PRIX64".\n", owner_id); + RTE_ETHDEV_LOG(ERR, "Invalid owner_id=%016"PRIx64"\n", + owner_id); return 0; } return 1; @@ -437,7 +438,8 @@ _rte_eth_dev_owner_set(const uint16_t port_id, const uint64_t old_owner_id, int sret; if (port_id >= RTE_MAX_ETHPORTS || !is_allocated(ethdev)) { - RTE_PMD_DEBUG_TRACE("Port id %"PRIu16" is not allocated.\n", port_id); + RTE_ETHDEV_LOG(ERR, "Port id %"PRIu16" is not allocated\n", + port_id); return -ENODEV; } @@ -447,22 +449,22 @@ _rte_eth_dev_owner_set(const uint16_t port_id, const uint64_t old_owner_id, port_owner = &rte_eth_devices[port_id].data->owner; if (port_owner->id != old_owner_id) { - RTE_PMD_DEBUG_TRACE("Cannot set owner to port %d already owned" - " by %s_%016"PRIX64".\n", port_id, - port_owner->name, port_owner->id); + RTE_ETHDEV_LOG(ERR, + "Cannot set owner to port %u already owned by %s_%016"PRIX64"\n", + port_id, port_owner->name, port_owner->id); return -EPERM; } sret = snprintf(port_owner->name, RTE_ETH_MAX_OWNER_NAME_LEN, "%s", new_owner->name); if (sret < 0 || sret >= RTE_ETH_MAX_OWNER_NAME_LEN) - RTE_PMD_DEBUG_TRACE("Port %d owner name was truncated.\n", - port_id); + RTE_ETHDEV_LOG(ERR, "Port %u owner name was truncated\n", + port_id); port_owner->id = new_owner->id; - RTE_PMD_DEBUG_TRACE("Port %d owner is %s_%016"PRIX64".\n", port_id, - new_owner->name, new_owner->id); + RTE_ETHDEV_LOG(DEBUG, "Port %u owner is %s_%016"PRIx64"\n", + port_id, new_owner->name, new_owner->id); return 0; } @@ -514,8 +516,9 @@ rte_eth_dev_owner_delete(const uint64_t owner_id) if (rte_eth_devices[port_id].data->owner.id == owner_id) memset(&rte_eth_devices[port_id].data->owner, 0, sizeof(struct rte_eth_dev_owner)); - RTE_PMD_DEBUG_TRACE("All port owners owned by %016"PRIX64 - " identifier have removed.\n", owner_id); + RTE_ETHDEV_LOG(ERR, + "All port owners owned by %016"PRIx64" identifier have removed\n", + owner_id); } rte_spinlock_unlock(&rte_eth_dev_shared_data->ownership_lock); @@ -532,7 +535,8 @@ rte_eth_dev_owner_get(const uint16_t port_id, struct rte_eth_dev_owner *owner) 
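The hunk above drops the file-local ethdev_log() macro in favour of the shared RTE_ETHDEV_LOG(), keyed off the now-global rte_eth_dev_logtype. The macro itself lives in rte_ethdev_driver.h and is not shown in this diff, so the following is a hedged reconstruction of the usual dynamic-logtype pattern it follows, not a verbatim copy:

    #include <rte_eal.h>
    #include <rte_log.h>

    extern int rte_eth_dev_logtype;

    /* hypothetical sketch of the component-log macro pattern */
    #define RTE_ETHDEV_LOG(level, ...) \
        rte_log(RTE_LOG_ ## level, rte_eth_dev_logtype, "" __VA_ARGS__)

    /* the logtype is typically registered once at startup, e.g.: */
    RTE_INIT(ethdev_init_log)
    {
        rte_eth_dev_logtype = rte_log_register("lib.ethdev");
        if (rte_eth_dev_logtype >= 0)
            rte_log_set_level(rte_eth_dev_logtype, RTE_LOG_INFO);
    }

One visible difference from the old macro: callers now supply the trailing "\n" themselves, which is why every converted message in the hunks below ends with an explicit newline.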
rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock); if (port_id >= RTE_MAX_ETHPORTS || !is_allocated(ethdev)) { - RTE_PMD_DEBUG_TRACE("Port id %"PRIu16" is not allocated.\n", port_id); + RTE_ETHDEV_LOG(ERR, "Port id %"PRIu16" is not allocated\n", + port_id); ret = -ENODEV; } else { rte_memcpy(owner, ðdev->data->owner, sizeof(*owner)); @@ -596,7 +600,7 @@ rte_eth_dev_get_name_by_port(uint16_t port_id, char *name) RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL); if (name == NULL) { - RTE_PMD_DEBUG_TRACE("Null pointer is specified\n"); + RTE_ETHDEV_LOG(ERR, "Null pointer is specified\n"); return -EINVAL; } @@ -613,7 +617,7 @@ rte_eth_dev_get_port_by_name(const char *name, uint16_t *port_id) uint32_t pid; if (name == NULL) { - RTE_PMD_DEBUG_TRACE("Null pointer is specified\n"); + RTE_ETHDEV_LOG(ERR, "Null pointer is specified\n"); return -EINVAL; } @@ -654,7 +658,7 @@ rte_eth_dev_attach(const char *devargs, uint16_t *port_id) } /* parse devargs */ - if (rte_devargs_parse(&da, "%s", devargs)) + if (rte_devargs_parse(&da, devargs)) goto err; ret = rte_eal_hotplug_add(da.bus->name, da.name, da.args); @@ -663,7 +667,7 @@ rte_eth_dev_attach(const char *devargs, uint16_t *port_id) /* no point looking at the port count if no port exists */ if (!rte_eth_dev_count_total()) { - ethdev_log(ERR, "No port found for device (%s)", da.name); + RTE_ETHDEV_LOG(ERR, "No port found for device (%s)\n", da.name); ret = -1; goto err; } @@ -698,8 +702,8 @@ rte_eth_dev_detach(uint16_t port_id, char *name __rte_unused) dev_flags = rte_eth_devices[port_id].data->dev_flags; if (dev_flags & RTE_ETH_DEV_BONDED_SLAVE) { - ethdev_log(ERR, - "Port %" PRIu16 " is bonded, cannot detach", port_id); + RTE_ETHDEV_LOG(ERR, + "Port %"PRIu16" is bonded, cannot detach\n", port_id); return -ENOTSUP; } @@ -778,21 +782,22 @@ rte_eth_dev_rx_queue_start(uint16_t port_id, uint16_t rx_queue_id) dev = &rte_eth_devices[port_id]; if (!dev->data->dev_started) { - RTE_PMD_DEBUG_TRACE( - "port %d must be started before start any queue\n", port_id); + RTE_ETHDEV_LOG(ERR, + "Port %u must be started before start any queue\n", + port_id); return -EINVAL; } if (rx_queue_id >= dev->data->nb_rx_queues) { - RTE_PMD_DEBUG_TRACE("Invalid RX queue_id=%d\n", rx_queue_id); + RTE_ETHDEV_LOG(ERR, "Invalid RX queue_id=%u\n", rx_queue_id); return -EINVAL; } RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_start, -ENOTSUP); if (dev->data->rx_queue_state[rx_queue_id] != RTE_ETH_QUEUE_STATE_STOPPED) { - RTE_PMD_DEBUG_TRACE("Queue %" PRIu16" of device with port_id=%" PRIu8 - " already started\n", + RTE_ETHDEV_LOG(INFO, + "Queue %"PRIu16" of device with port_id=%"PRIu16" already started\n", rx_queue_id, port_id); return 0; } @@ -811,15 +816,15 @@ rte_eth_dev_rx_queue_stop(uint16_t port_id, uint16_t rx_queue_id) dev = &rte_eth_devices[port_id]; if (rx_queue_id >= dev->data->nb_rx_queues) { - RTE_PMD_DEBUG_TRACE("Invalid RX queue_id=%d\n", rx_queue_id); + RTE_ETHDEV_LOG(ERR, "Invalid RX queue_id=%u\n", rx_queue_id); return -EINVAL; } RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_stop, -ENOTSUP); if (dev->data->rx_queue_state[rx_queue_id] == RTE_ETH_QUEUE_STATE_STOPPED) { - RTE_PMD_DEBUG_TRACE("Queue %" PRIu16" of device with port_id=%" PRIu8 - " already stopped\n", + RTE_ETHDEV_LOG(INFO, + "Queue %"PRIu16" of device with port_id=%"PRIu16" already stopped\n", rx_queue_id, port_id); return 0; } @@ -837,28 +842,27 @@ rte_eth_dev_tx_queue_start(uint16_t port_id, uint16_t tx_queue_id) dev = &rte_eth_devices[port_id]; if (!dev->data->dev_started) { - 
RTE_PMD_DEBUG_TRACE( - "port %d must be started before start any queue\n", port_id); + RTE_ETHDEV_LOG(ERR, + "Port %u must be started before start any queue\n", + port_id); return -EINVAL; } if (tx_queue_id >= dev->data->nb_tx_queues) { - RTE_PMD_DEBUG_TRACE("Invalid TX queue_id=%d\n", tx_queue_id); + RTE_ETHDEV_LOG(ERR, "Invalid TX queue_id=%u\n", tx_queue_id); return -EINVAL; } RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->tx_queue_start, -ENOTSUP); if (dev->data->tx_queue_state[tx_queue_id] != RTE_ETH_QUEUE_STATE_STOPPED) { - RTE_PMD_DEBUG_TRACE("Queue %" PRIu16" of device with port_id=%" PRIu8 - " already started\n", + RTE_ETHDEV_LOG(INFO, + "Queue %"PRIu16" of device with port_id=%"PRIu16" already started\n", tx_queue_id, port_id); return 0; } - return eth_err(port_id, dev->dev_ops->tx_queue_start(dev, - tx_queue_id)); - + return eth_err(port_id, dev->dev_ops->tx_queue_start(dev, tx_queue_id)); } int @@ -870,15 +874,15 @@ rte_eth_dev_tx_queue_stop(uint16_t port_id, uint16_t tx_queue_id) dev = &rte_eth_devices[port_id]; if (tx_queue_id >= dev->data->nb_tx_queues) { - RTE_PMD_DEBUG_TRACE("Invalid TX queue_id=%d\n", tx_queue_id); + RTE_ETHDEV_LOG(ERR, "Invalid TX queue_id=%u\n", tx_queue_id); return -EINVAL; } RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->tx_queue_stop, -ENOTSUP); if (dev->data->tx_queue_state[tx_queue_id] == RTE_ETH_QUEUE_STATE_STOPPED) { - RTE_PMD_DEBUG_TRACE("Queue %" PRIu16" of device with port_id=%" PRIu8 - " already stopped\n", + RTE_ETHDEV_LOG(INFO, + "Queue %"PRIu16" of device with port_id=%"PRIu16" already stopped\n", tx_queue_id, port_id); return 0; } @@ -970,41 +974,6 @@ rte_eth_speed_bitflag(uint32_t speed, int duplex) } } -/** - * A conversion function from rxmode bitfield API. - */ -static void -rte_eth_convert_rx_offload_bitfield(const struct rte_eth_rxmode *rxmode, - uint64_t *rx_offloads) -{ - uint64_t offloads = 0; - - if (rxmode->header_split == 1) - offloads |= DEV_RX_OFFLOAD_HEADER_SPLIT; - if (rxmode->hw_ip_checksum == 1) - offloads |= DEV_RX_OFFLOAD_CHECKSUM; - if (rxmode->hw_vlan_filter == 1) - offloads |= DEV_RX_OFFLOAD_VLAN_FILTER; - if (rxmode->hw_vlan_strip == 1) - offloads |= DEV_RX_OFFLOAD_VLAN_STRIP; - if (rxmode->hw_vlan_extend == 1) - offloads |= DEV_RX_OFFLOAD_VLAN_EXTEND; - if (rxmode->jumbo_frame == 1) - offloads |= DEV_RX_OFFLOAD_JUMBO_FRAME; - if (rxmode->hw_strip_crc == 1) - offloads |= DEV_RX_OFFLOAD_CRC_STRIP; - if (rxmode->enable_scatter == 1) - offloads |= DEV_RX_OFFLOAD_SCATTER; - if (rxmode->enable_lro == 1) - offloads |= DEV_RX_OFFLOAD_TCP_LRO; - if (rxmode->hw_timestamp == 1) - offloads |= DEV_RX_OFFLOAD_TIMESTAMP; - if (rxmode->security == 1) - offloads |= DEV_RX_OFFLOAD_SECURITY; - - *rx_offloads = offloads; -} - const char * __rte_experimental rte_eth_dev_rx_offload_name(uint64_t offload) { @@ -1071,33 +1040,26 @@ rte_eth_dev_configure(uint16_t port_id, uint16_t nb_rx_q, uint16_t nb_tx_q, } if (nb_rx_q > RTE_MAX_QUEUES_PER_PORT) { - RTE_PMD_DEBUG_TRACE( + RTE_ETHDEV_LOG(ERR, "Number of RX queues requested (%u) is greater than max supported(%d)\n", nb_rx_q, RTE_MAX_QUEUES_PER_PORT); return -EINVAL; } if (nb_tx_q > RTE_MAX_QUEUES_PER_PORT) { - RTE_PMD_DEBUG_TRACE( + RTE_ETHDEV_LOG(ERR, "Number of TX queues requested (%u) is greater than max supported(%d)\n", nb_tx_q, RTE_MAX_QUEUES_PER_PORT); return -EINVAL; } if (dev->data->dev_started) { - RTE_PMD_DEBUG_TRACE( - "port %d must be stopped to allow configuration\n", port_id); + RTE_ETHDEV_LOG(ERR, + "Port %u must be stopped to allow configuration\n", + port_id); return -EBUSY; } - /* - * 
Convert between the offloads API to enable PMDs to support - * only one of them. - */ - if (dev_conf->rxmode.ignore_offload_bitfield == 0) - rte_eth_convert_rx_offload_bitfield( - &dev_conf->rxmode, &local_conf.rxmode.offloads); - /* Copy the dev_conf parameter into the dev structure */ memcpy(&dev->data->dev_conf, &local_conf, sizeof(dev->data->dev_conf)); @@ -1107,28 +1069,28 @@ rte_eth_dev_configure(uint16_t port_id, uint16_t nb_rx_q, uint16_t nb_tx_q, * configured device. */ if (nb_rx_q > dev_info.max_rx_queues) { - RTE_PMD_DEBUG_TRACE("ethdev port_id=%d nb_rx_queues=%d > %d\n", - port_id, nb_rx_q, dev_info.max_rx_queues); + RTE_ETHDEV_LOG(ERR, "Ethdev port_id=%u nb_rx_queues=%u > %u\n", + port_id, nb_rx_q, dev_info.max_rx_queues); return -EINVAL; } if (nb_tx_q > dev_info.max_tx_queues) { - RTE_PMD_DEBUG_TRACE("ethdev port_id=%d nb_tx_queues=%d > %d\n", - port_id, nb_tx_q, dev_info.max_tx_queues); + RTE_ETHDEV_LOG(ERR, "Ethdev port_id=%u nb_tx_queues=%u > %u\n", + port_id, nb_tx_q, dev_info.max_tx_queues); return -EINVAL; } /* Check that the device supports requested interrupts */ if ((dev_conf->intr_conf.lsc == 1) && - (!(dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC))) { - RTE_PMD_DEBUG_TRACE("driver %s does not support lsc\n", - dev->device->driver->name); - return -EINVAL; + (!(dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC))) { + RTE_ETHDEV_LOG(ERR, "Driver %s does not support lsc\n", + dev->device->driver->name); + return -EINVAL; } if ((dev_conf->intr_conf.rmv == 1) && - (!(dev->data->dev_flags & RTE_ETH_DEV_INTR_RMV))) { - RTE_PMD_DEBUG_TRACE("driver %s does not support rmv\n", - dev->device->driver->name); + (!(dev->data->dev_flags & RTE_ETH_DEV_INTR_RMV))) { + RTE_ETHDEV_LOG(ERR, "Driver %s does not support rmv\n", + dev->device->driver->name); return -EINVAL; } @@ -1137,19 +1099,16 @@ rte_eth_dev_configure(uint16_t port_id, uint16_t nb_rx_q, uint16_t nb_tx_q, * length is supported by the configured device. 
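With rte_eth_convert_rx_offload_bitfield() deleted above, the legacy rxmode bit-fields (hw_ip_checksum, hw_vlan_strip, and so on) are no longer translated at configure time; applications are expected to express Rx offloads directly through rxmode.offloads. A minimal configuration sketch (the particular offload flags chosen here are illustrative):

    #include <rte_ethdev.h>

    static struct rte_eth_conf port_conf = {
        .rxmode = {
            /* set DEV_RX_OFFLOAD_* flags directly instead of the
             * removed hw_* bit-fields */
            .offloads = DEV_RX_OFFLOAD_CHECKSUM |
                    DEV_RX_OFFLOAD_VLAN_STRIP,
        },
    };

This pairs with the stricter validation below: requesting an offload the device cannot do is now a hard -EINVAL instead of a logged warning.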
*/ if (local_conf.rxmode.offloads & DEV_RX_OFFLOAD_JUMBO_FRAME) { - if (dev_conf->rxmode.max_rx_pkt_len > - dev_info.max_rx_pktlen) { - RTE_PMD_DEBUG_TRACE("ethdev port_id=%d max_rx_pkt_len %u" - " > max valid value %u\n", - port_id, - (unsigned)dev_conf->rxmode.max_rx_pkt_len, - (unsigned)dev_info.max_rx_pktlen); + if (dev_conf->rxmode.max_rx_pkt_len > dev_info.max_rx_pktlen) { + RTE_ETHDEV_LOG(ERR, + "Ethdev port_id=%u max_rx_pkt_len %u > max valid value %u\n", + port_id, dev_conf->rxmode.max_rx_pkt_len, + dev_info.max_rx_pktlen); return -EINVAL; } else if (dev_conf->rxmode.max_rx_pkt_len < ETHER_MIN_LEN) { - RTE_PMD_DEBUG_TRACE("ethdev port_id=%d max_rx_pkt_len %u" - " < min valid value %u\n", - port_id, - (unsigned)dev_conf->rxmode.max_rx_pkt_len, + RTE_ETHDEV_LOG(ERR, + "Ethdev port_id=%u max_rx_pkt_len %u < min valid value %u\n", + port_id, dev_conf->rxmode.max_rx_pkt_len, (unsigned)ETHER_MIN_LEN); return -EINVAL; } @@ -1164,36 +1123,42 @@ rte_eth_dev_configure(uint16_t port_id, uint16_t nb_rx_q, uint16_t nb_tx_q, /* Any requested offloading must be within its device capabilities */ if ((local_conf.rxmode.offloads & dev_info.rx_offload_capa) != local_conf.rxmode.offloads) { - ethdev_log(ERR, "ethdev port_id=%d requested Rx offloads " - "0x%" PRIx64 " doesn't match Rx offloads " - "capabilities 0x%" PRIx64 " in %s()\n", - port_id, - local_conf.rxmode.offloads, - dev_info.rx_offload_capa, - __func__); - /* Will return -EINVAL in the next release */ + RTE_ETHDEV_LOG(ERR, + "Ethdev port_id=%u requested Rx offloads 0x%"PRIx64" doesn't match Rx offloads " + "capabilities 0x%"PRIx64" in %s()\n", + port_id, local_conf.rxmode.offloads, + dev_info.rx_offload_capa, + __func__); + return -EINVAL; } if ((local_conf.txmode.offloads & dev_info.tx_offload_capa) != local_conf.txmode.offloads) { - ethdev_log(ERR, "ethdev port_id=%d requested Tx offloads " - "0x%" PRIx64 " doesn't match Tx offloads " - "capabilities 0x%" PRIx64 " in %s()\n", - port_id, - local_conf.txmode.offloads, - dev_info.tx_offload_capa, - __func__); - /* Will return -EINVAL in the next release */ + RTE_ETHDEV_LOG(ERR, + "Ethdev port_id=%u requested Tx offloads 0x%"PRIx64" doesn't match Tx offloads " + "capabilities 0x%"PRIx64" in %s()\n", + port_id, local_conf.txmode.offloads, + dev_info.tx_offload_capa, + __func__); + return -EINVAL; + } + + if ((local_conf.rxmode.offloads & DEV_RX_OFFLOAD_CRC_STRIP) && + (local_conf.rxmode.offloads & DEV_RX_OFFLOAD_KEEP_CRC)) { + RTE_ETHDEV_LOG(ERR, + "Port id=%u not allowed to set both CRC STRIP and KEEP CRC offload flags\n", + port_id); + return -EINVAL; } /* Check that device supports requested rss hash functions. 
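The offload capability checks above are now hard failures (-EINVAL) rather than warnings, and DEV_RX_OFFLOAD_CRC_STRIP and the new DEV_RX_OFFLOAD_KEEP_CRC are mutually exclusive. A sketch of an application-side pre-check, assuming the application prefers to fail early with its own diagnostics:

        #include <rte_ethdev.h>

        static int
        rx_offloads_ok(uint16_t port_id, uint64_t wanted)
        {
                struct rte_eth_dev_info dev_info;

                rte_eth_dev_info_get(port_id, &dev_info);
                /* every requested bit must be advertised by the port */
                if ((wanted & dev_info.rx_offload_capa) != wanted)
                        return 0;
                /* CRC_STRIP and KEEP_CRC cannot both be set */
                if ((wanted & DEV_RX_OFFLOAD_CRC_STRIP) &&
                    (wanted & DEV_RX_OFFLOAD_KEEP_CRC))
                        return 0;
                return 1;
        }
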
*/ if ((dev_info.flow_type_rss_offloads | dev_conf->rx_adv_conf.rss_conf.rss_hf) != dev_info.flow_type_rss_offloads) { - RTE_PMD_DEBUG_TRACE("ethdev port_id=%d invalid rss_hf: " - "0x%"PRIx64", valid value: 0x%"PRIx64"\n", - port_id, - dev_conf->rx_adv_conf.rss_conf.rss_hf, - dev_info.flow_type_rss_offloads); + RTE_ETHDEV_LOG(ERR, + "Ethdev port_id=%u invalid rss_hf: 0x%"PRIx64", valid value: 0x%"PRIx64"\n", + port_id, dev_conf->rx_adv_conf.rss_conf.rss_hf, + dev_info.flow_type_rss_offloads); + return -EINVAL; } /* @@ -1201,23 +1166,25 @@ rte_eth_dev_configure(uint16_t port_id, uint16_t nb_rx_q, uint16_t nb_tx_q, */ diag = rte_eth_dev_rx_queue_config(dev, nb_rx_q); if (diag != 0) { - RTE_PMD_DEBUG_TRACE("port%d rte_eth_dev_rx_queue_config = %d\n", - port_id, diag); + RTE_ETHDEV_LOG(ERR, + "Port%u rte_eth_dev_rx_queue_config = %d\n", + port_id, diag); return diag; } diag = rte_eth_dev_tx_queue_config(dev, nb_tx_q); if (diag != 0) { - RTE_PMD_DEBUG_TRACE("port%d rte_eth_dev_tx_queue_config = %d\n", - port_id, diag); + RTE_ETHDEV_LOG(ERR, + "Port%u rte_eth_dev_tx_queue_config = %d\n", + port_id, diag); rte_eth_dev_rx_queue_config(dev, 0); return diag; } diag = (*dev->dev_ops->dev_configure)(dev); if (diag != 0) { - RTE_PMD_DEBUG_TRACE("port%d dev_configure = %d\n", - port_id, diag); + RTE_ETHDEV_LOG(ERR, "Port%u dev_configure = %d\n", + port_id, diag); rte_eth_dev_rx_queue_config(dev, 0); rte_eth_dev_tx_queue_config(dev, 0); return eth_err(port_id, diag); @@ -1226,8 +1193,8 @@ rte_eth_dev_configure(uint16_t port_id, uint16_t nb_rx_q, uint16_t nb_tx_q, /* Initialize Rx profiling if enabled at compilation time. */ diag = __rte_eth_profile_rx_init(port_id, dev); if (diag != 0) { - RTE_PMD_DEBUG_TRACE("port%d __rte_eth_profile_rx_init = %d\n", - port_id, diag); + RTE_ETHDEV_LOG(ERR, "Port%u __rte_eth_profile_rx_init = %d\n", + port_id, diag); rte_eth_dev_rx_queue_config(dev, 0); rte_eth_dev_tx_queue_config(dev, 0); return eth_err(port_id, diag); @@ -1240,8 +1207,7 @@ void _rte_eth_dev_reset(struct rte_eth_dev *dev) { if (dev->data->dev_started) { - RTE_PMD_DEBUG_TRACE( - "port %d must be stopped to allow reset\n", + RTE_ETHDEV_LOG(ERR, "Port %u must be stopped to allow reset\n", dev->data->port_id); return; } @@ -1320,8 +1286,8 @@ rte_eth_dev_start(uint16_t port_id) RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_start, -ENOTSUP); if (dev->data->dev_started != 0) { - RTE_PMD_DEBUG_TRACE("Device with port_id=%" PRIu16 - " already started\n", + RTE_ETHDEV_LOG(INFO, + "Device with port_id=%"PRIu16" already started\n", port_id); return 0; } @@ -1352,8 +1318,8 @@ rte_eth_dev_stop(uint16_t port_id) RTE_FUNC_PTR_OR_RET(*dev->dev_ops->dev_stop); if (dev->data->dev_started == 0) { - RTE_PMD_DEBUG_TRACE("Device with port_id=%" PRIu16 - " already stopped\n", + RTE_ETHDEV_LOG(INFO, + "Device with port_id=%"PRIu16" already stopped\n", port_id); return; } @@ -1465,7 +1431,7 @@ rte_eth_rx_queue_setup(uint16_t port_id, uint16_t rx_queue_id, dev = &rte_eth_devices[port_id]; if (rx_queue_id >= dev->data->nb_rx_queues) { - RTE_PMD_DEBUG_TRACE("Invalid RX queue_id=%d\n", rx_queue_id); + RTE_ETHDEV_LOG(ERR, "Invalid RX queue_id=%u\n", rx_queue_id); return -EINVAL; } @@ -1479,23 +1445,20 @@ rte_eth_rx_queue_setup(uint16_t port_id, uint16_t rx_queue_id, */ rte_eth_dev_info_get(port_id, &dev_info); if (mp->private_data_size < sizeof(struct rte_pktmbuf_pool_private)) { - RTE_PMD_DEBUG_TRACE("%s private_data_size %d < %d\n", - mp->name, (int) mp->private_data_size, - (int) sizeof(struct rte_pktmbuf_pool_private)); + 
RTE_ETHDEV_LOG(ERR, "%s private_data_size %d < %d\n", + mp->name, (int)mp->private_data_size, + (int)sizeof(struct rte_pktmbuf_pool_private)); return -ENOSPC; } mbp_buf_size = rte_pktmbuf_data_room_size(mp); if ((mbp_buf_size - RTE_PKTMBUF_HEADROOM) < dev_info.min_rx_bufsize) { - RTE_PMD_DEBUG_TRACE("%s mbuf_data_room_size %d < %d " - "(RTE_PKTMBUF_HEADROOM=%d + min_rx_bufsize(dev)" - "=%d)\n", - mp->name, - (int)mbp_buf_size, - (int)(RTE_PKTMBUF_HEADROOM + - dev_info.min_rx_bufsize), - (int)RTE_PKTMBUF_HEADROOM, - (int)dev_info.min_rx_bufsize); + RTE_ETHDEV_LOG(ERR, + "%s mbuf_data_room_size %d < %d (RTE_PKTMBUF_HEADROOM=%d + min_rx_bufsize(dev)=%d)\n", + mp->name, (int)mbp_buf_size, + (int)(RTE_PKTMBUF_HEADROOM + dev_info.min_rx_bufsize), + (int)RTE_PKTMBUF_HEADROOM, + (int)dev_info.min_rx_bufsize); return -EINVAL; } @@ -1511,10 +1474,9 @@ rte_eth_rx_queue_setup(uint16_t port_id, uint16_t rx_queue_id, nb_rx_desc < dev_info.rx_desc_lim.nb_min || nb_rx_desc % dev_info.rx_desc_lim.nb_align != 0) { - RTE_PMD_DEBUG_TRACE("Invalid value for nb_rx_desc(=%hu), " - "should be: <= %hu, = %hu, and a product of %hu\n", - nb_rx_desc, - dev_info.rx_desc_lim.nb_max, + RTE_ETHDEV_LOG(ERR, + "Invalid value for nb_rx_desc(=%hu), should be: <= %hu, = %hu, and a product of %hu\n", + nb_rx_desc, dev_info.rx_desc_lim.nb_max, dev_info.rx_desc_lim.nb_min, dev_info.rx_desc_lim.nb_align); return -EINVAL; @@ -1542,14 +1504,6 @@ rte_eth_rx_queue_setup(uint16_t port_id, uint16_t rx_queue_id, rx_conf = &dev_info.default_rxconf; local_conf = *rx_conf; - if (dev->data->dev_conf.rxmode.ignore_offload_bitfield == 0) { - /** - * Reflect port offloads to queue offloads in order for - * offloads to not be discarded. - */ - rte_eth_convert_rx_offload_bitfield(&dev->data->dev_conf.rxmode, - &local_conf.offloads); - } /* * If an offloading has already been enabled in @@ -1571,16 +1525,13 @@ rte_eth_rx_queue_setup(uint16_t port_id, uint16_t rx_queue_id, */ if ((local_conf.offloads & dev_info.rx_queue_offload_capa) != local_conf.offloads) { - ethdev_log(ERR, "Ethdev port_id=%d rx_queue_id=%d, new " - "added offloads 0x%" PRIx64 " must be " - "within pre-queue offload capabilities 0x%" - PRIx64 " in %s()\n", - port_id, - rx_queue_id, - local_conf.offloads, - dev_info.rx_queue_offload_capa, - __func__); - /* Will return -EINVAL in the next release */ + RTE_ETHDEV_LOG(ERR, + "Ethdev port_id=%d rx_queue_id=%d, new added offloads 0x%"PRIx64" must be " + "within pre-queue offload capabilities 0x%"PRIx64" in %s()\n", + port_id, rx_queue_id, local_conf.offloads, + dev_info.rx_queue_offload_capa, + __func__); + return -EINVAL; } ret = (*dev->dev_ops->rx_queue_setup)(dev, rx_queue_id, nb_rx_desc, @@ -1594,55 +1545,6 @@ rte_eth_rx_queue_setup(uint16_t port_id, uint16_t rx_queue_id, return eth_err(port_id, ret); } -/** - * Convert from tx offloads to txq_flags. 
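The data-room check in rte_eth_rx_queue_setup() above requires mbp_buf_size >= RTE_PKTMBUF_HEADROOM + min_rx_bufsize. A sketch of sizing the Rx mempool to satisfy it; the pool name, cache size and mbuf count are arbitrary:

        #include <rte_ethdev.h>
        #include <rte_mbuf.h>

        static struct rte_mempool *
        make_rx_pool(uint16_t port_id, unsigned int nb_mbufs, int socket_id)
        {
                struct rte_eth_dev_info dev_info;
                uint32_t room;

                rte_eth_dev_info_get(port_id, &dev_info);
                /* satisfy the setup-time check, with the usual default floor */
                room = RTE_PKTMBUF_HEADROOM + dev_info.min_rx_bufsize;
                if (room < RTE_MBUF_DEFAULT_BUF_SIZE)
                        room = RTE_MBUF_DEFAULT_BUF_SIZE;
                return rte_pktmbuf_pool_create("rx_pool", nb_mbufs, 256, 0,
                                               (uint16_t)room, socket_id);
        }
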
- */ -static void -rte_eth_convert_tx_offload(const uint64_t tx_offloads, uint32_t *txq_flags) -{ - uint32_t flags = 0; - - if (!(tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS)) - flags |= ETH_TXQ_FLAGS_NOMULTSEGS; - if (!(tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT)) - flags |= ETH_TXQ_FLAGS_NOVLANOFFL; - if (!(tx_offloads & DEV_TX_OFFLOAD_SCTP_CKSUM)) - flags |= ETH_TXQ_FLAGS_NOXSUMSCTP; - if (!(tx_offloads & DEV_TX_OFFLOAD_UDP_CKSUM)) - flags |= ETH_TXQ_FLAGS_NOXSUMUDP; - if (!(tx_offloads & DEV_TX_OFFLOAD_TCP_CKSUM)) - flags |= ETH_TXQ_FLAGS_NOXSUMTCP; - if (tx_offloads & DEV_TX_OFFLOAD_MBUF_FAST_FREE) - flags |= ETH_TXQ_FLAGS_NOREFCOUNT | ETH_TXQ_FLAGS_NOMULTMEMP; - - *txq_flags = flags; -} - -/** - * A conversion function from txq_flags API. - */ -static void -rte_eth_convert_txq_flags(const uint32_t txq_flags, uint64_t *tx_offloads) -{ - uint64_t offloads = 0; - - if (!(txq_flags & ETH_TXQ_FLAGS_NOMULTSEGS)) - offloads |= DEV_TX_OFFLOAD_MULTI_SEGS; - if (!(txq_flags & ETH_TXQ_FLAGS_NOVLANOFFL)) - offloads |= DEV_TX_OFFLOAD_VLAN_INSERT; - if (!(txq_flags & ETH_TXQ_FLAGS_NOXSUMSCTP)) - offloads |= DEV_TX_OFFLOAD_SCTP_CKSUM; - if (!(txq_flags & ETH_TXQ_FLAGS_NOXSUMUDP)) - offloads |= DEV_TX_OFFLOAD_UDP_CKSUM; - if (!(txq_flags & ETH_TXQ_FLAGS_NOXSUMTCP)) - offloads |= DEV_TX_OFFLOAD_TCP_CKSUM; - if ((txq_flags & ETH_TXQ_FLAGS_NOREFCOUNT) && - (txq_flags & ETH_TXQ_FLAGS_NOMULTMEMP)) - offloads |= DEV_TX_OFFLOAD_MBUF_FAST_FREE; - - *tx_offloads = offloads; -} - int rte_eth_tx_queue_setup(uint16_t port_id, uint16_t tx_queue_id, uint16_t nb_tx_desc, unsigned int socket_id, @@ -1657,7 +1559,7 @@ rte_eth_tx_queue_setup(uint16_t port_id, uint16_t tx_queue_id, dev = &rte_eth_devices[port_id]; if (tx_queue_id >= dev->data->nb_tx_queues) { - RTE_PMD_DEBUG_TRACE("Invalid TX queue_id=%d\n", tx_queue_id); + RTE_ETHDEV_LOG(ERR, "Invalid TX queue_id=%u\n", tx_queue_id); return -EINVAL; } @@ -1676,12 +1578,11 @@ rte_eth_tx_queue_setup(uint16_t port_id, uint16_t tx_queue_id, if (nb_tx_desc > dev_info.tx_desc_lim.nb_max || nb_tx_desc < dev_info.tx_desc_lim.nb_min || nb_tx_desc % dev_info.tx_desc_lim.nb_align != 0) { - RTE_PMD_DEBUG_TRACE("Invalid value for nb_tx_desc(=%hu), " - "should be: <= %hu, = %hu, and a product of %hu\n", - nb_tx_desc, - dev_info.tx_desc_lim.nb_max, - dev_info.tx_desc_lim.nb_min, - dev_info.tx_desc_lim.nb_align); + RTE_ETHDEV_LOG(ERR, + "Invalid value for nb_tx_desc(=%hu), should be: <= %hu, = %hu, and a product of %hu\n", + nb_tx_desc, dev_info.tx_desc_lim.nb_max, + dev_info.tx_desc_lim.nb_min, + dev_info.tx_desc_lim.nb_align); return -EINVAL; } @@ -1706,15 +1607,7 @@ rte_eth_tx_queue_setup(uint16_t port_id, uint16_t tx_queue_id, if (tx_conf == NULL) tx_conf = &dev_info.default_txconf; - /* - * Convert between the offloads API to enable PMDs to support - * only one of them. 
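With ETH_TXQ_FLAGS_* and both conversion helpers deleted, a Tx queue is tuned purely through DEV_TX_OFFLOAD_* bits. A sketch of the replacement for the old NOREFCOUNT|NOMULTMEMP flags, assuming the application checks per-queue capabilities first:

        #include <rte_ethdev.h>

        static int
        setup_txq(uint16_t port_id, uint16_t q, uint16_t nb_desc, int socket_id)
        {
                struct rte_eth_dev_info dev_info;
                struct rte_eth_txconf txconf;

                rte_eth_dev_info_get(port_id, &dev_info);
                txconf = dev_info.default_txconf;
                /* old ETH_TXQ_FLAGS_NOREFCOUNT|NOMULTMEMP, expressed as an
                 * offload; only request it where the queue supports it */
                txconf.offloads = 0;
                if (dev_info.tx_queue_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
                        txconf.offloads |= DEV_TX_OFFLOAD_MBUF_FAST_FREE;
                return rte_eth_tx_queue_setup(port_id, q, nb_desc, socket_id,
                                              &txconf);
        }
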
- */ local_conf = *tx_conf; - if (!(tx_conf->txq_flags & ETH_TXQ_FLAGS_IGNORE)) { - rte_eth_convert_txq_flags(tx_conf->txq_flags, - &local_conf.offloads); - } /* * If an offloading has already been enabled in @@ -1736,16 +1629,13 @@ rte_eth_tx_queue_setup(uint16_t port_id, uint16_t tx_queue_id, */ if ((local_conf.offloads & dev_info.tx_queue_offload_capa) != local_conf.offloads) { - ethdev_log(ERR, "Ethdev port_id=%d tx_queue_id=%d, new " - "added offloads 0x%" PRIx64 " must be " - "within pre-queue offload capabilities 0x%" - PRIx64 " in %s()\n", - port_id, - tx_queue_id, - local_conf.offloads, - dev_info.tx_queue_offload_capa, - __func__); - /* Will return -EINVAL in the next release */ + RTE_ETHDEV_LOG(ERR, + "Ethdev port_id=%d tx_queue_id=%d, new added offloads 0x%"PRIx64" must be " + "within pre-queue offload capabilities 0x%"PRIx64" in %s()\n", + port_id, tx_queue_id, local_conf.offloads, + dev_info.tx_queue_offload_capa, + __func__); + return -EINVAL; } return eth_err(port_id, (*dev->dev_ops->tx_queue_setup)(dev, @@ -2009,19 +1899,19 @@ rte_eth_xstats_get_id_by_name(uint16_t port_id, const char *xstat_name, RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); if (!id) { - RTE_PMD_DEBUG_TRACE("Error: id pointer is NULL\n"); + RTE_ETHDEV_LOG(ERR, "Id pointer is NULL\n"); return -ENOMEM; } if (!xstat_name) { - RTE_PMD_DEBUG_TRACE("Error: xstat_name pointer is NULL\n"); + RTE_ETHDEV_LOG(ERR, "xstat_name pointer is NULL\n"); return -ENOMEM; } /* Get count */ cnt_xstats = rte_eth_xstats_get_names_by_id(port_id, NULL, 0, NULL); if (cnt_xstats < 0) { - RTE_PMD_DEBUG_TRACE("Error: Cannot get count of xstats\n"); + RTE_ETHDEV_LOG(ERR, "Cannot get count of xstats\n"); return -ENODEV; } @@ -2030,7 +1920,7 @@ rte_eth_xstats_get_id_by_name(uint16_t port_id, const char *xstat_name, if (cnt_xstats != rte_eth_xstats_get_names_by_id( port_id, xstats_names, cnt_xstats, NULL)) { - RTE_PMD_DEBUG_TRACE("Error: Cannot get xstats lookup\n"); + RTE_ETHDEV_LOG(ERR, "Cannot get xstats lookup\n"); return -1; } @@ -2153,7 +2043,7 @@ rte_eth_xstats_get_names_by_id(uint16_t port_id, sizeof(struct rte_eth_xstat_name)); if (!xstats_names_copy) { - RTE_PMD_DEBUG_TRACE("ERROR: can't allocate memory"); + RTE_ETHDEV_LOG(ERR, "Can't allocate memory\n"); return -ENOMEM; } @@ -2181,7 +2071,7 @@ rte_eth_xstats_get_names_by_id(uint16_t port_id, /* Filter stats */ for (i = 0; i < size; i++) { if (ids[i] >= expected_entries) { - RTE_PMD_DEBUG_TRACE("ERROR: id value isn't valid\n"); + RTE_ETHDEV_LOG(ERR, "Id value isn't valid\n"); free(xstats_names_copy); return -1; } @@ -2366,7 +2256,7 @@ rte_eth_xstats_get_by_id(uint16_t port_id, const uint64_t *ids, /* Filter stats */ for (i = 0; i < size; i++) { if (ids[i] >= expected_entries) { - RTE_PMD_DEBUG_TRACE("ERROR: id value isn't valid\n"); + RTE_ETHDEV_LOG(ERR, "Id value isn't valid\n"); return -1; } values[i] = xstats[ids[i]].value; @@ -2456,6 +2346,16 @@ set_queue_stats_mapping(uint16_t port_id, uint16_t queue_id, uint8_t stat_idx, dev = &rte_eth_devices[port_id]; RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_stats_mapping_set, -ENOTSUP); + + if (is_rx && (queue_id >= dev->data->nb_rx_queues)) + return -EINVAL; + + if (!is_rx && (queue_id >= dev->data->nb_tx_queues)) + return -EINVAL; + + if (stat_idx >= RTE_ETHDEV_QUEUE_STAT_CNTRS) + return -EINVAL; + return (*dev->dev_ops->queue_stats_mapping_set) (dev, queue_id, stat_idx, is_rx); } @@ -2495,7 +2395,6 @@ void rte_eth_dev_info_get(uint16_t port_id, struct rte_eth_dev_info *dev_info) { struct rte_eth_dev *dev; - struct 
rte_eth_txconf *txconf; const struct rte_eth_desc_lim lim = { .nb_max = UINT16_MAX, .nb_min = 0, @@ -2517,9 +2416,6 @@ rte_eth_dev_info_get(uint16_t port_id, struct rte_eth_dev_info *dev_info) dev_info->nb_tx_queues = dev->data->nb_tx_queues; dev_info->dev_flags = &dev->data->dev_flags; - txconf = &dev_info->default_txconf; - /* convert offload to txq_flags to support legacy app */ - rte_eth_convert_tx_offload(txconf->offloads, &txconf->txq_flags); } int @@ -2598,13 +2494,14 @@ rte_eth_dev_vlan_filter(uint16_t port_id, uint16_t vlan_id, int on) dev = &rte_eth_devices[port_id]; if (!(dev->data->dev_conf.rxmode.offloads & DEV_RX_OFFLOAD_VLAN_FILTER)) { - RTE_PMD_DEBUG_TRACE("port %d: vlan-filtering disabled\n", port_id); + RTE_ETHDEV_LOG(ERR, "Port %u: vlan-filtering disabled\n", + port_id); return -ENOSYS; } if (vlan_id > 4095) { - RTE_PMD_DEBUG_TRACE("(port_id=%d) invalid vlan_id=%u > 4095\n", - port_id, (unsigned) vlan_id); + RTE_ETHDEV_LOG(ERR, "Port_id=%u invalid vlan_id=%u > 4095\n", + port_id, vlan_id); return -EINVAL; } RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->vlan_filter_set, -ENOTSUP); @@ -2637,7 +2534,7 @@ rte_eth_dev_set_vlan_strip_on_queue(uint16_t port_id, uint16_t rx_queue_id, RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); dev = &rte_eth_devices[port_id]; if (rx_queue_id >= dev->data->nb_rx_queues) { - RTE_PMD_DEBUG_TRACE("Invalid rx_queue_id=%d\n", port_id); + RTE_ETHDEV_LOG(ERR, "Invalid rx_queue_id=%u\n", rx_queue_id); return -EINVAL; } @@ -2786,7 +2683,7 @@ rte_eth_dev_flow_ctrl_set(uint16_t port_id, struct rte_eth_fc_conf *fc_conf) RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); if ((fc_conf->send_xon != 0) && (fc_conf->send_xon != 1)) { - RTE_PMD_DEBUG_TRACE("Invalid send_xon, only 0/1 allowed\n"); + RTE_ETHDEV_LOG(ERR, "Invalid send_xon, only 0/1 allowed\n"); return -EINVAL; } @@ -2803,7 +2700,7 @@ rte_eth_dev_priority_flow_ctrl_set(uint16_t port_id, RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); if (pfc_conf->priority > (ETH_DCB_NUM_USER_PRIORITIES - 1)) { - RTE_PMD_DEBUG_TRACE("Invalid priority, only 0-7 allowed\n"); + RTE_ETHDEV_LOG(ERR, "Invalid priority, only 0-7 allowed\n"); return -EINVAL; } @@ -2844,7 +2741,7 @@ rte_eth_check_reta_entry(struct rte_eth_rss_reta_entry64 *reta_conf, return -EINVAL; if (max_rxq == 0) { - RTE_PMD_DEBUG_TRACE("No receive queue is available\n"); + RTE_ETHDEV_LOG(ERR, "No receive queue is available\n"); return -EINVAL; } @@ -2853,8 +2750,9 @@ rte_eth_check_reta_entry(struct rte_eth_rss_reta_entry64 *reta_conf, shift = i % RTE_RETA_GROUP_SIZE; if ((reta_conf[idx].mask & (1ULL << shift)) && (reta_conf[idx].reta[shift] >= max_rxq)) { - RTE_PMD_DEBUG_TRACE("reta_conf[%u]->reta[%u]: %u exceeds " - "the maximum rxq index: %u\n", idx, shift, + RTE_ETHDEV_LOG(ERR, + "reta_conf[%u]->reta[%u]: %u exceeds the maximum rxq index: %u\n", + idx, shift, reta_conf[idx].reta[shift], max_rxq); return -EINVAL; } @@ -2923,11 +2821,11 @@ rte_eth_dev_rss_hash_update(uint16_t port_id, rte_eth_dev_info_get(port_id, &dev_info); if ((dev_info.flow_type_rss_offloads | rss_conf->rss_hf) != dev_info.flow_type_rss_offloads) { - RTE_PMD_DEBUG_TRACE("ethdev port_id=%d invalid rss_hf: " - "0x%"PRIx64", valid value: 0x%"PRIx64"\n", - port_id, - rss_conf->rss_hf, - dev_info.flow_type_rss_offloads); + RTE_ETHDEV_LOG(ERR, + "Ethdev port_id=%u invalid rss_hf: 0x%"PRIx64", valid value: 0x%"PRIx64"\n", + port_id, rss_conf->rss_hf, + dev_info.flow_type_rss_offloads); + return -EINVAL; } RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rss_hash_update, -ENOTSUP); return 
eth_err(port_id, (*dev->dev_ops->rss_hash_update)(dev, @@ -2955,12 +2853,12 @@ rte_eth_dev_udp_tunnel_port_add(uint16_t port_id, RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); if (udp_tunnel == NULL) { - RTE_PMD_DEBUG_TRACE("Invalid udp_tunnel parameter\n"); + RTE_ETHDEV_LOG(ERR, "Invalid udp_tunnel parameter\n"); return -EINVAL; } if (udp_tunnel->prot_type >= RTE_TUNNEL_TYPE_MAX) { - RTE_PMD_DEBUG_TRACE("Invalid tunnel type\n"); + RTE_ETHDEV_LOG(ERR, "Invalid tunnel type\n"); return -EINVAL; } @@ -2980,12 +2878,12 @@ rte_eth_dev_udp_tunnel_port_delete(uint16_t port_id, dev = &rte_eth_devices[port_id]; if (udp_tunnel == NULL) { - RTE_PMD_DEBUG_TRACE("Invalid udp_tunnel parameter\n"); + RTE_ETHDEV_LOG(ERR, "Invalid udp_tunnel parameter\n"); return -EINVAL; } if (udp_tunnel->prot_type >= RTE_TUNNEL_TYPE_MAX) { - RTE_PMD_DEBUG_TRACE("Invalid tunnel type\n"); + RTE_ETHDEV_LOG(ERR, "Invalid tunnel type\n"); return -EINVAL; } @@ -3053,12 +2951,12 @@ rte_eth_dev_mac_addr_add(uint16_t port_id, struct ether_addr *addr, RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->mac_addr_add, -ENOTSUP); if (is_zero_ether_addr(addr)) { - RTE_PMD_DEBUG_TRACE("port %d: Cannot add NULL MAC address\n", + RTE_ETHDEV_LOG(ERR, "Port %u: Cannot add NULL MAC address\n", port_id); return -EINVAL; } if (pool >= ETH_64_POOLS) { - RTE_PMD_DEBUG_TRACE("pool id must be 0-%d\n", ETH_64_POOLS - 1); + RTE_ETHDEV_LOG(ERR, "Pool id must be 0-%d\n", ETH_64_POOLS - 1); return -EINVAL; } @@ -3066,7 +2964,7 @@ rte_eth_dev_mac_addr_add(uint16_t port_id, struct ether_addr *addr, if (index < 0) { index = get_mac_addr_index(port_id, &null_mac_addr); if (index < 0) { - RTE_PMD_DEBUG_TRACE("port %d: MAC address array full\n", + RTE_ETHDEV_LOG(ERR, "Port %u: MAC address array full\n", port_id); return -ENOSPC; } @@ -3104,7 +3002,9 @@ rte_eth_dev_mac_addr_remove(uint16_t port_id, struct ether_addr *addr) index = get_mac_addr_index(port_id, addr); if (index == 0) { - RTE_PMD_DEBUG_TRACE("port %d: Cannot remove default MAC address\n", port_id); + RTE_ETHDEV_LOG(ERR, + "Port %u: Cannot remove default MAC address\n", + port_id); return -EADDRINUSE; } else if (index < 0) return 0; /* Do nothing if address wasn't found */ @@ -3181,7 +3081,7 @@ rte_eth_dev_uc_hash_table_set(uint16_t port_id, struct ether_addr *addr, dev = &rte_eth_devices[port_id]; if (is_zero_ether_addr(addr)) { - RTE_PMD_DEBUG_TRACE("port %d: Cannot add NULL MAC address\n", + RTE_ETHDEV_LOG(ERR, "Port %u: Cannot add NULL MAC address\n", port_id); return -EINVAL; } @@ -3193,15 +3093,16 @@ rte_eth_dev_uc_hash_table_set(uint16_t port_id, struct ether_addr *addr, if (index < 0) { if (!on) { - RTE_PMD_DEBUG_TRACE("port %d: the MAC address was not " - "set in UTA\n", port_id); + RTE_ETHDEV_LOG(ERR, + "Port %u: the MAC address was not set in UTA\n", + port_id); return -EINVAL; } index = get_hash_mac_addr_index(port_id, &null_mac_addr); if (index < 0) { - RTE_PMD_DEBUG_TRACE("port %d: MAC address array full\n", - port_id); + RTE_ETHDEV_LOG(ERR, "Port %u: MAC address array full\n", + port_id); return -ENOSPC; } } @@ -3249,14 +3150,15 @@ int rte_eth_set_queue_rate_limit(uint16_t port_id, uint16_t queue_idx, link = dev->data->dev_link; if (queue_idx > dev_info.max_tx_queues) { - RTE_PMD_DEBUG_TRACE("set queue rate limit:port %d: " - "invalid queue id=%d\n", port_id, queue_idx); + RTE_ETHDEV_LOG(ERR, + "Set queue rate limit:port %u: invalid queue id=%u\n", + port_id, queue_idx); return -EINVAL; } if (tx_rate > link.link_speed) { - RTE_PMD_DEBUG_TRACE("set queue rate limit:invalid tx_rate=%d, " - 
"bigger than link speed= %d\n", + RTE_ETHDEV_LOG(ERR, + "Set queue rate limit:invalid tx_rate=%u, bigger than link speed= %d\n", tx_rate, link.link_speed); return -EINVAL; } @@ -3275,26 +3177,28 @@ rte_eth_mirror_rule_set(uint16_t port_id, RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); if (mirror_conf->rule_type == 0) { - RTE_PMD_DEBUG_TRACE("mirror rule type can not be 0.\n"); + RTE_ETHDEV_LOG(ERR, "Mirror rule type can not be 0\n"); return -EINVAL; } if (mirror_conf->dst_pool >= ETH_64_POOLS) { - RTE_PMD_DEBUG_TRACE("Invalid dst pool, pool id must be 0-%d\n", - ETH_64_POOLS - 1); + RTE_ETHDEV_LOG(ERR, "Invalid dst pool, pool id must be 0-%d\n", + ETH_64_POOLS - 1); return -EINVAL; } if ((mirror_conf->rule_type & (ETH_MIRROR_VIRTUAL_POOL_UP | ETH_MIRROR_VIRTUAL_POOL_DOWN)) && (mirror_conf->pool_mask == 0)) { - RTE_PMD_DEBUG_TRACE("Invalid mirror pool, pool mask can not be 0.\n"); + RTE_ETHDEV_LOG(ERR, + "Invalid mirror pool, pool mask can not be 0\n"); return -EINVAL; } if ((mirror_conf->rule_type & ETH_MIRROR_VLAN) && mirror_conf->vlan.vlan_mask == 0) { - RTE_PMD_DEBUG_TRACE("Invalid vlan mask, vlan mask can not be 0.\n"); + RTE_ETHDEV_LOG(ERR, + "Invalid vlan mask, vlan mask can not be 0\n"); return -EINVAL; } @@ -3341,7 +3245,7 @@ rte_eth_dev_callback_register(uint16_t port_id, return -EINVAL; if (!rte_eth_dev_is_valid_port(port_id) && port_id != RTE_ETH_ALL) { - ethdev_log(ERR, "Invalid port_id=%d", port_id); + RTE_ETHDEV_LOG(ERR, "Invalid port_id=%d\n", port_id); return -EINVAL; } @@ -3404,7 +3308,7 @@ rte_eth_dev_callback_unregister(uint16_t port_id, return -EINVAL; if (!rte_eth_dev_is_valid_port(port_id) && port_id != RTE_ETH_ALL) { - ethdev_log(ERR, "Invalid port_id=%d", port_id); + RTE_ETHDEV_LOG(ERR, "Invalid port_id=%d\n", port_id); return -EINVAL; } @@ -3498,13 +3402,13 @@ rte_eth_dev_rx_intr_ctl(uint16_t port_id, int epfd, int op, void *data) dev = &rte_eth_devices[port_id]; if (!dev->intr_handle) { - RTE_PMD_DEBUG_TRACE("RX Intr handle unset\n"); + RTE_ETHDEV_LOG(ERR, "RX Intr handle unset\n"); return -ENOTSUP; } intr_handle = dev->intr_handle; if (!intr_handle->intr_vec) { - RTE_PMD_DEBUG_TRACE("RX Intr vector unset\n"); + RTE_ETHDEV_LOG(ERR, "RX Intr vector unset\n"); return -EPERM; } @@ -3512,9 +3416,9 @@ rte_eth_dev_rx_intr_ctl(uint16_t port_id, int epfd, int op, void *data) vec = intr_handle->intr_vec[qid]; rc = rte_intr_rx_ctl(intr_handle, epfd, op, vec, data); if (rc && rc != -EEXIST) { - RTE_PMD_DEBUG_TRACE("p %u q %u rx ctl error" - " op %d epfd %d vec %u\n", - port_id, qid, op, epfd, vec); + RTE_ETHDEV_LOG(ERR, + "p %u q %u rx ctl error op %d epfd %d vec %u\n", + port_id, qid, op, epfd, vec); } } @@ -3649,27 +3553,27 @@ rte_eth_dev_rx_intr_ctl_q(uint16_t port_id, uint16_t queue_id, dev = &rte_eth_devices[port_id]; if (queue_id >= dev->data->nb_rx_queues) { - RTE_PMD_DEBUG_TRACE("Invalid RX queue_id=%u\n", queue_id); + RTE_ETHDEV_LOG(ERR, "Invalid RX queue_id=%u\n", queue_id); return -EINVAL; } if (!dev->intr_handle) { - RTE_PMD_DEBUG_TRACE("RX Intr handle unset\n"); + RTE_ETHDEV_LOG(ERR, "RX Intr handle unset\n"); return -ENOTSUP; } intr_handle = dev->intr_handle; if (!intr_handle->intr_vec) { - RTE_PMD_DEBUG_TRACE("RX Intr vector unset\n"); + RTE_ETHDEV_LOG(ERR, "RX Intr vector unset\n"); return -EPERM; } vec = intr_handle->intr_vec[queue_id]; rc = rte_intr_rx_ctl(intr_handle, epfd, op, vec, data); if (rc && rc != -EEXIST) { - RTE_PMD_DEBUG_TRACE("p %u q %u rx ctl error" - " op %d epfd %d vec %u\n", - port_id, queue_id, op, epfd, vec); + RTE_ETHDEV_LOG(ERR, + 
"p %u q %u rx ctl error op %d epfd %d vec %u\n", + port_id, queue_id, op, epfd, vec); return rc; } @@ -3936,7 +3840,7 @@ rte_eth_rx_queue_info_get(uint16_t port_id, uint16_t queue_id, dev = &rte_eth_devices[port_id]; if (queue_id >= dev->data->nb_rx_queues) { - RTE_PMD_DEBUG_TRACE("Invalid RX queue_id=%d\n", queue_id); + RTE_ETHDEV_LOG(ERR, "Invalid RX queue_id=%u\n", queue_id); return -EINVAL; } @@ -3952,7 +3856,6 @@ rte_eth_tx_queue_info_get(uint16_t port_id, uint16_t queue_id, struct rte_eth_txq_info *qinfo) { struct rte_eth_dev *dev; - struct rte_eth_txconf *txconf = &qinfo->conf; RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); @@ -3961,7 +3864,7 @@ rte_eth_tx_queue_info_get(uint16_t port_id, uint16_t queue_id, dev = &rte_eth_devices[port_id]; if (queue_id >= dev->data->nb_tx_queues) { - RTE_PMD_DEBUG_TRACE("Invalid TX queue_id=%d\n", queue_id); + RTE_ETHDEV_LOG(ERR, "Invalid TX queue_id=%u\n", queue_id); return -EINVAL; } @@ -3969,8 +3872,6 @@ rte_eth_tx_queue_info_get(uint16_t port_id, uint16_t queue_id, memset(qinfo, 0, sizeof(*qinfo)); dev->dev_ops->txq_info_get(dev, queue_id, qinfo); - /* convert offload to txq_flags to support legacy app */ - rte_eth_convert_tx_offload(txconf->offloads, &txconf->txq_flags); return 0; } @@ -4178,12 +4079,12 @@ rte_eth_dev_l2_tunnel_eth_type_conf(uint16_t port_id, RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); if (l2_tunnel == NULL) { - RTE_PMD_DEBUG_TRACE("Invalid l2_tunnel parameter\n"); + RTE_ETHDEV_LOG(ERR, "Invalid l2_tunnel parameter\n"); return -EINVAL; } if (l2_tunnel->l2_tunnel_type >= RTE_TUNNEL_TYPE_MAX) { - RTE_PMD_DEBUG_TRACE("Invalid tunnel type\n"); + RTE_ETHDEV_LOG(ERR, "Invalid tunnel type\n"); return -EINVAL; } @@ -4205,17 +4106,17 @@ rte_eth_dev_l2_tunnel_offload_set(uint16_t port_id, RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); if (l2_tunnel == NULL) { - RTE_PMD_DEBUG_TRACE("Invalid l2_tunnel parameter\n"); + RTE_ETHDEV_LOG(ERR, "Invalid l2_tunnel parameter\n"); return -EINVAL; } if (l2_tunnel->l2_tunnel_type >= RTE_TUNNEL_TYPE_MAX) { - RTE_PMD_DEBUG_TRACE("Invalid tunnel type.\n"); + RTE_ETHDEV_LOG(ERR, "Invalid tunnel type\n"); return -EINVAL; } if (mask == 0) { - RTE_PMD_DEBUG_TRACE("Mask should have a value.\n"); + RTE_ETHDEV_LOG(ERR, "Mask should have a value\n"); return -EINVAL; } @@ -4516,11 +4417,9 @@ parse_cleanup: return result; } -RTE_INIT(ethdev_init_log); -static void -ethdev_init_log(void) +RTE_INIT(ethdev_init_log) { - ethdev_logtype = rte_log_register("lib.ethdev"); - if (ethdev_logtype >= 0) - rte_log_set_level(ethdev_logtype, RTE_LOG_INFO); + rte_eth_dev_logtype = rte_log_register("lib.ethdev"); + if (rte_eth_dev_logtype >= 0) + rte_log_set_level(rte_eth_dev_logtype, RTE_LOG_INFO); } diff --git a/lib/librte_ethdev/rte_ethdev.h b/lib/librte_ethdev/rte_ethdev.h index 36e3984e..7070e9ab 100644 --- a/lib/librte_ethdev/rte_ethdev.h +++ b/lib/librte_ethdev/rte_ethdev.h @@ -159,6 +159,11 @@ extern "C" { #include "rte_eth_ctrl.h" #include "rte_dev_info.h" +extern int rte_eth_dev_logtype; + +#define RTE_ETHDEV_LOG(level, ...) \ + rte_log(RTE_LOG_ ## level, rte_eth_dev_logtype, "" __VA_ARGS__) + struct rte_mbuf; /** @@ -321,7 +326,7 @@ enum rte_eth_tx_mq_mode { struct rte_eth_rxmode { /** The multi-queue packet distribution mode to be used, e.g. RSS. */ enum rte_eth_rx_mq_mode mq_mode; - uint32_t max_rx_pkt_len; /**< Only used if jumbo_frame enabled. */ + uint32_t max_rx_pkt_len; /**< Only used if JUMBO_FRAME enabled. 
*/ uint16_t split_hdr_size; /**< hdr buf size (header_split enabled).*/ /** * Per-port Rx offloads to be set using DEV_RX_OFFLOAD_* flags. @@ -329,33 +334,6 @@ struct rte_eth_rxmode { * structure are allowed to be set. */ uint64_t offloads; - __extension__ - /** - * Below bitfield API is obsolete. Application should - * enable per-port offloads using the offload field - * above. - */ - uint16_t header_split : 1, /**< Header Split enable. */ - hw_ip_checksum : 1, /**< IP/UDP/TCP checksum offload enable. */ - hw_vlan_filter : 1, /**< VLAN filter enable. */ - hw_vlan_strip : 1, /**< VLAN strip enable. */ - hw_vlan_extend : 1, /**< Extended VLAN enable. */ - jumbo_frame : 1, /**< Jumbo Frame Receipt enable. */ - hw_strip_crc : 1, /**< Enable CRC stripping by hardware. */ - enable_scatter : 1, /**< Enable scatter packets rx handler */ - enable_lro : 1, /**< Enable LRO */ - hw_timestamp : 1, /**< Enable HW timestamp */ - security : 1, /**< Enable rte_security offloads */ - /** - * When set the offload bitfield should be ignored. - * Instead per-port Rx offloads should be set on offloads - * field above. - * Per-queue offloads shuold be set on rte_eth_rxq_conf - * structure. - * This bit is temporary till rxmode bitfield offloads API will - * be deprecated. - */ - ignore_offload_bitfield : 1; }; /** @@ -702,28 +680,6 @@ struct rte_eth_rxconf { uint64_t offloads; }; -#define ETH_TXQ_FLAGS_NOMULTSEGS 0x0001 /**< nb_segs=1 for all mbufs */ -#define ETH_TXQ_FLAGS_NOREFCOUNT 0x0002 /**< refcnt can be ignored */ -#define ETH_TXQ_FLAGS_NOMULTMEMP 0x0004 /**< all bufs come from same mempool */ -#define ETH_TXQ_FLAGS_NOVLANOFFL 0x0100 /**< disable VLAN offload */ -#define ETH_TXQ_FLAGS_NOXSUMSCTP 0x0200 /**< disable SCTP checksum offload */ -#define ETH_TXQ_FLAGS_NOXSUMUDP 0x0400 /**< disable UDP checksum offload */ -#define ETH_TXQ_FLAGS_NOXSUMTCP 0x0800 /**< disable TCP checksum offload */ -#define ETH_TXQ_FLAGS_NOOFFLOADS \ - (ETH_TXQ_FLAGS_NOVLANOFFL | ETH_TXQ_FLAGS_NOXSUMSCTP | \ - ETH_TXQ_FLAGS_NOXSUMUDP | ETH_TXQ_FLAGS_NOXSUMTCP) -#define ETH_TXQ_FLAGS_NOXSUMS \ - (ETH_TXQ_FLAGS_NOXSUMSCTP | ETH_TXQ_FLAGS_NOXSUMUDP | \ - ETH_TXQ_FLAGS_NOXSUMTCP) -/** - * When set the txq_flags should be ignored, - * instead per-queue Tx offloads will be set on offloads field - * located on rte_eth_txq_conf struct. - * This flag is temporary till the rte_eth_txq_conf.txq_flags - * API will be deprecated. - */ -#define ETH_TXQ_FLAGS_IGNORE 0x8000 - /** * A structure used to configure a TX ring of an Ethernet port. */ @@ -733,7 +689,6 @@ struct rte_eth_txconf { uint16_t tx_free_thresh; /**< Start freeing TX buffers if there are less free descriptors than this value. */ - uint32_t txq_flags; /**< Set flags for the Tx queue */ uint8_t tx_deferred_start; /**< Do not start queue with rte_eth_dev_start(). */ /** * Per-queue Tx offloads to be set using DEV_TX_OFFLOAD_* flags. @@ -939,6 +894,12 @@ struct rte_eth_conf { #define DEV_RX_OFFLOAD_SCATTER 0x00002000 #define DEV_RX_OFFLOAD_TIMESTAMP 0x00004000 #define DEV_RX_OFFLOAD_SECURITY 0x00008000 + +/** + * Invalid to set both DEV_RX_OFFLOAD_CRC_STRIP and DEV_RX_OFFLOAD_KEEP_CRC + * No DEV_RX_OFFLOAD_CRC_STRIP flag means keep CRC + */ +#define DEV_RX_OFFLOAD_KEEP_CRC 0x00010000 #define DEV_RX_OFFLOAD_CHECKSUM (DEV_RX_OFFLOAD_IPV4_CKSUM | \ DEV_RX_OFFLOAD_UDP_CKSUM | \ DEV_RX_OFFLOAD_TCP_CKSUM) @@ -1003,8 +964,6 @@ struct rte_eth_conf { * mentioned in rte_tx_offload_names in rte_ethdev.c file. */ -struct rte_pci_device; - /* * Fallback default preferred Rx/Tx port parameters. 
* These are used if an application requests default parameters @@ -1194,14 +1153,14 @@ struct rte_eth_dcb_info { /* Macros to check for valid port */ #define RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, retval) do { \ if (!rte_eth_dev_is_valid_port(port_id)) { \ - RTE_PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id); \ + RTE_ETHDEV_LOG(ERR, "Invalid port_id=%u\n", port_id); \ return retval; \ } \ } while (0) #define RTE_ETH_VALID_PORTID_OR_RET(port_id) do { \ if (!rte_eth_dev_is_valid_port(port_id)) { \ - RTE_PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id); \ + RTE_ETHDEV_LOG(ERR, "Invalid port_id=%u\n", port_id); \ return; \ } \ } while (0) @@ -1472,6 +1431,7 @@ uint16_t __rte_experimental rte_eth_dev_count_total(void); * @return * 0 on success and port_id is filled, negative on error */ +__rte_deprecated int rte_eth_dev_attach(const char *devargs, uint16_t *port_id); /** @@ -1487,6 +1447,7 @@ int rte_eth_dev_attach(const char *devargs, uint16_t *port_id); * @return * 0 on success and devname is filled, negative on error */ +__rte_deprecated int rte_eth_dev_detach(uint16_t port_id, char *devname); /** @@ -1554,9 +1515,11 @@ const char * __rte_experimental rte_eth_dev_tx_offload_name(uint64_t offload); * the [rt]x_offload_capa returned from rte_eth_dev_infos_get(). * Any type of device supported offloading set in the input argument * eth_conf->[rt]xmode.offloads to rte_eth_dev_configure() is enabled - * on all queues and it can't be disabled in rte_eth_[rt]x_queue_setup(). - * - the Receive Side Scaling (RSS) configuration when using multiple RX - * queues per port. + * on all queues and it can't be disabled in rte_eth_[rt]x_queue_setup() + * - the Receive Side Scaling (RSS) configuration when using multiple RX + * queues per port. Any RSS hash function set in eth_conf->rss_conf.rss_hf + * must be within the flow_type_rss_offloads provided by drivers via + * rte_eth_dev_infos_get() API. * * Embedding all configuration information in a single data structure * is the more flexible method that allows the addition of new features @@ -1669,12 +1632,6 @@ int rte_eth_rx_queue_setup(uint16_t port_id, uint16_t rx_queue_id, * The *tx_rs_thresh* value should be less or equal then * *tx_free_thresh* value, and both of them should be less then * *nb_tx_desc* - 3. - * - The *txq_flags* member contains flags to pass to the TX queue setup - * function to configure the behavior of the TX queue. This should be set - * to 0 if no special configuration is required. - * This API is obsolete and will be deprecated. Applications - * should set it to ETH_TXQ_FLAGS_IGNORE and use - * the offloads field below. * - The *offloads* member contains Tx offloads to be enabled. * If an offloading set in tx_conf->offloads * hasn't been set in the input argument eth_conf->txmode.offloads @@ -2003,6 +1960,15 @@ int rte_eth_stats_reset(uint16_t port_id); /** * Retrieve names of extended statistics of an Ethernet device. * + * There is an assumption that 'xstat_names' and 'xstats' arrays are matched + * by array index: + * xstats_names[i].name => xstats[i].value + * + * And the array index is same with id field of 'struct rte_eth_xstat': + * xstats[i].id == i + * + * This assumption makes key-value pair matching less flexible but simpler. + * * @param port_id * The port identifier of the Ethernet device. * @param xstats_names @@ -2027,13 +1993,20 @@ int rte_eth_xstats_get_names(uint16_t port_id, /** * Retrieve extended statistics of an Ethernet device. 
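The assumption documented above (xstats[i].id == i, names matched to values by array index) permits the simple retrieval pattern below; a sketch with error handling reduced to early exits:

        #include <inttypes.h>
        #include <stdio.h>
        #include <stdlib.h>
        #include <rte_ethdev.h>

        static void
        dump_xstats(uint16_t port_id)
        {
                struct rte_eth_xstat_name *names;
                struct rte_eth_xstat *xstats;
                int i, n;

                n = rte_eth_xstats_get_names(port_id, NULL, 0);
                if (n <= 0)
                        return;
                names = calloc(n, sizeof(*names));
                xstats = calloc(n, sizeof(*xstats));
                if (names != NULL && xstats != NULL &&
                    rte_eth_xstats_get_names(port_id, names, n) == n &&
                    rte_eth_xstats_get(port_id, xstats, n) == n) {
                        for (i = 0; i < n; i++) /* xstats[i].id == i */
                                printf("%s: %" PRIu64 "\n",
                                       names[xstats[i].id].name,
                                       xstats[i].value);
                }
                free(names);
                free(xstats);
        }
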
* + * There is an assumption that 'xstat_names' and 'xstats' arrays are matched + * by array index: + * xstats_names[i].name => xstats[i].value + * + * And the array index is same with id field of 'struct rte_eth_xstat': + * xstats[i].id == i + * + * This assumption makes key-value pair matching less flexible but simpler. + * * @param port_id * The port identifier of the Ethernet device. * @param xstats * A pointer to a table of structure of type *rte_eth_xstat* - * to be filled with device statistics ids and values: id is the - * index of the name string in xstats_names (see rte_eth_xstats_get_names()), - * and value is the statistic counter. + * to be filled with device statistics ids and values. * This parameter can be set to NULL if n is 0. * @param n * The size of the xstats array (number of elements). @@ -2144,7 +2117,7 @@ void rte_eth_xstats_reset(uint16_t port_id); * @param stat_idx * The per-queue packet statistics functionality number that the transmit * queue is to be assigned. - * The value must be in the range [0, RTE_MAX_ETHPORT_QUEUE_STATS_MAPS - 1]. + * The value must be in the range [0, RTE_ETHDEV_QUEUE_STAT_CNTRS - 1]. * @return * Zero if successful. Non-zero otherwise. */ @@ -2164,7 +2137,7 @@ int rte_eth_dev_set_tx_queue_stats_mapping(uint16_t port_id, * @param stat_idx * The per-queue packet statistics functionality number that the receive * queue is to be assigned. - * The value must be in the range [0, RTE_MAX_ETHPORT_QUEUE_STATS_MAPS - 1]. + * The value must be in the range [0, RTE_ETHDEV_QUEUE_STAT_CNTRS - 1]. * @return * Zero if successful. Non-zero otherwise. */ @@ -3656,11 +3629,11 @@ rte_eth_dev_l2_tunnel_offload_set(uint16_t port_id, uint8_t en); /** -* Get the port id from pci address or device name -* Example: -* - PCIe, 0000:2:00.0 -* - SoC, fsl-gmac0 -* - vdev, net_pcap0 +* Get the port id from device name. The device name should be specified +* as below: +* - PCIe address (Domain:Bus:Device.Function), for example- 0000:2:00.0 +* - SoC device name, for example- fsl-gmac0 +* - vdev dpdk name, for example- net_[pcap0|null0|tap0] * * @param name * pci address or name of the device @@ -3674,11 +3647,10 @@ int rte_eth_dev_get_port_by_name(const char *name, uint16_t *port_id); /** -* Get the device name from port id -* Example: -* - PCIe Bus:Domain:Function, 0000:02:00.0 -* - SoC device name, fsl-gmac0 -* - vdev dpdk name, net_[pcap0|null0|tun0|tap0] +* Get the device name from port id. The device name is specified as below: +* - PCIe address (Domain:Bus:Device.Function), for example- 0000:02:00.0 +* - SoC device name, for example- fsl-gmac0 +* - vdev dpdk name, for example- net_[pcap0|null0|tun0|tap0] * * @param port_id * Port identifier of the device. @@ -3837,7 +3809,7 @@ rte_eth_rx_burst(uint16_t port_id, uint16_t queue_id, RTE_FUNC_PTR_OR_ERR_RET(*dev->rx_pkt_burst, 0); if (queue_id >= dev->data->nb_rx_queues) { - RTE_PMD_DEBUG_TRACE("Invalid RX queue_id=%d\n", queue_id); + RTE_ETHDEV_LOG(ERR, "Invalid RX queue_id=%u\n", queue_id); return 0; } #endif @@ -4070,7 +4042,7 @@ static inline int rte_eth_tx_descriptor_status(uint16_t port_id, * * If the PMD is DEV_TX_OFFLOAD_MT_LOCKFREE capable, multiple threads can * invoke this function concurrently on the same tx queue without SW lock. - * @see rte_eth_dev_info_get, struct rte_eth_txconf::txq_flags + * @see rte_eth_dev_info_get, struct rte_eth_txconf::offloads * * @see rte_eth_tx_prepare to perform some prior checks or adjustments * for offloads. 
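The corrected range above, together with the new bounds checks in set_queue_stats_mapping(), means a mapping beyond RTE_ETHDEV_QUEUE_STAT_CNTRS - 1 or beyond the configured queue count now fails with -EINVAL. A sketch mapping the first few Rx queues one-to-one onto stat counters:

        #include <rte_ethdev.h>

        static void
        map_rxq_stats(uint16_t port_id, uint16_t nb_rxq)
        {
                uint16_t q;

                for (q = 0; q < nb_rxq && q < RTE_ETHDEV_QUEUE_STAT_CNTRS; q++)
                        (void)rte_eth_dev_set_rx_queue_stats_mapping(port_id,
                                                                     q, q);
                /* per-queue counts then appear in rte_eth_stats.q_ipackets[] */
        }
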
@@ -4103,7 +4075,7 @@ rte_eth_tx_burst(uint16_t port_id, uint16_t queue_id, RTE_FUNC_PTR_OR_ERR_RET(*dev->tx_pkt_burst, 0); if (queue_id >= dev->data->nb_tx_queues) { - RTE_PMD_DEBUG_TRACE("Invalid TX queue_id=%d\n", queue_id); + RTE_ETHDEV_LOG(ERR, "Invalid TX queue_id=%u\n", queue_id); return 0; } #endif @@ -4189,7 +4161,7 @@ rte_eth_tx_prepare(uint16_t port_id, uint16_t queue_id, #ifdef RTE_LIBRTE_ETHDEV_DEBUG if (!rte_eth_dev_is_valid_port(port_id)) { - RTE_PMD_DEBUG_TRACE("Invalid TX port_id=%d\n", port_id); + RTE_ETHDEV_LOG(ERR, "Invalid TX port_id=%u\n", port_id); rte_errno = -EINVAL; return 0; } @@ -4199,7 +4171,7 @@ rte_eth_tx_prepare(uint16_t port_id, uint16_t queue_id, #ifdef RTE_LIBRTE_ETHDEV_DEBUG if (queue_id >= dev->data->nb_tx_queues) { - RTE_PMD_DEBUG_TRACE("Invalid TX queue_id=%d\n", queue_id); + RTE_ETHDEV_LOG(ERR, "Invalid TX queue_id=%u\n", queue_id); rte_errno = -EINVAL; return 0; } diff --git a/lib/librte_ethdev/rte_ethdev_driver.h b/lib/librte_ethdev/rte_ethdev_driver.h index c9c825e3..c6d9bc1a 100644 --- a/lib/librte_ethdev/rte_ethdev_driver.h +++ b/lib/librte_ethdev/rte_ethdev_driver.h @@ -38,7 +38,6 @@ struct rte_eth_dev *rte_eth_dev_allocated(const char *name); * to that slot for the driver to use. * * @param name Unique identifier name for each Ethernet device - * @param type Device type of this Ethernet device * @return * - Slot in the rte_dev_devices array for a new device; */ @@ -325,6 +324,32 @@ typedef int (*ethdev_uninit_t)(struct rte_eth_dev *ethdev); int __rte_experimental rte_eth_dev_destroy(struct rte_eth_dev *ethdev, ethdev_uninit_t ethdev_uninit); +/** + * PMD helper function to check if keeping CRC is requested + * + * @note + * When CRC_STRIP offload flag is removed and default behavior switch to + * strip CRC, as planned, this helper function is not that useful and will be + * removed. 
In PMDs this function will be replaced with check: + * if (offloads & DEV_RX_OFFLOAD_KEEP_CRC) + * + * @param rx_offloads + * offload bits to be applied + * + * @return + * Return positive if keeping CRC is requested, + * zero if stripping CRC is requested + */ +static inline int +rte_eth_dev_must_keep_crc(uint64_t rx_offloads) +{ + if (rx_offloads & DEV_RX_OFFLOAD_CRC_STRIP) + return 0; + + /* no KEEP_CRC or CRC_STRIP offload flags means keep CRC */ + return 1; +} + #ifdef __cplusplus } #endif diff --git a/lib/librte_ethdev/rte_ethdev_pci.h b/lib/librte_ethdev/rte_ethdev_pci.h index 2cfd3727..f652596f 100644 --- a/lib/librte_ethdev/rte_ethdev_pci.h +++ b/lib/librte_ethdev/rte_ethdev_pci.h @@ -53,8 +53,8 @@ rte_eth_copy_pci_info(struct rte_eth_dev *eth_dev, struct rte_pci_device *pci_dev) { if ((eth_dev == NULL) || (pci_dev == NULL)) { - RTE_PMD_DEBUG_TRACE("NULL pointer eth_dev=%p pci_dev=%p\n", - eth_dev, pci_dev); + RTE_ETHDEV_LOG(ERR, "NULL pointer eth_dev=%p pci_dev=%p", + (void *)eth_dev, (void *)pci_dev); return; } diff --git a/lib/librte_ethdev/rte_ethdev_version.map b/lib/librte_ethdev/rte_ethdev_version.map index 40cf42b8..38f117f0 100644 --- a/lib/librte_ethdev/rte_ethdev_version.map +++ b/lib/librte_ethdev/rte_ethdev_version.map @@ -213,6 +213,13 @@ DPDK_18.05 { } DPDK_18.02; +DPDK_18.08 { + global: + + rte_eth_dev_logtype; + +} DPDK_18.05; + EXPERIMENTAL { global: @@ -232,6 +239,7 @@ EXPERIMENTAL { rte_eth_dev_tx_offload_name; rte_eth_switch_domain_alloc; rte_eth_switch_domain_free; + rte_flow_expand_rss; rte_mtr_capabilities_get; rte_mtr_create; rte_mtr_destroy; diff --git a/lib/librte_ethdev/rte_flow.c b/lib/librte_ethdev/rte_flow.c index b2afba08..cff4b520 100644 --- a/lib/librte_ethdev/rte_flow.c +++ b/lib/librte_ethdev/rte_flow.c @@ -84,7 +84,7 @@ static const struct rte_flow_desc_data rte_flow_desc_action[] = { MK_FLOW_ACTION(FLAG, 0), MK_FLOW_ACTION(QUEUE, sizeof(struct rte_flow_action_queue)), MK_FLOW_ACTION(DROP, 0), - MK_FLOW_ACTION(COUNT, 0), + MK_FLOW_ACTION(COUNT, sizeof(struct rte_flow_action_count)), MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)), MK_FLOW_ACTION(PF, 0), MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)), @@ -526,3 +526,110 @@ store: } return 0; } + +/** + * Expand RSS flows into several possible flows according to the RSS hash + * fields requested and the driver capabilities. 
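The rte_eth_dev_must_keep_crc() helper above is intended for PMDs. A sketch of driver-side use, where struct mydrv_hw and mydrv_hw_strip_crc() are hypothetical stand-ins for a driver's own types:

        #include <rte_ethdev_driver.h>

        static void
        mydrv_setup_crc(struct mydrv_hw *hw, uint64_t rx_offloads)
        {
                /* until CRC stripping becomes the default, derive the
                 * hardware setting through the helper rather than testing
                 * KEEP_CRC directly */
                if (rte_eth_dev_must_keep_crc(rx_offloads))
                        mydrv_hw_strip_crc(hw, 0);      /* keep CRC */
                else
                        mydrv_hw_strip_crc(hw, 1);      /* strip CRC */
        }
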
+ */ +int __rte_experimental +rte_flow_expand_rss(struct rte_flow_expand_rss *buf, size_t size, + const struct rte_flow_item *pattern, uint64_t types, + const struct rte_flow_expand_node graph[], + int graph_root_index) +{ + const int elt_n = 8; + const struct rte_flow_item *item; + const struct rte_flow_expand_node *node = &graph[graph_root_index]; + const int *next_node; + const int *stack[elt_n]; + int stack_pos = 0; + struct rte_flow_item flow_items[elt_n]; + unsigned int i; + size_t lsize; + size_t user_pattern_size = 0; + void *addr = NULL; + + lsize = offsetof(struct rte_flow_expand_rss, entry) + + elt_n * sizeof(buf->entry[0]); + if (lsize <= size) { + buf->entry[0].priority = 0; + buf->entry[0].pattern = (void *)&buf->entry[elt_n]; + buf->entries = 0; + addr = buf->entry[0].pattern; + } + for (item = pattern; item->type != RTE_FLOW_ITEM_TYPE_END; item++) { + const struct rte_flow_expand_node *next = NULL; + + for (i = 0; node->next && node->next[i]; ++i) { + next = &graph[node->next[i]]; + if (next->type == item->type) + break; + } + if (next) + node = next; + user_pattern_size += sizeof(*item); + } + user_pattern_size += sizeof(*item); /* Handle END item. */ + lsize += user_pattern_size; + /* Copy the user pattern in the first entry of the buffer. */ + if (lsize <= size) { + rte_memcpy(addr, pattern, user_pattern_size); + addr = (void *)(((uintptr_t)addr) + user_pattern_size); + buf->entries = 1; + } + /* Start expanding. */ + memset(flow_items, 0, sizeof(flow_items)); + user_pattern_size -= sizeof(*item); + next_node = node->next; + stack[stack_pos] = next_node; + node = next_node ? &graph[*next_node] : NULL; + while (node) { + flow_items[stack_pos].type = node->type; + if (node->rss_types & types) { + /* + * compute the number of items to copy from the + * expansion and copy it. + * When the stack_pos is 0, there are 1 element in it, + * plus the addition END item. + */ + int elt = stack_pos + 2; + + flow_items[stack_pos + 1].type = RTE_FLOW_ITEM_TYPE_END; + lsize += elt * sizeof(*item) + user_pattern_size; + if (lsize <= size) { + size_t n = elt * sizeof(*item); + + buf->entry[buf->entries].priority = + stack_pos + 1; + buf->entry[buf->entries].pattern = addr; + buf->entries++; + rte_memcpy(addr, buf->entry[0].pattern, + user_pattern_size); + addr = (void *)(((uintptr_t)addr) + + user_pattern_size); + rte_memcpy(addr, flow_items, n); + addr = (void *)(((uintptr_t)addr) + n); + } + } + /* Go deeper. */ + if (node->next) { + next_node = node->next; + if (stack_pos++ == elt_n) { + rte_errno = E2BIG; + return -rte_errno; + } + stack[stack_pos] = next_node; + } else if (*(next_node + 1)) { + /* Follow up with the next possibility. */ + ++next_node; + } else { + /* Move to the next path. */ + if (stack_pos) + next_node = stack[--stack_pos]; + next_node++; + stack[stack_pos] = next_node; + } + node = *next_node ? &graph[*next_node] : NULL; + }; + return lsize; +} diff --git a/lib/librte_ethdev/rte_flow_driver.h b/lib/librte_ethdev/rte_flow_driver.h index 1c90c600..688f7230 100644 --- a/lib/librte_ethdev/rte_flow_driver.h +++ b/lib/librte_ethdev/rte_flow_driver.h @@ -114,6 +114,69 @@ struct rte_flow_ops { const struct rte_flow_ops * rte_flow_ops_get(uint16_t port_id, struct rte_flow_error *error); +/** Helper macro to build input graph for rte_flow_expand_rss(). */ +#define RTE_FLOW_EXPAND_RSS_NEXT(...) \ + (const int []){ \ + __VA_ARGS__, 0, \ + } + +/** Node object of input graph for rte_flow_expand_rss(). 
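A sketch of how a PMD could build an expansion graph with RTE_FLOW_EXPAND_RSS_NEXT() (using the node type defined just below) and drive the size-then-fill calling pattern. The node indices and RSS types are illustrative; index 0 terminates a next-list, so the root sits at index 0:

        #include <stdlib.h>
        #include <rte_flow_driver.h>

        enum { NODE_ETH, NODE_IPV4, NODE_UDP };

        static const struct rte_flow_expand_node mini_graph[] = {
                [NODE_ETH] = {
                        .next = RTE_FLOW_EXPAND_RSS_NEXT(NODE_IPV4),
                        .type = RTE_FLOW_ITEM_TYPE_ETH,
                },
                [NODE_IPV4] = {
                        .next = RTE_FLOW_EXPAND_RSS_NEXT(NODE_UDP),
                        .type = RTE_FLOW_ITEM_TYPE_IPV4,
                        .rss_types = ETH_RSS_IPV4,
                },
                [NODE_UDP] = {
                        .type = RTE_FLOW_ITEM_TYPE_UDP,
                        .rss_types = ETH_RSS_NONFRAG_IPV4_UDP,
                },
        };

        static struct rte_flow_expand_rss *
        expand(const struct rte_flow_item *pattern, uint64_t rss_types)
        {
                struct rte_flow_expand_rss *buf;
                int ret;

                /* first call sizes the result, second call fills it */
                ret = rte_flow_expand_rss(NULL, 0, pattern, rss_types,
                                          mini_graph, NODE_ETH);
                if (ret < 0)
                        return NULL;
                buf = malloc(ret);
                if (buf != NULL &&
                    rte_flow_expand_rss(buf, ret, pattern, rss_types,
                                        mini_graph, NODE_ETH) < 0) {
                        free(buf);
                        buf = NULL;
                }
                return buf;     /* buf->entries patterns in buf->entry[] */
        }
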
*/ +struct rte_flow_expand_node { + const int *const next; + /**< + * List of next node indexes. Index 0 is interpreted as a terminator. + */ + const enum rte_flow_item_type type; + /**< Pattern item type of current node. */ + uint64_t rss_types; + /**< + * RSS types bit-field associated with this node + * (see ETH_RSS_* definitions). + */ +}; + +/** Object returned by rte_flow_expand_rss(). */ +struct rte_flow_expand_rss { + uint32_t entries; + /**< Number of entries @p patterns and @p priorities. */ + struct { + struct rte_flow_item *pattern; /**< Expanded pattern array. */ + uint32_t priority; /**< Priority offset for each expansion. */ + } entry[]; +}; + +/** + * Expand RSS flows into several possible flows according to the RSS hash + * fields requested and the driver capabilities. + * + * @b EXPERIMENTAL: this API may change without prior notice + * + * @param[out] buf + * Buffer to store the result expansion. + * @param[in] size + * Buffer size in bytes. If 0, @p buf can be NULL. + * @param[in] pattern + * User flow pattern. + * @param[in] types + * RSS types to expand (see ETH_RSS_* definitions). + * @param[in] graph + * Input graph to expand @p pattern according to @p types. + * @param[in] graph_root_index + * Index of root node in @p graph, typically 0. + * + * @return + * A positive value representing the size of @p buf in bytes regardless of + * @p size on success, a negative errno value otherwise and rte_errno is + * set, the following errors are defined: + * + * -E2BIG: graph-depth @p graph is too deep. + */ +int __rte_experimental +rte_flow_expand_rss(struct rte_flow_expand_rss *buf, size_t size, + const struct rte_flow_item *pattern, uint64_t types, + const struct rte_flow_expand_node graph[], + int graph_root_index); + #ifdef __cplusplus } #endif diff --git a/lib/librte_ethdev/rte_tm.h b/lib/librte_ethdev/rte_tm.h index 72554038..955f02ff 100644 --- a/lib/librte_ethdev/rte_tm.h +++ b/lib/librte_ethdev/rte_tm.h @@ -1569,6 +1569,10 @@ rte_tm_hierarchy_commit(uint16_t port_id, /** * Traffic manager node parent update * + * This function may be used to move a node and its children to a different + * parent. Additionally, if the new parent is the same as the current parent, + * this function will update the priority/weight of an existing node. + * * Restriction for root node: its parent cannot be changed. 
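Given the clarified semantics above, a sketch of rte_tm_node_parent_update() used both to re-parent a node and to adjust scheduling parameters in place; the node and parent ids are hypothetical, and the call is only valid after rte_tm_hierarchy_commit():

        #include <rte_tm.h>

        static int
        reshape(uint16_t port_id)
        {
                struct rte_tm_error error;
                int ret;

                /* move node 5 (and its children) under node 2 */
                ret = rte_tm_node_parent_update(port_id, 5, 2, 0, 1, &error);
                if (ret != 0)
                        return ret;
                /* same parent: only priority/weight are updated */
                return rte_tm_node_parent_update(port_id, 5, 2, 1, 10, &error);
        }
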
* * This function can only be called after the rte_tm_hierarchy_commit() diff --git a/lib/librte_eventdev/Makefile b/lib/librte_eventdev/Makefile index b3e25464..47f599a6 100644 --- a/lib/librte_eventdev/Makefile +++ b/lib/librte_eventdev/Makefile @@ -8,14 +8,19 @@ include $(RTE_SDK)/mk/rte.vars.mk LIB = librte_eventdev.a # library version -LIBABIVER := 4 +LIBABIVER := 5 # build flags CFLAGS += -DALLOW_EXPERIMENTAL_API CFLAGS += -O3 CFLAGS += $(WERROR_FLAGS) +ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP),y) +CFLAGS += -DLINUX +else +CFLAGS += -DBSD +endif LDLIBS += -lrte_eal -lrte_ring -lrte_ethdev -lrte_hash -lrte_mempool -lrte_timer -LDLIBS += -lrte_mbuf -lrte_cryptodev +LDLIBS += -lrte_mbuf -lrte_cryptodev -lpthread # library source files SRCS-y += rte_eventdev.c diff --git a/lib/librte_eventdev/meson.build b/lib/librte_eventdev/meson.build index bd138bd5..3cbaf298 100644 --- a/lib/librte_eventdev/meson.build +++ b/lib/librte_eventdev/meson.build @@ -1,8 +1,15 @@ # SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2017 Intel Corporation -version = 4 +version = 5 allow_experimental_apis = true + +if host_machine.system() == 'linux' + cflags += '-DLINUX' +else + cflags += '-DBSD' +endif + sources = files('rte_eventdev.c', 'rte_event_ring.c', 'rte_event_eth_rx_adapter.c', diff --git a/lib/librte_eventdev/rte_event_crypto_adapter.c b/lib/librte_eventdev/rte_event_crypto_adapter.c index ba63a87b..11b28ca9 100644 --- a/lib/librte_eventdev/rte_event_crypto_adapter.c +++ b/lib/librte_eventdev/rte_event_crypto_adapter.c @@ -342,7 +342,7 @@ eca_enq_to_cryptodev(struct rte_event_crypto_adapter *adapter, if (crypto_op == NULL) continue; if (crypto_op->sess_type == RTE_CRYPTO_OP_WITH_SESSION) { - m_data = rte_cryptodev_sym_session_get_private_data( + m_data = rte_cryptodev_sym_session_get_user_data( crypto_op->sym->session); if (m_data == NULL) { rte_pktmbuf_free(crypto_op->sym->m_src); @@ -512,7 +512,7 @@ eca_ops_enqueue_burst(struct rte_event_crypto_adapter *adapter, for (i = 0; i < num; i++) { struct rte_event *ev = &events[nb_ev++]; if (ops[i]->sess_type == RTE_CRYPTO_OP_WITH_SESSION) { - m_data = rte_cryptodev_sym_session_get_private_data( + m_data = rte_cryptodev_sym_session_get_user_data( ops[i]->sym->session); } else if (ops[i]->sess_type == RTE_CRYPTO_OP_SESSIONLESS && ops[i]->private_data_offset) { diff --git a/lib/librte_eventdev/rte_event_eth_rx_adapter.c b/lib/librte_eventdev/rte_event_eth_rx_adapter.c index 6f705095..f5e5a0b5 100644 --- a/lib/librte_eventdev/rte_event_eth_rx_adapter.c +++ b/lib/librte_eventdev/rte_event_eth_rx_adapter.c @@ -2,6 +2,11 @@ * Copyright(c) 2017 Intel Corporation. * All rights reserved. 
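The crypto adapter hunks above track the cryptodev rename from *_session_get_private_data() to *_session_get_user_data(); on the producer side the metadata is attached with the matching setter. A sketch, assuming a filled-in union rte_event_crypto_metadata:

        #include <rte_cryptodev.h>
        #include <rte_event_crypto_adapter.h>

        static int
        attach_meta(struct rte_cryptodev_sym_session *sess,
                    union rte_event_crypto_metadata *m_data)
        {
                /* renamed API; the adapter reads this back with
                 * rte_cryptodev_sym_session_get_user_data() */
                return rte_cryptodev_sym_session_set_user_data(sess, m_data,
                                                               sizeof(*m_data));
        }
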
*/ +#if defined(LINUX) +#include <sys/epoll.h> +#endif +#include <unistd.h> + #include <rte_cycles.h> #include <rte_common.h> #include <rte_dev.h> @@ -11,6 +16,7 @@ #include <rte_malloc.h> #include <rte_service_component.h> #include <rte_thash.h> +#include <rte_interrupts.h> #include "rte_eventdev.h" #include "rte_eventdev_pmd.h" @@ -24,6 +30,22 @@ #define ETH_RX_ADAPTER_MEM_NAME_LEN 32 #define RSS_KEY_SIZE 40 +/* value written to intr thread pipe to signal thread exit */ +#define ETH_BRIDGE_INTR_THREAD_EXIT 1 +/* Sentinel value to detect initialized file handle */ +#define INIT_FD -1 + +/* + * Used to store port and queue ID of interrupting Rx queue + */ +union queue_data { + RTE_STD_C11 + void *ptr; + struct { + uint16_t port; + uint16_t queue; + }; +}; /* * There is an instance of this struct per polled Rx queue added to the @@ -75,6 +97,30 @@ struct rte_event_eth_rx_adapter { uint16_t enq_block_count; /* Block start ts */ uint64_t rx_enq_block_start_ts; + /* epoll fd used to wait for Rx interrupts */ + int epd; + /* Num of interrupt driven interrupt queues */ + uint32_t num_rx_intr; + /* Used to send <dev id, queue id> of interrupting Rx queues from + * the interrupt thread to the Rx thread + */ + struct rte_ring *intr_ring; + /* Rx Queue data (dev id, queue id) for the last non-empty + * queue polled + */ + union queue_data qd; + /* queue_data is valid */ + int qd_valid; + /* Interrupt ring lock, synchronizes Rx thread + * and interrupt thread + */ + rte_spinlock_t intr_ring_lock; + /* event array passed to rte_poll_wait */ + struct rte_epoll_event *epoll_events; + /* Count of interrupt vectors in use */ + uint32_t num_intr_vec; + /* Thread blocked on Rx interrupts */ + pthread_t rx_intr_thread; /* Configuration callback for rte_service configuration */ rte_event_eth_rx_adapter_conf_cb conf_cb; /* Configuration callback argument */ @@ -91,12 +137,20 @@ struct rte_event_eth_rx_adapter { int socket_id; /* Per adapter EAL service */ uint32_t service_id; + /* Adapter started flag */ + uint8_t rxa_started; + /* Adapter ID */ + uint8_t id; } __rte_cache_aligned; /* Per eth device */ struct eth_device_info { struct rte_eth_dev *dev; struct eth_rx_queue_info *rx_queue; + /* Rx callback */ + rte_event_eth_rx_adapter_cb_fn cb_fn; + /* Rx callback argument */ + void *cb_arg; /* Set if ethdev->eventdev packet transfer uses a * hardware mechanism */ @@ -107,15 +161,42 @@ struct eth_device_info { * rx_adapter_stop callback needs to be invoked */ uint8_t dev_rx_started; - /* If nb_dev_queues > 0, the start callback will + /* Number of queues added for this device */ + uint16_t nb_dev_queues; + /* Number of poll based queues + * If nb_rx_poll > 0, the start callback will * be invoked if not already invoked */ - uint16_t nb_dev_queues; + uint16_t nb_rx_poll; + /* Number of interrupt based queues + * If nb_rx_intr > 0, the start callback will + * be invoked if not already invoked. 
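The union queue_data introduced above lets the interrupt thread hand a <port, queue> pair to the Rx service core as a single pointer-sized ring entry, avoiding any per-event allocation. A sketch of the enqueue side; the union is internal to the adapter and is restated here only for illustration (the anonymous struct needs C11, as in the original):

        #include <rte_ring.h>

        union queue_data {              /* mirrors the adapter-internal type */
                void *ptr;
                struct {
                        uint16_t port;
                        uint16_t queue;
                };
        };

        static void
        signal_rx_intr(struct rte_ring *intr_ring, uint16_t port,
                       uint16_t queue)
        {
                union queue_data qd;

                qd.ptr = NULL;          /* clear unused high bits */
                qd.port = port;
                qd.queue = queue;
                /* the value itself is the payload; nothing is dereferenced */
                (void)rte_ring_enqueue(intr_ring, qd.ptr);
        }
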
+ */ + uint16_t nb_rx_intr; + /* Number of queues that use the shared interrupt */ + uint16_t nb_shared_intr; + /* sum(wrr(q)) for all queues within the device + * useful when deleting all device queues + */ + uint32_t wrr_len; + /* Intr based queue index to start polling from, this is used + * if the number of shared interrupts is non-zero + */ + uint16_t next_q_idx; + /* Intr based queue indices */ + uint16_t *intr_queue; + /* device generates per Rx queue interrupt for queue index + * for queue indices < RTE_MAX_RXTX_INTR_VEC_ID - 1 + */ + int multi_intr_cap; + /* shared interrupt enabled */ + int shared_intr_enabled; }; /* Per Rx queue */ struct eth_rx_queue_info { int queue_enabled; /* True if added */ + int intr_enabled; uint16_t wt; /* Polling weight */ uint8_t event_queue_id; /* Event queue to enqueue packets to */ uint8_t sched_type; /* Sched type for events */ @@ -127,30 +208,30 @@ struct eth_rx_queue_info { static struct rte_event_eth_rx_adapter **event_eth_rx_adapter; static inline int -valid_id(uint8_t id) +rxa_validate_id(uint8_t id) { return id < RTE_EVENT_ETH_RX_ADAPTER_MAX_INSTANCE; } #define RTE_EVENT_ETH_RX_ADAPTER_ID_VALID_OR_ERR_RET(id, retval) do { \ - if (!valid_id(id)) { \ + if (!rxa_validate_id(id)) { \ RTE_EDEV_LOG_ERR("Invalid eth Rx adapter id = %d\n", id); \ return retval; \ } \ } while (0) static inline int -sw_rx_adapter_queue_count(struct rte_event_eth_rx_adapter *rx_adapter) +rxa_sw_adapter_queue_count(struct rte_event_eth_rx_adapter *rx_adapter) { - return rx_adapter->num_rx_polled; + return rx_adapter->num_rx_polled + rx_adapter->num_rx_intr; } /* Greatest common divisor */ -static uint16_t gcd_u16(uint16_t a, uint16_t b) +static uint16_t rxa_gcd_u16(uint16_t a, uint16_t b) { uint16_t r = a % b; - return r ? gcd_u16(b, r) : b; + return r ? 
rxa_gcd_u16(b, r) : b; } /* Returns the next queue in the polling sequence @@ -158,7 +239,7 @@ static uint16_t gcd_u16(uint16_t a, uint16_t b) * http://kb.linuxvirtualserver.org/wiki/Weighted_Round-Robin_Scheduling */ static int -wrr_next(struct rte_event_eth_rx_adapter *rx_adapter, +rxa_wrr_next(struct rte_event_eth_rx_adapter *rx_adapter, unsigned int n, int *cw, struct eth_rx_poll_entry *eth_rx_poll, uint16_t max_wt, uint16_t gcd, int prev) @@ -186,13 +267,298 @@ wrr_next(struct rte_event_eth_rx_adapter *rx_adapter, } } -/* Precalculate WRR polling sequence for all queues in rx_adapter */ +static inline int +rxa_shared_intr(struct eth_device_info *dev_info, + int rx_queue_id) +{ + int multi_intr_cap; + + if (dev_info->dev->intr_handle == NULL) + return 0; + + multi_intr_cap = rte_intr_cap_multiple(dev_info->dev->intr_handle); + return !multi_intr_cap || + rx_queue_id >= RTE_MAX_RXTX_INTR_VEC_ID - 1; +} + +static inline int +rxa_intr_queue(struct eth_device_info *dev_info, + int rx_queue_id) +{ + struct eth_rx_queue_info *queue_info; + + queue_info = &dev_info->rx_queue[rx_queue_id]; + return dev_info->rx_queue && + !dev_info->internal_event_port && + queue_info->queue_enabled && queue_info->wt == 0; +} + +static inline int +rxa_polled_queue(struct eth_device_info *dev_info, + int rx_queue_id) +{ + struct eth_rx_queue_info *queue_info; + + queue_info = &dev_info->rx_queue[rx_queue_id]; + return !dev_info->internal_event_port && + dev_info->rx_queue && + queue_info->queue_enabled && queue_info->wt != 0; +} + +/* Calculate change in number of vectors after Rx queue ID is add/deleted */ static int -eth_poll_wrr_calc(struct rte_event_eth_rx_adapter *rx_adapter) +rxa_nb_intr_vect(struct eth_device_info *dev_info, int rx_queue_id, int add) +{ + uint16_t i; + int n, s; + uint16_t nbq; + + nbq = dev_info->dev->data->nb_rx_queues; + n = 0; /* non shared count */ + s = 0; /* shared count */ + + if (rx_queue_id == -1) { + for (i = 0; i < nbq; i++) { + if (!rxa_shared_intr(dev_info, i)) + n += add ? !rxa_intr_queue(dev_info, i) : + rxa_intr_queue(dev_info, i); + else + s += add ? !rxa_intr_queue(dev_info, i) : + rxa_intr_queue(dev_info, i); + } + + if (s > 0) { + if ((add && dev_info->nb_shared_intr == 0) || + (!add && dev_info->nb_shared_intr)) + n += 1; + } + } else { + if (!rxa_shared_intr(dev_info, rx_queue_id)) + n = add ? !rxa_intr_queue(dev_info, rx_queue_id) : + rxa_intr_queue(dev_info, rx_queue_id); + else + n = add ? !dev_info->nb_shared_intr : + dev_info->nb_shared_intr == 1; + } + + return add ? 
n : -n; +} + +/* Calculate nb_rx_intr after deleting interrupt mode rx queues + */ +static void +rxa_calc_nb_post_intr_del(struct rte_event_eth_rx_adapter *rx_adapter, + struct eth_device_info *dev_info, + int rx_queue_id, + uint32_t *nb_rx_intr) +{ + uint32_t intr_diff; + + if (rx_queue_id == -1) + intr_diff = dev_info->nb_rx_intr; + else + intr_diff = rxa_intr_queue(dev_info, rx_queue_id); + + *nb_rx_intr = rx_adapter->num_rx_intr - intr_diff; +} + +/* Calculate nb_rx_* after adding interrupt mode rx queues, newly added + * interrupt queues could currently be poll mode Rx queues + */ +static void +rxa_calc_nb_post_add_intr(struct rte_event_eth_rx_adapter *rx_adapter, + struct eth_device_info *dev_info, + int rx_queue_id, + uint32_t *nb_rx_poll, + uint32_t *nb_rx_intr, + uint32_t *nb_wrr) +{ + uint32_t intr_diff; + uint32_t poll_diff; + uint32_t wrr_len_diff; + + if (rx_queue_id == -1) { + intr_diff = dev_info->dev->data->nb_rx_queues - + dev_info->nb_rx_intr; + poll_diff = dev_info->nb_rx_poll; + wrr_len_diff = dev_info->wrr_len; + } else { + intr_diff = !rxa_intr_queue(dev_info, rx_queue_id); + poll_diff = rxa_polled_queue(dev_info, rx_queue_id); + wrr_len_diff = poll_diff ? dev_info->rx_queue[rx_queue_id].wt : + 0; + } + + *nb_rx_intr = rx_adapter->num_rx_intr + intr_diff; + *nb_rx_poll = rx_adapter->num_rx_polled - poll_diff; + *nb_wrr = rx_adapter->wrr_len - wrr_len_diff; +} + +/* Calculate size of the eth_rx_poll and wrr_sched arrays + * after deleting poll mode rx queues + */ +static void +rxa_calc_nb_post_poll_del(struct rte_event_eth_rx_adapter *rx_adapter, + struct eth_device_info *dev_info, + int rx_queue_id, + uint32_t *nb_rx_poll, + uint32_t *nb_wrr) +{ + uint32_t poll_diff; + uint32_t wrr_len_diff; + + if (rx_queue_id == -1) { + poll_diff = dev_info->nb_rx_poll; + wrr_len_diff = dev_info->wrr_len; + } else { + poll_diff = rxa_polled_queue(dev_info, rx_queue_id); + wrr_len_diff = poll_diff ? dev_info->rx_queue[rx_queue_id].wt : + 0; + } + + *nb_rx_poll = rx_adapter->num_rx_polled - poll_diff; + *nb_wrr = rx_adapter->wrr_len - wrr_len_diff; +} + +/* Calculate nb_rx_* after adding poll mode rx queues + */ +static void +rxa_calc_nb_post_add_poll(struct rte_event_eth_rx_adapter *rx_adapter, + struct eth_device_info *dev_info, + int rx_queue_id, + uint16_t wt, + uint32_t *nb_rx_poll, + uint32_t *nb_rx_intr, + uint32_t *nb_wrr) +{ + uint32_t intr_diff; + uint32_t poll_diff; + uint32_t wrr_len_diff; + + if (rx_queue_id == -1) { + intr_diff = dev_info->nb_rx_intr; + poll_diff = dev_info->dev->data->nb_rx_queues - + dev_info->nb_rx_poll; + wrr_len_diff = wt*dev_info->dev->data->nb_rx_queues + - dev_info->wrr_len; + } else { + intr_diff = rxa_intr_queue(dev_info, rx_queue_id); + poll_diff = !rxa_polled_queue(dev_info, rx_queue_id); + wrr_len_diff = rxa_polled_queue(dev_info, rx_queue_id) ? 
+ wt - dev_info->rx_queue[rx_queue_id].wt : + wt; + } + + *nb_rx_poll = rx_adapter->num_rx_polled + poll_diff; + *nb_rx_intr = rx_adapter->num_rx_intr - intr_diff; + *nb_wrr = rx_adapter->wrr_len + wrr_len_diff; +} + +/* Calculate nb_rx_* after adding rx_queue_id */ +static void +rxa_calc_nb_post_add(struct rte_event_eth_rx_adapter *rx_adapter, + struct eth_device_info *dev_info, + int rx_queue_id, + uint16_t wt, + uint32_t *nb_rx_poll, + uint32_t *nb_rx_intr, + uint32_t *nb_wrr) +{ + if (wt != 0) + rxa_calc_nb_post_add_poll(rx_adapter, dev_info, rx_queue_id, + wt, nb_rx_poll, nb_rx_intr, nb_wrr); + else + rxa_calc_nb_post_add_intr(rx_adapter, dev_info, rx_queue_id, + nb_rx_poll, nb_rx_intr, nb_wrr); +} + +/* Calculate nb_rx_* after deleting rx_queue_id */ +static void +rxa_calc_nb_post_del(struct rte_event_eth_rx_adapter *rx_adapter, + struct eth_device_info *dev_info, + int rx_queue_id, + uint32_t *nb_rx_poll, + uint32_t *nb_rx_intr, + uint32_t *nb_wrr) +{ + rxa_calc_nb_post_poll_del(rx_adapter, dev_info, rx_queue_id, nb_rx_poll, + nb_wrr); + rxa_calc_nb_post_intr_del(rx_adapter, dev_info, rx_queue_id, + nb_rx_intr); +} + +/* + * Allocate the rx_poll array + */ +static struct eth_rx_poll_entry * +rxa_alloc_poll(struct rte_event_eth_rx_adapter *rx_adapter, + uint32_t num_rx_polled) +{ + size_t len; + + len = RTE_ALIGN(num_rx_polled * sizeof(*rx_adapter->eth_rx_poll), + RTE_CACHE_LINE_SIZE); + return rte_zmalloc_socket(rx_adapter->mem_name, + len, + RTE_CACHE_LINE_SIZE, + rx_adapter->socket_id); +} + +/* + * Allocate the WRR array + */ +static uint32_t * +rxa_alloc_wrr(struct rte_event_eth_rx_adapter *rx_adapter, int nb_wrr) +{ + size_t len; + + len = RTE_ALIGN(nb_wrr * sizeof(*rx_adapter->wrr_sched), + RTE_CACHE_LINE_SIZE); + return rte_zmalloc_socket(rx_adapter->mem_name, + len, + RTE_CACHE_LINE_SIZE, + rx_adapter->socket_id); +} + +static int +rxa_alloc_poll_arrays(struct rte_event_eth_rx_adapter *rx_adapter, + uint32_t nb_poll, + uint32_t nb_wrr, + struct eth_rx_poll_entry **rx_poll, + uint32_t **wrr_sched) +{ + + if (nb_poll == 0) { + *rx_poll = NULL; + *wrr_sched = NULL; + return 0; + } + + *rx_poll = rxa_alloc_poll(rx_adapter, nb_poll); + if (*rx_poll == NULL) { + *wrr_sched = NULL; + return -ENOMEM; + } + + *wrr_sched = rxa_alloc_wrr(rx_adapter, nb_wrr); + if (*wrr_sched == NULL) { + rte_free(*rx_poll); + return -ENOMEM; + } + return 0; +} + +/* Precalculate WRR polling sequence for all queues in rx_adapter */ +static void +rxa_calc_wrr_sequence(struct rte_event_eth_rx_adapter *rx_adapter, + struct eth_rx_poll_entry *rx_poll, + uint32_t *rx_wrr) { uint16_t d; uint16_t q; unsigned int i; + int prev = -1; + int cw = -1; /* Initialize variables for calculation of wrr schedule */ uint16_t max_wrr_pos = 0; @@ -200,79 +566,52 @@ eth_poll_wrr_calc(struct rte_event_eth_rx_adapter *rx_adapter) uint16_t max_wt = 0; uint16_t gcd = 0; - struct eth_rx_poll_entry *rx_poll = NULL; - uint32_t *rx_wrr = NULL; + if (rx_poll == NULL) + return; - if (rx_adapter->num_rx_polled) { - size_t len = RTE_ALIGN(rx_adapter->num_rx_polled * - sizeof(*rx_adapter->eth_rx_poll), - RTE_CACHE_LINE_SIZE); - rx_poll = rte_zmalloc_socket(rx_adapter->mem_name, - len, - RTE_CACHE_LINE_SIZE, - rx_adapter->socket_id); - if (rx_poll == NULL) - return -ENOMEM; + /* Generate array of all queues to poll, the size of this + * array is poll_q + */ + RTE_ETH_FOREACH_DEV(d) { + uint16_t nb_rx_queues; + struct eth_device_info *dev_info = + &rx_adapter->eth_devices[d]; + nb_rx_queues = dev_info->dev->data->nb_rx_queues; + if 
(dev_info->rx_queue == NULL) + continue; + if (dev_info->internal_event_port) + continue; + dev_info->wrr_len = 0; + for (q = 0; q < nb_rx_queues; q++) { + struct eth_rx_queue_info *queue_info = + &dev_info->rx_queue[q]; + uint16_t wt; - /* Generate array of all queues to poll, the size of this - * array is poll_q - */ - RTE_ETH_FOREACH_DEV(d) { - uint16_t nb_rx_queues; - struct eth_device_info *dev_info = - &rx_adapter->eth_devices[d]; - nb_rx_queues = dev_info->dev->data->nb_rx_queues; - if (dev_info->rx_queue == NULL) + if (!rxa_polled_queue(dev_info, q)) continue; - for (q = 0; q < nb_rx_queues; q++) { - struct eth_rx_queue_info *queue_info = - &dev_info->rx_queue[q]; - if (queue_info->queue_enabled == 0) - continue; - - uint16_t wt = queue_info->wt; - rx_poll[poll_q].eth_dev_id = d; - rx_poll[poll_q].eth_rx_qid = q; - max_wrr_pos += wt; - max_wt = RTE_MAX(max_wt, wt); - gcd = (gcd) ? gcd_u16(gcd, wt) : wt; - poll_q++; - } - } - - len = RTE_ALIGN(max_wrr_pos * sizeof(*rx_wrr), - RTE_CACHE_LINE_SIZE); - rx_wrr = rte_zmalloc_socket(rx_adapter->mem_name, - len, - RTE_CACHE_LINE_SIZE, - rx_adapter->socket_id); - if (rx_wrr == NULL) { - rte_free(rx_poll); - return -ENOMEM; - } - - /* Generate polling sequence based on weights */ - int prev = -1; - int cw = -1; - for (i = 0; i < max_wrr_pos; i++) { - rx_wrr[i] = wrr_next(rx_adapter, poll_q, &cw, - rx_poll, max_wt, gcd, prev); - prev = rx_wrr[i]; + wt = queue_info->wt; + rx_poll[poll_q].eth_dev_id = d; + rx_poll[poll_q].eth_rx_qid = q; + max_wrr_pos += wt; + dev_info->wrr_len += wt; + max_wt = RTE_MAX(max_wt, wt); + gcd = (gcd) ? rxa_gcd_u16(gcd, wt) : wt; + poll_q++; } } - rte_free(rx_adapter->eth_rx_poll); - rte_free(rx_adapter->wrr_sched); - - rx_adapter->eth_rx_poll = rx_poll; - rx_adapter->wrr_sched = rx_wrr; - rx_adapter->wrr_len = max_wrr_pos; - - return 0; + /* Generate polling sequence based on weights */ + prev = -1; + cw = -1; + for (i = 0; i < max_wrr_pos; i++) { + rx_wrr[i] = rxa_wrr_next(rx_adapter, poll_q, &cw, + rx_poll, max_wt, gcd, prev); + prev = rx_wrr[i]; + } } static inline void -mtoip(struct rte_mbuf *m, struct ipv4_hdr **ipv4_hdr, +rxa_mtoip(struct rte_mbuf *m, struct ipv4_hdr **ipv4_hdr, struct ipv6_hdr **ipv6_hdr) { struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); @@ -311,7 +650,7 @@ mtoip(struct rte_mbuf *m, struct ipv4_hdr **ipv4_hdr, /* Calculate RSS hash for IPv4/6 */ static inline uint32_t -do_softrss(struct rte_mbuf *m, const uint8_t *rss_key_be) +rxa_do_softrss(struct rte_mbuf *m, const uint8_t *rss_key_be) { uint32_t input_len; void *tuple; @@ -320,7 +659,7 @@ do_softrss(struct rte_mbuf *m, const uint8_t *rss_key_be) struct ipv4_hdr *ipv4_hdr; struct ipv6_hdr *ipv6_hdr; - mtoip(m, &ipv4_hdr, &ipv6_hdr); + rxa_mtoip(m, &ipv4_hdr, &ipv6_hdr); if (ipv4_hdr) { ipv4_tuple.src_addr = rte_be_to_cpu_32(ipv4_hdr->src_addr); @@ -339,13 +678,13 @@ do_softrss(struct rte_mbuf *m, const uint8_t *rss_key_be) } static inline int -rx_enq_blocked(struct rte_event_eth_rx_adapter *rx_adapter) +rxa_enq_blocked(struct rte_event_eth_rx_adapter *rx_adapter) { return !!rx_adapter->enq_block_count; } static inline void -rx_enq_block_start_ts(struct rte_event_eth_rx_adapter *rx_adapter) +rxa_enq_block_start_ts(struct rte_event_eth_rx_adapter *rx_adapter) { if (rx_adapter->rx_enq_block_start_ts) return; @@ -358,13 +697,13 @@ rx_enq_block_start_ts(struct rte_event_eth_rx_adapter *rx_adapter) } static inline void -rx_enq_block_end_ts(struct rte_event_eth_rx_adapter *rx_adapter, +rxa_enq_block_end_ts(struct 
rte_event_eth_rx_adapter *rx_adapter, struct rte_event_eth_rx_adapter_stats *stats) { if (unlikely(!stats->rx_enq_start_ts)) stats->rx_enq_start_ts = rte_get_tsc_cycles(); - if (likely(!rx_enq_blocked(rx_adapter))) + if (likely(!rxa_enq_blocked(rx_adapter))) return; rx_adapter->enq_block_count = 0; @@ -380,8 +719,8 @@ rx_enq_block_end_ts(struct rte_event_eth_rx_adapter *rx_adapter, * this function */ static inline void -buf_event_enqueue(struct rte_event_eth_rx_adapter *rx_adapter, - struct rte_event *ev) +rxa_buffer_event(struct rte_event_eth_rx_adapter *rx_adapter, + struct rte_event *ev) { struct rte_eth_event_enqueue_buffer *buf = &rx_adapter->event_enqueue_buffer; @@ -390,7 +729,7 @@ buf_event_enqueue(struct rte_event_eth_rx_adapter *rx_adapter, /* Enqueue buffered events to event device */ static inline uint16_t -flush_event_buffer(struct rte_event_eth_rx_adapter *rx_adapter) +rxa_flush_event_buffer(struct rte_event_eth_rx_adapter *rx_adapter) { struct rte_eth_event_enqueue_buffer *buf = &rx_adapter->event_enqueue_buffer; @@ -407,8 +746,8 @@ flush_event_buffer(struct rte_event_eth_rx_adapter *rx_adapter) stats->rx_enq_retry++; } - n ? rx_enq_block_end_ts(rx_adapter, stats) : - rx_enq_block_start_ts(rx_adapter); + n ? rxa_enq_block_end_ts(rx_adapter, stats) : + rxa_enq_block_start_ts(rx_adapter); buf->count -= n; stats->rx_enq_count += n; @@ -417,18 +756,19 @@ flush_event_buffer(struct rte_event_eth_rx_adapter *rx_adapter) } static inline void -fill_event_buffer(struct rte_event_eth_rx_adapter *rx_adapter, - uint8_t dev_id, - uint16_t rx_queue_id, - struct rte_mbuf **mbufs, - uint16_t num) +rxa_buffer_mbufs(struct rte_event_eth_rx_adapter *rx_adapter, + uint16_t eth_dev_id, + uint16_t rx_queue_id, + struct rte_mbuf **mbufs, + uint16_t num) { uint32_t i; - struct eth_device_info *eth_device_info = - &rx_adapter->eth_devices[dev_id]; + struct eth_device_info *dev_info = + &rx_adapter->eth_devices[eth_dev_id]; struct eth_rx_queue_info *eth_rx_queue_info = - ð_device_info->rx_queue[rx_queue_id]; - + &dev_info->rx_queue[rx_queue_id]; + struct rte_eth_event_enqueue_buffer *buf = + &rx_adapter->event_enqueue_buffer; int32_t qid = eth_rx_queue_info->event_queue_id; uint8_t sched_type = eth_rx_queue_info->sched_type; uint8_t priority = eth_rx_queue_info->priority; @@ -439,6 +779,8 @@ fill_event_buffer(struct rte_event_eth_rx_adapter *rx_adapter, uint32_t rss; int do_rss; uint64_t ts; + struct rte_mbuf *cb_mbufs[BATCH_SIZE]; + uint16_t nb_cb; /* 0xffff ffff if PKT_RX_RSS_HASH is set, otherwise 0 */ rss_mask = ~(((m->ol_flags & PKT_RX_RSS_HASH) != 0) - 1); @@ -454,12 +796,26 @@ fill_event_buffer(struct rte_event_eth_rx_adapter *rx_adapter, } } + + nb_cb = dev_info->cb_fn ? dev_info->cb_fn(eth_dev_id, rx_queue_id, + ETH_EVENT_BUFFER_SIZE, + buf->count, mbufs, + num, + dev_info->cb_arg, + cb_mbufs) : + num; + if (nb_cb < num) { + mbufs = cb_mbufs; + num = nb_cb; + } + for (i = 0; i < num; i++) { m = mbufs[i]; struct rte_event *ev = &events[i]; rss = do_rss ? 
- do_softrss(m, rx_adapter->rss_key_be) : m->hash.rss; + rxa_do_softrss(m, rx_adapter->rss_key_be) : + m->hash.rss; flow_id = eth_rx_queue_info->flow_id & eth_rx_queue_info->flow_id_mask; @@ -473,8 +829,275 @@ fill_event_buffer(struct rte_event_eth_rx_adapter *rx_adapter, ev->priority = priority; ev->mbuf = m; - buf_event_enqueue(rx_adapter, ev); + rxa_buffer_event(rx_adapter, ev); + } +} + +/* Enqueue packets from <port, q> to event buffer */ +static inline uint32_t +rxa_eth_rx(struct rte_event_eth_rx_adapter *rx_adapter, + uint16_t port_id, + uint16_t queue_id, + uint32_t rx_count, + uint32_t max_rx, + int *rxq_empty) +{ + struct rte_mbuf *mbufs[BATCH_SIZE]; + struct rte_eth_event_enqueue_buffer *buf = + &rx_adapter->event_enqueue_buffer; + struct rte_event_eth_rx_adapter_stats *stats = + &rx_adapter->stats; + uint16_t n; + uint32_t nb_rx = 0; + + if (rxq_empty) + *rxq_empty = 0; + /* Don't do a batch dequeue from the rx queue if there isn't + * enough space in the enqueue buffer. + */ + while (BATCH_SIZE <= (RTE_DIM(buf->events) - buf->count)) { + if (buf->count >= BATCH_SIZE) + rxa_flush_event_buffer(rx_adapter); + + stats->rx_poll_count++; + n = rte_eth_rx_burst(port_id, queue_id, mbufs, BATCH_SIZE); + if (unlikely(!n)) { + if (rxq_empty) + *rxq_empty = 1; + break; + } + rxa_buffer_mbufs(rx_adapter, port_id, queue_id, mbufs, n); + nb_rx += n; + if (rx_count + nb_rx > max_rx) + break; } + + if (buf->count >= BATCH_SIZE) + rxa_flush_event_buffer(rx_adapter); + + return nb_rx; +} + +static inline void +rxa_intr_ring_enqueue(struct rte_event_eth_rx_adapter *rx_adapter, + void *data) +{ + uint16_t port_id; + uint16_t queue; + int err; + union queue_data qd; + struct eth_device_info *dev_info; + struct eth_rx_queue_info *queue_info; + int *intr_enabled; + + qd.ptr = data; + port_id = qd.port; + queue = qd.queue; + + dev_info = &rx_adapter->eth_devices[port_id]; + queue_info = &dev_info->rx_queue[queue]; + rte_spinlock_lock(&rx_adapter->intr_ring_lock); + if (rxa_shared_intr(dev_info, queue)) + intr_enabled = &dev_info->shared_intr_enabled; + else + intr_enabled = &queue_info->intr_enabled; + + if (*intr_enabled) { + *intr_enabled = 0; + err = rte_ring_enqueue(rx_adapter->intr_ring, data); + /* Entry should always be available. 
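/*
 * Editorial sketch, not part of the patch: a minimal callback of the
 * rte_event_eth_rx_adapter_cb_fn type that rxa_buffer_mbufs() above
 * invokes before buffering events. This one keeps only as many mbufs
 * as still fit in the event enqueue buffer, hands the survivors back
 * through enq_buf, and frees the rest (mbufs that are neither returned
 * nor freed would leak). The drop policy is invented for illustration;
 * a real callback may filter on any criterion.
 */
#include <rte_common.h>
#include <rte_mbuf.h>

static uint16_t
rx_cb_sketch(uint16_t eth_dev_id, uint16_t queue_id,
	uint32_t enqueue_buf_size, uint32_t enqueue_buf_count,
	struct rte_mbuf **mbuf, uint16_t nb_mbuf,
	void *cb_arg, struct rte_mbuf **enq_buf)
{
	uint32_t room = enqueue_buf_size - enqueue_buf_count;
	uint16_t keep = nb_mbuf <= room ? nb_mbuf : (uint16_t)room;
	uint16_t i;

	RTE_SET_USED(eth_dev_id);
	RTE_SET_USED(queue_id);
	RTE_SET_USED(cb_arg);

	for (i = 0; i < keep; i++)
		enq_buf[i] = mbuf[i];		/* enqueue these */
	for (; i < nb_mbuf; i++)
		rte_pktmbuf_free(mbuf[i]);	/* drop the overflow */

	return keep;
}
/* Registered per ethdev via rte_event_eth_rx_adapter_cb_register(). */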
+ * The ring size equals the maximum number of interrupt + * vectors supported (an interrupt vector is shared in + * case of shared interrupts) + */ + if (err) + RTE_EDEV_LOG_ERR("Failed to enqueue interrupt" + " to ring: %s", strerror(err)); + else + rte_eth_dev_rx_intr_disable(port_id, queue); + } + rte_spinlock_unlock(&rx_adapter->intr_ring_lock); +} + +static int +rxa_intr_ring_check_avail(struct rte_event_eth_rx_adapter *rx_adapter, + uint32_t num_intr_vec) +{ + if (rx_adapter->num_intr_vec + num_intr_vec > + RTE_EVENT_ETH_INTR_RING_SIZE) { + RTE_EDEV_LOG_ERR("Exceeded intr ring slots current" + " %d needed %d limit %d", rx_adapter->num_intr_vec, + num_intr_vec, RTE_EVENT_ETH_INTR_RING_SIZE); + return -ENOSPC; + } + + return 0; +} + +/* Delete entries for (dev, queue) from the interrupt ring */ +static void +rxa_intr_ring_del_entries(struct rte_event_eth_rx_adapter *rx_adapter, + struct eth_device_info *dev_info, + uint16_t rx_queue_id) +{ + int i, n; + union queue_data qd; + + rte_spinlock_lock(&rx_adapter->intr_ring_lock); + + n = rte_ring_count(rx_adapter->intr_ring); + for (i = 0; i < n; i++) { + rte_ring_dequeue(rx_adapter->intr_ring, &qd.ptr); + if (!rxa_shared_intr(dev_info, rx_queue_id)) { + if (qd.port == dev_info->dev->data->port_id && + qd.queue == rx_queue_id) + continue; + } else { + if (qd.port == dev_info->dev->data->port_id) + continue; + } + rte_ring_enqueue(rx_adapter->intr_ring, qd.ptr); + } + + rte_spinlock_unlock(&rx_adapter->intr_ring_lock); +} + +/* pthread callback handling interrupt mode receive queues + * After receiving an Rx interrupt, it enqueues the port id and queue id of the + * interrupting queue to the adapter's ring buffer for interrupt events. + * These events are picked up by rxa_intr_ring_dequeue() which is invoked from + * the adapter service function. 
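/*
 * Editorial sketch, not part of the patch: the drain-and-requeue idiom
 * behind rxa_intr_ring_del_entries() above. rte_ring has no random
 * access removal, so every entry is dequeued once and only the entries
 * that do not match are put back, preserving their relative order. The
 * match() predicate is hypothetical, and the caller is assumed to hold
 * the lock that serializes ring users (intr_ring_lock in the patch).
 */
#include <rte_ring.h>

static void
ring_filter_sketch(struct rte_ring *r, int (*match)(void *obj))
{
	unsigned int i, n = rte_ring_count(r);
	void *obj;

	for (i = 0; i < n; i++) {
		if (rte_ring_dequeue(r, &obj) != 0)
			break;
		if (match(obj))
			continue;		/* drop matching entries */
		rte_ring_enqueue(r, obj);	/* keep everything else */
	}
}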
+ */ +static void * +rxa_intr_thread(void *arg) +{ + struct rte_event_eth_rx_adapter *rx_adapter = arg; + struct rte_epoll_event *epoll_events = rx_adapter->epoll_events; + int n, i; + + while (1) { + n = rte_epoll_wait(rx_adapter->epd, epoll_events, + RTE_EVENT_ETH_INTR_RING_SIZE, -1); + if (unlikely(n < 0)) + RTE_EDEV_LOG_ERR("rte_epoll_wait returned error %d", + n); + for (i = 0; i < n; i++) { + rxa_intr_ring_enqueue(rx_adapter, + epoll_events[i].epdata.data); + } + } + + return NULL; +} + +/* Dequeue <port, q> from interrupt ring and enqueue received + * mbufs to eventdev + */ +static inline uint32_t +rxa_intr_ring_dequeue(struct rte_event_eth_rx_adapter *rx_adapter) +{ + uint32_t n; + uint32_t nb_rx = 0; + int rxq_empty; + struct rte_eth_event_enqueue_buffer *buf; + rte_spinlock_t *ring_lock; + uint8_t max_done = 0; + + if (rx_adapter->num_rx_intr == 0) + return 0; + + if (rte_ring_count(rx_adapter->intr_ring) == 0 + && !rx_adapter->qd_valid) + return 0; + + buf = &rx_adapter->event_enqueue_buffer; + ring_lock = &rx_adapter->intr_ring_lock; + + if (buf->count >= BATCH_SIZE) + rxa_flush_event_buffer(rx_adapter); + + while (BATCH_SIZE <= (RTE_DIM(buf->events) - buf->count)) { + struct eth_device_info *dev_info; + uint16_t port; + uint16_t queue; + union queue_data qd = rx_adapter->qd; + int err; + + if (!rx_adapter->qd_valid) { + struct eth_rx_queue_info *queue_info; + + rte_spinlock_lock(ring_lock); + err = rte_ring_dequeue(rx_adapter->intr_ring, &qd.ptr); + if (err) { + rte_spinlock_unlock(ring_lock); + break; + } + + port = qd.port; + queue = qd.queue; + rx_adapter->qd = qd; + rx_adapter->qd_valid = 1; + dev_info = &rx_adapter->eth_devices[port]; + if (rxa_shared_intr(dev_info, queue)) + dev_info->shared_intr_enabled = 1; + else { + queue_info = &dev_info->rx_queue[queue]; + queue_info->intr_enabled = 1; + } + rte_eth_dev_rx_intr_enable(port, queue); + rte_spinlock_unlock(ring_lock); + } else { + port = qd.port; + queue = qd.queue; + + dev_info = &rx_adapter->eth_devices[port]; + } + + if (rxa_shared_intr(dev_info, queue)) { + uint16_t i; + uint16_t nb_queues; + + nb_queues = dev_info->dev->data->nb_rx_queues; + n = 0; + for (i = dev_info->next_q_idx; i < nb_queues; i++) { + uint8_t enq_buffer_full; + + if (!rxa_intr_queue(dev_info, i)) + continue; + n = rxa_eth_rx(rx_adapter, port, i, nb_rx, + rx_adapter->max_nb_rx, + &rxq_empty); + nb_rx += n; + + enq_buffer_full = !rxq_empty && n == 0; + max_done = nb_rx > rx_adapter->max_nb_rx; + + if (enq_buffer_full || max_done) { + dev_info->next_q_idx = i; + goto done; + } + } + + rx_adapter->qd_valid = 0; + + /* Reinitialize for next interrupt */ + dev_info->next_q_idx = dev_info->multi_intr_cap ? + RTE_MAX_RXTX_INTR_VEC_ID - 1 : + 0; + } else { + n = rxa_eth_rx(rx_adapter, port, queue, nb_rx, + rx_adapter->max_nb_rx, + &rxq_empty); + rx_adapter->qd_valid = !rxq_empty; + nb_rx += n; + if (nb_rx > rx_adapter->max_nb_rx) + break; + } + } + +done: + rx_adapter->stats.rx_intr_packets += nb_rx; + return nb_rx; } /* @@ -491,12 +1114,10 @@ fill_event_buffer(struct rte_event_eth_rx_adapter *rx_adapter, * it. 
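/*
 * Editorial sketch, not part of the patch: the Linux Virtual Server
 * weighted round-robin step behind rxa_wrr_next() (per the URL cited
 * above its declaration), reduced to a plain weight array. With
 * weights {3, 2}, max_wt = 3 and gcd = 1, repeated calls starting
 * from prev = -1 and cw = -1 (the values rxa_calc_wrr_sequence()
 * uses) yield the period 0 0 1 0 1: queue 0 is polled three times and
 * queue 1 twice per cycle, which is the kind of sequence rxa_poll()
 * walks below.
 */
#include <stdint.h>

static int
wrr_next_sketch(const uint16_t *wt, unsigned int n, int *cw, int prev,
	uint16_t max_wt, uint16_t gcd)
{
	int i = prev;

	while (1) {
		i = (i + 1) % n;
		if (i == 0) {
			*cw = *cw - gcd;
			if (*cw <= 0)
				*cw = max_wt;
		}
		if (wt[i] >= *cw)
			return i;
	}
}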
 */ static inline uint32_t -eth_rx_poll(struct rte_event_eth_rx_adapter *rx_adapter) +rxa_poll(struct rte_event_eth_rx_adapter *rx_adapter) { uint32_t num_queue; - uint16_t n; uint32_t nb_rx = 0; - struct rte_mbuf *mbufs[BATCH_SIZE]; struct rte_eth_event_enqueue_buffer *buf; uint32_t wrr_pos; uint32_t max_nb_rx; @@ -504,7 +1125,7 @@ eth_rx_poll(struct rte_event_eth_rx_adapter *rx_adapter) wrr_pos = rx_adapter->wrr_pos; max_nb_rx = rx_adapter->max_nb_rx; buf = &rx_adapter->event_enqueue_buffer; - struct rte_event_eth_rx_adapter_stats *stats = &rx_adapter->stats; + stats = &rx_adapter->stats; /* Iterate through a WRR sequence */ for (num_queue = 0; num_queue < rx_adapter->wrr_len; num_queue++) { @@ -516,45 +1137,42 @@ * enough space in the enqueue buffer. */ if (buf->count >= BATCH_SIZE) - flush_event_buffer(rx_adapter); - if (BATCH_SIZE > (ETH_EVENT_BUFFER_SIZE - buf->count)) - break; - - stats->rx_poll_count++; - n = rte_eth_rx_burst(d, qid, mbufs, BATCH_SIZE); + rxa_flush_event_buffer(rx_adapter); + if (BATCH_SIZE > (ETH_EVENT_BUFFER_SIZE - buf->count)) { + rx_adapter->wrr_pos = wrr_pos; + return nb_rx; + } - if (n) { - stats->rx_packets += n; - /* The check before rte_eth_rx_burst() ensures that - * all n mbufs can be buffered - */ - fill_event_buffer(rx_adapter, d, qid, mbufs, n); - nb_rx += n; - if (nb_rx > max_nb_rx) { - rx_adapter->wrr_pos = + nb_rx += rxa_eth_rx(rx_adapter, d, qid, nb_rx, max_nb_rx, + NULL); + if (nb_rx > max_nb_rx) { + rx_adapter->wrr_pos = (wrr_pos + 1) % rx_adapter->wrr_len; - return nb_rx; - } + break; } if (++wrr_pos == rx_adapter->wrr_len) wrr_pos = 0; } - return nb_rx; } static int -event_eth_rx_adapter_service_func(void *args) +rxa_service_func(void *args) { struct rte_event_eth_rx_adapter *rx_adapter = args; - struct rte_eth_event_enqueue_buffer *buf; + struct rte_event_eth_rx_adapter_stats *stats; - buf = &rx_adapter->event_enqueue_buffer; if (rte_spinlock_trylock(&rx_adapter->rx_lock) == 0) return 0; - if (eth_rx_poll(rx_adapter) == 0 && buf->count) - flush_event_buffer(rx_adapter); + if (!rx_adapter->rxa_started) { + rte_spinlock_unlock(&rx_adapter->rx_lock); + return 0; + } + + stats = &rx_adapter->stats; + stats->rx_packets += rxa_intr_ring_dequeue(rx_adapter); + stats->rx_packets += rxa_poll(rx_adapter); rte_spinlock_unlock(&rx_adapter->rx_lock); return 0; } @@ -586,14 +1204,14 @@ rte_event_eth_rx_adapter_init(void) } static inline struct rte_event_eth_rx_adapter * -id_to_rx_adapter(uint8_t id) +rxa_id_to_adapter(uint8_t id) { return event_eth_rx_adapter ? event_eth_rx_adapter[id] : NULL; } static int -default_conf_cb(uint8_t id, uint8_t dev_id, +rxa_default_conf_cb(uint8_t id, uint8_t dev_id, struct rte_event_eth_rx_adapter_conf *conf, void *arg) { int ret; @@ -602,7 +1220,7 @@ default_conf_cb(uint8_t id, uint8_t dev_id, int started; uint8_t port_id; struct rte_event_port_conf *port_conf = arg; - struct rte_event_eth_rx_adapter *rx_adapter = id_to_rx_adapter(id); + struct rte_event_eth_rx_adapter *rx_adapter = rxa_id_to_adapter(id); dev = &rte_eventdevs[rx_adapter->eventdev_id]; dev_conf = dev->data->dev_conf; @@ -639,7 +1257,351 @@ default_conf_cb(uint8_t id, uint8_t dev_id, } static int -init_service(struct rte_event_eth_rx_adapter *rx_adapter, uint8_t id) +rxa_epoll_create1(void) +{ +#if defined(LINUX) + int fd; + fd = epoll_create1(EPOLL_CLOEXEC); + return fd < 0 ? 
-errno : fd; +#elif defined(BSD) + return -ENOTSUP; +#endif +} + +static int +rxa_init_epd(struct rte_event_eth_rx_adapter *rx_adapter) +{ + if (rx_adapter->epd != INIT_FD) + return 0; + + rx_adapter->epd = rxa_epoll_create1(); + if (rx_adapter->epd < 0) { + int err = rx_adapter->epd; + rx_adapter->epd = INIT_FD; + RTE_EDEV_LOG_ERR("epoll_create1() failed, err %d", err); + return err; + } + + return 0; +} + +static int +rxa_create_intr_thread(struct rte_event_eth_rx_adapter *rx_adapter) +{ + int err; + char thread_name[RTE_MAX_THREAD_NAME_LEN]; + + if (rx_adapter->intr_ring) + return 0; + + rx_adapter->intr_ring = rte_ring_create("intr_ring", + RTE_EVENT_ETH_INTR_RING_SIZE, + rte_socket_id(), 0); + if (!rx_adapter->intr_ring) + return -ENOMEM; + + rx_adapter->epoll_events = rte_zmalloc_socket(rx_adapter->mem_name, + RTE_EVENT_ETH_INTR_RING_SIZE * + sizeof(struct rte_epoll_event), + RTE_CACHE_LINE_SIZE, + rx_adapter->socket_id); + if (!rx_adapter->epoll_events) { + err = -ENOMEM; + goto error; + } + + rte_spinlock_init(&rx_adapter->intr_ring_lock); + + snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, + "rx-intr-thread-%d", rx_adapter->id); + + err = rte_ctrl_thread_create(&rx_adapter->rx_intr_thread, thread_name, + NULL, rxa_intr_thread, rx_adapter); + if (!err) { + rte_thread_setname(rx_adapter->rx_intr_thread, thread_name); + return 0; + } + + RTE_EDEV_LOG_ERR("Failed to create interrupt thread err = %d\n", err); +error: + rte_ring_free(rx_adapter->intr_ring); + rx_adapter->intr_ring = NULL; + rx_adapter->epoll_events = NULL; + return err; +} + +static int +rxa_destroy_intr_thread(struct rte_event_eth_rx_adapter *rx_adapter) +{ + int err; + + err = pthread_cancel(rx_adapter->rx_intr_thread); + if (err) + RTE_EDEV_LOG_ERR("Can't cancel interrupt thread err = %d\n", + err); + + err = pthread_join(rx_adapter->rx_intr_thread, NULL); + if (err) + RTE_EDEV_LOG_ERR("Can't join interrupt thread err = %d\n", err); + + rte_free(rx_adapter->epoll_events); + rte_ring_free(rx_adapter->intr_ring); + rx_adapter->intr_ring = NULL; + rx_adapter->epoll_events = NULL; + return 0; +} + +static int +rxa_free_intr_resources(struct rte_event_eth_rx_adapter *rx_adapter) +{ + int ret; + + if (rx_adapter->num_rx_intr == 0) + return 0; + + ret = rxa_destroy_intr_thread(rx_adapter); + if (ret) + return ret; + + close(rx_adapter->epd); + rx_adapter->epd = INIT_FD; + + return ret; +} + +static int +rxa_disable_intr(struct rte_event_eth_rx_adapter *rx_adapter, + struct eth_device_info *dev_info, + uint16_t rx_queue_id) +{ + int err; + uint16_t eth_dev_id = dev_info->dev->data->port_id; + int sintr = rxa_shared_intr(dev_info, rx_queue_id); + + err = rte_eth_dev_rx_intr_disable(eth_dev_id, rx_queue_id); + if (err) { + RTE_EDEV_LOG_ERR("Could not disable interrupt for Rx queue %u", + rx_queue_id); + return err; + } + + err = rte_eth_dev_rx_intr_ctl_q(eth_dev_id, rx_queue_id, + rx_adapter->epd, + RTE_INTR_EVENT_DEL, + 0); + if (err) + RTE_EDEV_LOG_ERR("Interrupt event deletion failed %d", err); + + if (sintr) + dev_info->rx_queue[rx_queue_id].intr_enabled = 0; + else + dev_info->shared_intr_enabled = 0; + return err; +} + +static int +rxa_del_intr_queue(struct rte_event_eth_rx_adapter *rx_adapter, + struct eth_device_info *dev_info, + int rx_queue_id) +{ + int err; + int i; + int s; + + if (dev_info->nb_rx_intr == 0) + return 0; + + err = 0; + if (rx_queue_id == -1) { + s = dev_info->nb_shared_intr; + for (i = 0; i < dev_info->nb_rx_intr; i++) { + int sintr; + uint16_t q; + + q = dev_info->intr_queue[i]; + sintr = 
rxa_shared_intr(dev_info, q); + s -= sintr; + + if (!sintr || s == 0) { + + err = rxa_disable_intr(rx_adapter, dev_info, + q); + if (err) + return err; + rxa_intr_ring_del_entries(rx_adapter, dev_info, + q); + } + } + } else { + if (!rxa_intr_queue(dev_info, rx_queue_id)) + return 0; + if (!rxa_shared_intr(dev_info, rx_queue_id) || + dev_info->nb_shared_intr == 1) { + err = rxa_disable_intr(rx_adapter, dev_info, + rx_queue_id); + if (err) + return err; + rxa_intr_ring_del_entries(rx_adapter, dev_info, + rx_queue_id); + } + + for (i = 0; i < dev_info->nb_rx_intr; i++) { + if (dev_info->intr_queue[i] == rx_queue_id) { + for (; i < dev_info->nb_rx_intr - 1; i++) + dev_info->intr_queue[i] = + dev_info->intr_queue[i + 1]; + break; + } + } + } + + return err; +} + +static int +rxa_config_intr(struct rte_event_eth_rx_adapter *rx_adapter, + struct eth_device_info *dev_info, + uint16_t rx_queue_id) +{ + int err, err1; + uint16_t eth_dev_id = dev_info->dev->data->port_id; + union queue_data qd; + int init_fd; + uint16_t *intr_queue; + int sintr = rxa_shared_intr(dev_info, rx_queue_id); + + if (rxa_intr_queue(dev_info, rx_queue_id)) + return 0; + + intr_queue = dev_info->intr_queue; + if (dev_info->intr_queue == NULL) { + size_t len = + dev_info->dev->data->nb_rx_queues * sizeof(uint16_t); + dev_info->intr_queue = + rte_zmalloc_socket( + rx_adapter->mem_name, + len, + 0, + rx_adapter->socket_id); + if (dev_info->intr_queue == NULL) + return -ENOMEM; + } + + init_fd = rx_adapter->epd; + err = rxa_init_epd(rx_adapter); + if (err) + goto err_free_queue; + + qd.port = eth_dev_id; + qd.queue = rx_queue_id; + + err = rte_eth_dev_rx_intr_ctl_q(eth_dev_id, rx_queue_id, + rx_adapter->epd, + RTE_INTR_EVENT_ADD, + qd.ptr); + if (err) { + RTE_EDEV_LOG_ERR("Failed to add interrupt event for" + " Rx Queue %u err %d", rx_queue_id, err); + goto err_del_fd; + } + + err = rte_eth_dev_rx_intr_enable(eth_dev_id, rx_queue_id); + if (err) { + RTE_EDEV_LOG_ERR("Could not enable interrupt for" + " Rx Queue %u err %d", rx_queue_id, err); + + goto err_del_event; + } + + err = rxa_create_intr_thread(rx_adapter); + if (!err) { + if (sintr) + dev_info->shared_intr_enabled = 1; + else + dev_info->rx_queue[rx_queue_id].intr_enabled = 1; + return 0; + } + + + err = rte_eth_dev_rx_intr_disable(eth_dev_id, rx_queue_id); + if (err) + RTE_EDEV_LOG_ERR("Could not disable interrupt for" + " Rx Queue %u err %d", rx_queue_id, err); +err_del_event: + err1 = rte_eth_dev_rx_intr_ctl_q(eth_dev_id, rx_queue_id, + rx_adapter->epd, + RTE_INTR_EVENT_DEL, + 0); + if (err1) { + RTE_EDEV_LOG_ERR("Could not delete event for" + " Rx Queue %u err %d", rx_queue_id, err1); + } +err_del_fd: + if (init_fd == INIT_FD) { + close(rx_adapter->epd); + rx_adapter->epd = -1; + } +err_free_queue: + if (intr_queue == NULL) + rte_free(dev_info->intr_queue); + + return err; +} + +static int +rxa_add_intr_queue(struct rte_event_eth_rx_adapter *rx_adapter, + struct eth_device_info *dev_info, + int rx_queue_id) + +{ + int i, j, err; + int si = -1; + int shared_done = (dev_info->nb_shared_intr > 0); + + if (rx_queue_id != -1) { + if (rxa_shared_intr(dev_info, rx_queue_id) && shared_done) + return 0; + return rxa_config_intr(rx_adapter, dev_info, rx_queue_id); + } + + err = 0; + for (i = 0; i < dev_info->dev->data->nb_rx_queues; i++) { + + if (rxa_shared_intr(dev_info, i) && shared_done) + continue; + + err = rxa_config_intr(rx_adapter, dev_info, i); + + shared_done = err == 0 && rxa_shared_intr(dev_info, i); + if (shared_done) { + si = i; + dev_info->shared_intr_enabled = 
1; + } + if (err) + break; + } + + if (err == 0) + return 0; + + shared_done = (dev_info->nb_shared_intr > 0); + for (j = 0; j < i; j++) { + if (rxa_intr_queue(dev_info, j)) + continue; + if (rxa_shared_intr(dev_info, j) && si != j) + continue; + err = rxa_disable_intr(rx_adapter, dev_info, j); + if (err) + break; + + } + + return err; +} + + +static int +rxa_init_service(struct rte_event_eth_rx_adapter *rx_adapter, uint8_t id) { int ret; struct rte_service_spec service; @@ -652,7 +1614,7 @@ init_service(struct rte_event_eth_rx_adapter *rx_adapter, uint8_t id) snprintf(service.name, ETH_RX_ADAPTER_SERVICE_NAME_LEN, "rte_event_eth_rx_adapter_%d", id); service.socket_id = rx_adapter->socket_id; - service.callback = event_eth_rx_adapter_service_func; + service.callback = rxa_service_func; service.callback_userdata = rx_adapter; /* Service function handles locking for queue add/del updates */ service.capabilities = RTE_SERVICE_CAP_MT_SAFE; @@ -673,6 +1635,7 @@ init_service(struct rte_event_eth_rx_adapter *rx_adapter, uint8_t id) rx_adapter->event_port_id = rx_adapter_conf.event_port_id; rx_adapter->max_nb_rx = rx_adapter_conf.max_nb_rx; rx_adapter->service_inited = 1; + rx_adapter->epd = INIT_FD; return 0; err_done: @@ -680,9 +1643,8 @@ err_done: return ret; } - static void -update_queue_info(struct rte_event_eth_rx_adapter *rx_adapter, +rxa_update_queue(struct rte_event_eth_rx_adapter *rx_adapter, struct eth_device_info *dev_info, int32_t rx_queue_id, uint8_t add) @@ -696,7 +1658,7 @@ update_queue_info(struct rte_event_eth_rx_adapter *rx_adapter, if (rx_queue_id == -1) { for (i = 0; i < dev_info->dev->data->nb_rx_queues; i++) - update_queue_info(rx_adapter, dev_info, i, add); + rxa_update_queue(rx_adapter, dev_info, i, add); } else { queue_info = &dev_info->rx_queue[rx_queue_id]; enabled = queue_info->queue_enabled; @@ -711,31 +1673,65 @@ update_queue_info(struct rte_event_eth_rx_adapter *rx_adapter, } } -static int -event_eth_rx_adapter_queue_del(struct rte_event_eth_rx_adapter *rx_adapter, - struct eth_device_info *dev_info, - uint16_t rx_queue_id) +static void +rxa_sw_del(struct rte_event_eth_rx_adapter *rx_adapter, + struct eth_device_info *dev_info, + int32_t rx_queue_id) { - struct eth_rx_queue_info *queue_info; + int pollq; + int intrq; + int sintrq; + if (rx_adapter->nb_queues == 0) - return 0; + return; - queue_info = &dev_info->rx_queue[rx_queue_id]; - rx_adapter->num_rx_polled -= queue_info->queue_enabled; - update_queue_info(rx_adapter, dev_info, rx_queue_id, 0); - return 0; + if (rx_queue_id == -1) { + uint16_t nb_rx_queues; + uint16_t i; + + nb_rx_queues = dev_info->dev->data->nb_rx_queues; + for (i = 0; i < nb_rx_queues; i++) + rxa_sw_del(rx_adapter, dev_info, i); + return; + } + + pollq = rxa_polled_queue(dev_info, rx_queue_id); + intrq = rxa_intr_queue(dev_info, rx_queue_id); + sintrq = rxa_shared_intr(dev_info, rx_queue_id); + rxa_update_queue(rx_adapter, dev_info, rx_queue_id, 0); + rx_adapter->num_rx_polled -= pollq; + dev_info->nb_rx_poll -= pollq; + rx_adapter->num_rx_intr -= intrq; + dev_info->nb_rx_intr -= intrq; + dev_info->nb_shared_intr -= intrq && sintrq; } static void -event_eth_rx_adapter_queue_add(struct rte_event_eth_rx_adapter *rx_adapter, - struct eth_device_info *dev_info, - uint16_t rx_queue_id, - const struct rte_event_eth_rx_adapter_queue_conf *conf) - +rxa_add_queue(struct rte_event_eth_rx_adapter *rx_adapter, + struct eth_device_info *dev_info, + int32_t rx_queue_id, + const struct rte_event_eth_rx_adapter_queue_conf *conf) { struct eth_rx_queue_info 
*queue_info; const struct rte_event *ev = &conf->ev; + int pollq; + int intrq; + int sintrq; + + if (rx_queue_id == -1) { + uint16_t nb_rx_queues; + uint16_t i; + + nb_rx_queues = dev_info->dev->data->nb_rx_queues; + for (i = 0; i < nb_rx_queues; i++) + rxa_add_queue(rx_adapter, dev_info, i, conf); + return; + } + + pollq = rxa_polled_queue(dev_info, rx_queue_id); + intrq = rxa_intr_queue(dev_info, rx_queue_id); + sintrq = rxa_shared_intr(dev_info, rx_queue_id); queue_info = &dev_info->rx_queue[rx_queue_id]; queue_info->event_queue_id = ev->queue_id; @@ -749,69 +1745,162 @@ event_eth_rx_adapter_queue_add(struct rte_event_eth_rx_adapter *rx_adapter, queue_info->flow_id_mask = ~0; } - /* The same queue can be added more than once */ - rx_adapter->num_rx_polled += !queue_info->queue_enabled; - update_queue_info(rx_adapter, dev_info, rx_queue_id, 1); + rxa_update_queue(rx_adapter, dev_info, rx_queue_id, 1); + if (rxa_polled_queue(dev_info, rx_queue_id)) { + rx_adapter->num_rx_polled += !pollq; + dev_info->nb_rx_poll += !pollq; + rx_adapter->num_rx_intr -= intrq; + dev_info->nb_rx_intr -= intrq; + dev_info->nb_shared_intr -= intrq && sintrq; + } + + if (rxa_intr_queue(dev_info, rx_queue_id)) { + rx_adapter->num_rx_polled -= pollq; + dev_info->nb_rx_poll -= pollq; + rx_adapter->num_rx_intr += !intrq; + dev_info->nb_rx_intr += !intrq; + dev_info->nb_shared_intr += !intrq && sintrq; + if (dev_info->nb_shared_intr == 1) { + if (dev_info->multi_intr_cap) + dev_info->next_q_idx = + RTE_MAX_RXTX_INTR_VEC_ID - 1; + else + dev_info->next_q_idx = 0; + } + } } -static int add_rx_queue(struct rte_event_eth_rx_adapter *rx_adapter, +static int rxa_sw_add(struct rte_event_eth_rx_adapter *rx_adapter, uint16_t eth_dev_id, int rx_queue_id, const struct rte_event_eth_rx_adapter_queue_conf *queue_conf) { struct eth_device_info *dev_info = &rx_adapter->eth_devices[eth_dev_id]; struct rte_event_eth_rx_adapter_queue_conf temp_conf; - uint32_t i; int ret; + struct eth_rx_poll_entry *rx_poll; + struct eth_rx_queue_info *rx_queue; + uint32_t *rx_wrr; + uint16_t nb_rx_queues; + uint32_t nb_rx_poll, nb_wrr; + uint32_t nb_rx_intr; + int num_intr_vec; + uint16_t wt; if (queue_conf->servicing_weight == 0) { - struct rte_eth_dev_data *data = dev_info->dev->data; - if (data->dev_conf.intr_conf.rxq) { - RTE_EDEV_LOG_ERR("Interrupt driven queues" - " not supported"); - return -ENOTSUP; - } - temp_conf = *queue_conf; - /* If Rx interrupts are disabled set wt = 1 */ - temp_conf.servicing_weight = 1; + temp_conf = *queue_conf; + if (!data->dev_conf.intr_conf.rxq) { + /* If Rx interrupts are disabled set wt = 1 */ + temp_conf.servicing_weight = 1; + } queue_conf = &temp_conf; } + nb_rx_queues = dev_info->dev->data->nb_rx_queues; + rx_queue = dev_info->rx_queue; + wt = queue_conf->servicing_weight; + if (dev_info->rx_queue == NULL) { dev_info->rx_queue = rte_zmalloc_socket(rx_adapter->mem_name, - dev_info->dev->data->nb_rx_queues * + nb_rx_queues * sizeof(struct eth_rx_queue_info), 0, rx_adapter->socket_id); if (dev_info->rx_queue == NULL) return -ENOMEM; } + rx_wrr = NULL; + rx_poll = NULL; - if (rx_queue_id == -1) { - for (i = 0; i < dev_info->dev->data->nb_rx_queues; i++) - event_eth_rx_adapter_queue_add(rx_adapter, - dev_info, i, - queue_conf); + rxa_calc_nb_post_add(rx_adapter, dev_info, rx_queue_id, + queue_conf->servicing_weight, + &nb_rx_poll, &nb_rx_intr, &nb_wrr); + + if (dev_info->dev->intr_handle) + dev_info->multi_intr_cap = + rte_intr_cap_multiple(dev_info->dev->intr_handle); + + ret = rxa_alloc_poll_arrays(rx_adapter, 
nb_rx_poll, nb_wrr, + &rx_poll, &rx_wrr); + if (ret) + goto err_free_rxqueue; + + if (wt == 0) { + num_intr_vec = rxa_nb_intr_vect(dev_info, rx_queue_id, 1); + + ret = rxa_intr_ring_check_avail(rx_adapter, num_intr_vec); + if (ret) + goto err_free_rxqueue; + + ret = rxa_add_intr_queue(rx_adapter, dev_info, rx_queue_id); + if (ret) + goto err_free_rxqueue; } else { - event_eth_rx_adapter_queue_add(rx_adapter, dev_info, - (uint16_t)rx_queue_id, - queue_conf); + + num_intr_vec = 0; + if (rx_adapter->num_rx_intr > nb_rx_intr) { + num_intr_vec = rxa_nb_intr_vect(dev_info, + rx_queue_id, 0); + /* interrupt based queues are being converted to + * poll mode queues, delete the interrupt configuration + * for those. + */ + ret = rxa_del_intr_queue(rx_adapter, + dev_info, rx_queue_id); + if (ret) + goto err_free_rxqueue; + } } - ret = eth_poll_wrr_calc(rx_adapter); - if (ret) { - event_eth_rx_adapter_queue_del(rx_adapter, - dev_info, rx_queue_id); - return ret; + if (nb_rx_intr == 0) { + ret = rxa_free_intr_resources(rx_adapter); + if (ret) + goto err_free_rxqueue; } - return ret; + if (wt == 0) { + uint16_t i; + + if (rx_queue_id == -1) { + for (i = 0; i < dev_info->dev->data->nb_rx_queues; i++) + dev_info->intr_queue[i] = i; + } else { + if (!rxa_intr_queue(dev_info, rx_queue_id)) + dev_info->intr_queue[nb_rx_intr - 1] = + rx_queue_id; + } + } + + + + rxa_add_queue(rx_adapter, dev_info, rx_queue_id, queue_conf); + rxa_calc_wrr_sequence(rx_adapter, rx_poll, rx_wrr); + + rte_free(rx_adapter->eth_rx_poll); + rte_free(rx_adapter->wrr_sched); + + rx_adapter->eth_rx_poll = rx_poll; + rx_adapter->wrr_sched = rx_wrr; + rx_adapter->wrr_len = nb_wrr; + rx_adapter->num_intr_vec += num_intr_vec; + return 0; + +err_free_rxqueue: + if (rx_queue == NULL) { + rte_free(dev_info->rx_queue); + dev_info->rx_queue = NULL; + } + + rte_free(rx_poll); + rte_free(rx_wrr); + + return 0; } static int -rx_adapter_ctrl(uint8_t id, int start) +rxa_ctrl(uint8_t id, int start) { struct rte_event_eth_rx_adapter *rx_adapter; struct rte_eventdev *dev; @@ -821,7 +1910,7 @@ rx_adapter_ctrl(uint8_t id, int start) int stop = !start; RTE_EVENT_ETH_RX_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL); - rx_adapter = id_to_rx_adapter(id); + rx_adapter = rxa_id_to_adapter(id); if (rx_adapter == NULL) return -EINVAL; @@ -845,8 +1934,12 @@ rx_adapter_ctrl(uint8_t id, int start) &rte_eth_devices[i]); } - if (use_service) + if (use_service) { + rte_spinlock_lock(&rx_adapter->rx_lock); + rx_adapter->rxa_started = start; rte_service_runstate_set(rx_adapter->service_id, start); + rte_spinlock_unlock(&rx_adapter->rx_lock); + } return 0; } @@ -880,7 +1973,7 @@ rte_event_eth_rx_adapter_create_ext(uint8_t id, uint8_t dev_id, return ret; } - rx_adapter = id_to_rx_adapter(id); + rx_adapter = rxa_id_to_adapter(id); if (rx_adapter != NULL) { RTE_EDEV_LOG_ERR("Eth Rx adapter exists id = %" PRIu8, id); return -EEXIST; @@ -902,6 +1995,7 @@ rte_event_eth_rx_adapter_create_ext(uint8_t id, uint8_t dev_id, rx_adapter->socket_id = socket_id; rx_adapter->conf_cb = conf_cb; rx_adapter->conf_arg = conf_arg; + rx_adapter->id = id; strcpy(rx_adapter->mem_name, mem_name); rx_adapter->eth_devices = rte_zmalloc_socket(rx_adapter->mem_name, /* FIXME: incompatible with hotplug */ @@ -922,7 +2016,7 @@ rte_event_eth_rx_adapter_create_ext(uint8_t id, uint8_t dev_id, rx_adapter->eth_devices[i].dev = &rte_eth_devices[i]; event_eth_rx_adapter[id] = rx_adapter; - if (conf_cb == default_conf_cb) + if (conf_cb == rxa_default_conf_cb) rx_adapter->default_cb_arg = 1; return 0; } @@ -943,7 
+2037,7 @@ rte_event_eth_rx_adapter_create(uint8_t id, uint8_t dev_id, return -ENOMEM; *pc = *port_config; ret = rte_event_eth_rx_adapter_create_ext(id, dev_id, - default_conf_cb, + rxa_default_conf_cb, pc); if (ret) rte_free(pc); @@ -957,7 +2051,7 @@ rte_event_eth_rx_adapter_free(uint8_t id) RTE_EVENT_ETH_RX_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL); - rx_adapter = id_to_rx_adapter(id); + rx_adapter = rxa_id_to_adapter(id); if (rx_adapter == NULL) return -EINVAL; @@ -987,12 +2081,11 @@ rte_event_eth_rx_adapter_queue_add(uint8_t id, struct rte_event_eth_rx_adapter *rx_adapter; struct rte_eventdev *dev; struct eth_device_info *dev_info; - int start_service; RTE_EVENT_ETH_RX_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL); RTE_ETH_VALID_PORTID_OR_ERR_RET(eth_dev_id, -EINVAL); - rx_adapter = id_to_rx_adapter(id); + rx_adapter = rxa_id_to_adapter(id); if ((rx_adapter == NULL) || (queue_conf == NULL)) return -EINVAL; @@ -1030,7 +2123,6 @@ rte_event_eth_rx_adapter_queue_add(uint8_t id, return -EINVAL; } - start_service = 0; dev_info = &rx_adapter->eth_devices[eth_dev_id]; if (cap & RTE_EVENT_ETH_RX_ADAPTER_CAP_INTERNAL_PORT) { @@ -1050,28 +2142,29 @@ rte_event_eth_rx_adapter_queue_add(uint8_t id, &rte_eth_devices[eth_dev_id], rx_queue_id, queue_conf); if (ret == 0) { - update_queue_info(rx_adapter, + dev_info->internal_event_port = 1; + rxa_update_queue(rx_adapter, &rx_adapter->eth_devices[eth_dev_id], rx_queue_id, 1); } } else { rte_spinlock_lock(&rx_adapter->rx_lock); - ret = init_service(rx_adapter, id); - if (ret == 0) - ret = add_rx_queue(rx_adapter, eth_dev_id, rx_queue_id, + dev_info->internal_event_port = 0; + ret = rxa_init_service(rx_adapter, id); + if (ret == 0) { + uint32_t service_id = rx_adapter->service_id; + ret = rxa_sw_add(rx_adapter, eth_dev_id, rx_queue_id, queue_conf); + rte_service_component_runstate_set(service_id, + rxa_sw_adapter_queue_count(rx_adapter)); + } rte_spinlock_unlock(&rx_adapter->rx_lock); - if (ret == 0) - start_service = !!sw_rx_adapter_queue_count(rx_adapter); } if (ret) return ret; - if (start_service) - rte_service_component_runstate_set(rx_adapter->service_id, 1); - return 0; } @@ -1084,12 +2177,17 @@ rte_event_eth_rx_adapter_queue_del(uint8_t id, uint16_t eth_dev_id, struct rte_event_eth_rx_adapter *rx_adapter; struct eth_device_info *dev_info; uint32_t cap; - uint16_t i; + uint32_t nb_rx_poll = 0; + uint32_t nb_wrr = 0; + uint32_t nb_rx_intr; + struct eth_rx_poll_entry *rx_poll = NULL; + uint32_t *rx_wrr = NULL; + int num_intr_vec; RTE_EVENT_ETH_RX_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL); RTE_ETH_VALID_PORTID_OR_ERR_RET(eth_dev_id, -EINVAL); - rx_adapter = id_to_rx_adapter(id); + rx_adapter = rxa_id_to_adapter(id); if (rx_adapter == NULL) return -EINVAL; @@ -1116,7 +2214,7 @@ rte_event_eth_rx_adapter_queue_del(uint8_t id, uint16_t eth_dev_id, &rte_eth_devices[eth_dev_id], rx_queue_id); if (ret == 0) { - update_queue_info(rx_adapter, + rxa_update_queue(rx_adapter, &rx_adapter->eth_devices[eth_dev_id], rx_queue_id, 0); @@ -1126,48 +2224,78 @@ rte_event_eth_rx_adapter_queue_del(uint8_t id, uint16_t eth_dev_id, } } } else { - int rc; + rxa_calc_nb_post_del(rx_adapter, dev_info, rx_queue_id, + &nb_rx_poll, &nb_rx_intr, &nb_wrr); + + ret = rxa_alloc_poll_arrays(rx_adapter, nb_rx_poll, nb_wrr, + &rx_poll, &rx_wrr); + if (ret) + return ret; + rte_spinlock_lock(&rx_adapter->rx_lock); - if (rx_queue_id == -1) { - for (i = 0; i < dev_info->dev->data->nb_rx_queues; i++) - event_eth_rx_adapter_queue_del(rx_adapter, - dev_info, - i); - } else { - 
event_eth_rx_adapter_queue_del(rx_adapter, - dev_info, - (uint16_t)rx_queue_id); + + num_intr_vec = 0; + if (rx_adapter->num_rx_intr > nb_rx_intr) { + + num_intr_vec = rxa_nb_intr_vect(dev_info, + rx_queue_id, 0); + ret = rxa_del_intr_queue(rx_adapter, dev_info, + rx_queue_id); + if (ret) + goto unlock_ret; + } + + if (nb_rx_intr == 0) { + ret = rxa_free_intr_resources(rx_adapter); + if (ret) + goto unlock_ret; + } + + rxa_sw_del(rx_adapter, dev_info, rx_queue_id); + rxa_calc_wrr_sequence(rx_adapter, rx_poll, rx_wrr); + + rte_free(rx_adapter->eth_rx_poll); + rte_free(rx_adapter->wrr_sched); + + if (nb_rx_intr == 0) { + rte_free(dev_info->intr_queue); + dev_info->intr_queue = NULL; } - rc = eth_poll_wrr_calc(rx_adapter); - if (rc) - RTE_EDEV_LOG_ERR("WRR recalculation failed %" PRId32, - rc); + rx_adapter->eth_rx_poll = rx_poll; + rx_adapter->wrr_sched = rx_wrr; + rx_adapter->wrr_len = nb_wrr; + rx_adapter->num_intr_vec += num_intr_vec; if (dev_info->nb_dev_queues == 0) { rte_free(dev_info->rx_queue); dev_info->rx_queue = NULL; } - +unlock_ret: rte_spinlock_unlock(&rx_adapter->rx_lock); + if (ret) { + rte_free(rx_poll); + rte_free(rx_wrr); + return ret; + } + rte_service_component_runstate_set(rx_adapter->service_id, - sw_rx_adapter_queue_count(rx_adapter)); + rxa_sw_adapter_queue_count(rx_adapter)); } return ret; } - int rte_event_eth_rx_adapter_start(uint8_t id) { - return rx_adapter_ctrl(id, 1); + return rxa_ctrl(id, 1); } int rte_event_eth_rx_adapter_stop(uint8_t id) { - return rx_adapter_ctrl(id, 0); + return rxa_ctrl(id, 0); } int @@ -1184,7 +2312,7 @@ rte_event_eth_rx_adapter_stats_get(uint8_t id, RTE_EVENT_ETH_RX_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL); - rx_adapter = id_to_rx_adapter(id); + rx_adapter = rxa_id_to_adapter(id); if (rx_adapter == NULL || stats == NULL) return -EINVAL; @@ -1222,7 +2350,7 @@ rte_event_eth_rx_adapter_stats_reset(uint8_t id) RTE_EVENT_ETH_RX_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL); - rx_adapter = id_to_rx_adapter(id); + rx_adapter = rxa_id_to_adapter(id); if (rx_adapter == NULL) return -EINVAL; @@ -1247,7 +2375,7 @@ rte_event_eth_rx_adapter_service_id_get(uint8_t id, uint32_t *service_id) RTE_EVENT_ETH_RX_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL); - rx_adapter = id_to_rx_adapter(id); + rx_adapter = rxa_id_to_adapter(id); if (rx_adapter == NULL || service_id == NULL) return -EINVAL; @@ -1256,3 +2384,47 @@ rte_event_eth_rx_adapter_service_id_get(uint8_t id, uint32_t *service_id) return rx_adapter->service_inited ? 
0 : -ESRCH; } + +int rte_event_eth_rx_adapter_cb_register(uint8_t id, + uint16_t eth_dev_id, + rte_event_eth_rx_adapter_cb_fn cb_fn, + void *cb_arg) +{ + struct rte_event_eth_rx_adapter *rx_adapter; + struct eth_device_info *dev_info; + uint32_t cap; + int ret; + + RTE_EVENT_ETH_RX_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL); + RTE_ETH_VALID_PORTID_OR_ERR_RET(eth_dev_id, -EINVAL); + + rx_adapter = rxa_id_to_adapter(id); + if (rx_adapter == NULL) + return -EINVAL; + + dev_info = &rx_adapter->eth_devices[eth_dev_id]; + if (dev_info->rx_queue == NULL) + return -EINVAL; + + ret = rte_event_eth_rx_adapter_caps_get(rx_adapter->eventdev_id, + eth_dev_id, + &cap); + if (ret) { + RTE_EDEV_LOG_ERR("Failed to get adapter caps edev %" PRIu8 + "eth port %" PRIu16, id, eth_dev_id); + return ret; + } + + if (cap & RTE_EVENT_ETH_RX_ADAPTER_CAP_INTERNAL_PORT) { + RTE_EDEV_LOG_ERR("Rx callback not supported for eth port %" + PRIu16, eth_dev_id); + return -EINVAL; + } + + rte_spinlock_lock(&rx_adapter->rx_lock); + dev_info->cb_fn = cb_fn; + dev_info->cb_arg = cb_arg; + rte_spinlock_unlock(&rx_adapter->rx_lock); + + return 0; +} diff --git a/lib/librte_eventdev/rte_event_eth_rx_adapter.h b/lib/librte_eventdev/rte_event_eth_rx_adapter.h index 307b2b50..332ee216 100644 --- a/lib/librte_eventdev/rte_event_eth_rx_adapter.h +++ b/lib/librte_eventdev/rte_event_eth_rx_adapter.h @@ -63,9 +63,22 @@ * rte_event_eth_rx_adapter_service_id_get() function can be used to retrieve * the service function ID of the adapter in this case. * + * For SW based packet transfers, i.e., when the + * RTE_EVENT_ETH_RX_ADAPTER_CAP_INTERNAL_PORT is not set in the adapter's + * capabilities flags for a particular ethernet device, the service function + * temporarily enqueues mbufs to an event buffer before batch enqueueing these + * to the event device. If the buffer fills up, the service function stops + * dequeueing packets from the ethernet device. The application may want to + * monitor the buffer fill level and instruct the service function to + * selectively buffer packets. The application may also use some other + * criteria to decide which packets should enter the event device even when + * the event buffer fill level is low. The + * rte_event_eth_rx_adapter_cb_register() function allows the + * application to register a callback that selects which packets to enqueue + * to the event device. + * * Note: - * 1) Interrupt driven receive queues are currently unimplemented. - * 2) Devices created after an instance of rte_event_eth_rx_adapter_create + * 1) Devices created after an instance of rte_event_eth_rx_adapter_create * should be added to a new instance of the rx adapter. */ @@ -199,12 +212,55 @@ struct rte_event_eth_rx_adapter_stats { * block cycles can be used to compute the percentage of * cycles the service is blocked by the event device. */ + uint64_t rx_intr_packets; + /**< Received packet count for interrupt mode Rx queues */ }; /** * @warning * @b EXPERIMENTAL: this API may change without prior notice * + * Callback function invoked by the SW adapter before it continues + * to process packets. The callback is passed the size of the enqueue + * buffer in the SW adapter and the occupancy of the buffer. The + * callback can use these values to decide which mbufs should be + * enqueued to the event device. If the return value of the callback + * is less than nb_mbuf then the SW adapter uses the return value to + * enqueue enq_mbuf[] to the event device. + * + * @param eth_dev_id + * Port identifier of the Ethernet device. 
+ * @param queue_id + * Receive queue index. + * @param enqueue_buf_size + * Total enqueue buffer size. + * @param enqueue_buf_count + * mbuf count in enqueue buffer. + * @param mbuf + * mbuf array. + * @param nb_mbuf + * mbuf count. + * @param cb_arg + * Callback argument. + * @param[out] enq_mbuf + * The adapter enqueues enq_mbuf[] if the return value of the + * callback is less than nb_mbuf + * @return + * Returns the number of mbufs should be enqueued to eventdev + */ +typedef uint16_t (*rte_event_eth_rx_adapter_cb_fn)(uint16_t eth_dev_id, + uint16_t queue_id, + uint32_t enqueue_buf_size, + uint32_t enqueue_buf_count, + struct rte_mbuf **mbuf, + uint16_t nb_mbuf, + void *cb_arg, + struct rte_mbuf **enq_buf); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * * Create a new ethernet Rx event adapter with the specified identifier. * * @param id @@ -425,6 +481,32 @@ int rte_event_eth_rx_adapter_stats_reset(uint8_t id); */ int rte_event_eth_rx_adapter_service_id_get(uint8_t id, uint32_t *service_id); +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Register callback to process Rx packets, this is supported for + * SW based packet transfers. + * @see rte_event_eth_rx_cb_fn + * + * @param id + * Adapter identifier. + * @param eth_dev_id + * Port identifier of Ethernet device. + * @param cb_fn + * Callback function. + * @param cb_arg + * Callback arg. + * @return + * - 0: Success + * - <0: Error code on failure. + */ +int __rte_experimental +rte_event_eth_rx_adapter_cb_register(uint8_t id, + uint16_t eth_dev_id, + rte_event_eth_rx_adapter_cb_fn cb_fn, + void *cb_arg); + #ifdef __cplusplus } #endif diff --git a/lib/librte_eventdev/rte_event_ring.c b/lib/librte_eventdev/rte_event_ring.c index eb67751d..16d02a95 100644 --- a/lib/librte_eventdev/rte_event_ring.c +++ b/lib/librte_eventdev/rte_event_ring.c @@ -82,11 +82,16 @@ rte_event_ring_create(const char *name, unsigned int count, int socket_id, mz = rte_memzone_reserve(mz_name, ring_size, socket_id, mz_flags); if (mz != NULL) { r = mz->addr; - /* - * no need to check return value here, we already checked the - * arguments above - */ - rte_event_ring_init(r, name, requested_count, flags); + /* Check return value in case rte_ring_init() fails on size */ + int err = rte_event_ring_init(r, name, requested_count, flags); + if (err) { + RTE_LOG(ERR, RING, "Ring init failed\n"); + if (rte_memzone_free(mz) != 0) + RTE_LOG(ERR, RING, "Cannot free memzone\n"); + rte_free(te); + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + return NULL; + } te->data = (void *) r; r->r.memzone = mz; diff --git a/lib/librte_eventdev/rte_event_timer_adapter.c b/lib/librte_eventdev/rte_event_timer_adapter.c index 6f1d672c..79070d48 100644 --- a/lib/librte_eventdev/rte_event_timer_adapter.c +++ b/lib/librte_eventdev/rte_event_timer_adapter.c @@ -1282,9 +1282,7 @@ static const struct rte_event_timer_adapter_ops sw_event_adapter_timer_ops = { .cancel_burst = sw_event_timer_cancel_burst, }; -RTE_INIT(event_timer_adapter_init_log); -static void -event_timer_adapter_init_log(void) +RTE_INIT(event_timer_adapter_init_log) { evtim_logtype = rte_log_register("lib.eventdev.adapter.timer"); if (evtim_logtype >= 0) diff --git a/lib/librte_eventdev/rte_eventdev.c b/lib/librte_eventdev/rte_eventdev.c index 7ca9fd14..801810ed 100644 --- a/lib/librte_eventdev/rte_eventdev.c +++ b/lib/librte_eventdev/rte_eventdev.c @@ -57,16 +57,21 @@ int rte_event_dev_get_dev_id(const char *name) { int i; + uint8_t cmp; if (!name) 
return -EINVAL; - for (i = 0; i < rte_eventdev_globals->nb_devs; i++) - if ((strcmp(rte_event_devices[i].data->name, name) - == 0) && - (rte_event_devices[i].attached == - RTE_EVENTDEV_ATTACHED)) + for (i = 0; i < rte_eventdev_globals->nb_devs; i++) { + cmp = (strncmp(rte_event_devices[i].data->name, name, + RTE_EVENTDEV_NAME_MAX_LEN) == 0) || + (rte_event_devices[i].dev ? (strncmp( + rte_event_devices[i].dev->driver->name, name, + RTE_EVENTDEV_NAME_MAX_LEN) == 0) : 0); + if (cmp && (rte_event_devices[i].attached == + RTE_EVENTDEV_ATTACHED)) return i; + } return -ENODEV; } diff --git a/lib/librte_eventdev/rte_eventdev_version.map b/lib/librte_eventdev/rte_eventdev_version.map index c3f18d6d..12835e9f 100644 --- a/lib/librte_eventdev/rte_eventdev_version.map +++ b/lib/librte_eventdev/rte_eventdev_version.map @@ -83,6 +83,19 @@ DPDK_18.05 { EXPERIMENTAL { global: + rte_event_crypto_adapter_caps_get; + rte_event_crypto_adapter_create; + rte_event_crypto_adapter_create_ext; + rte_event_crypto_adapter_event_port_get; + rte_event_crypto_adapter_free; + rte_event_crypto_adapter_queue_pair_add; + rte_event_crypto_adapter_queue_pair_del; + rte_event_crypto_adapter_service_id_get; + rte_event_crypto_adapter_start; + rte_event_crypto_adapter_stats_get; + rte_event_crypto_adapter_stats_reset; + rte_event_crypto_adapter_stop; + rte_event_eth_rx_adapter_cb_register; rte_event_timer_adapter_caps_get; rte_event_timer_adapter_create; rte_event_timer_adapter_create_ext; @@ -97,16 +110,4 @@ EXPERIMENTAL { rte_event_timer_arm_burst; rte_event_timer_arm_tmo_tick_burst; rte_event_timer_cancel_burst; - rte_event_crypto_adapter_caps_get; - rte_event_crypto_adapter_create; - rte_event_crypto_adapter_create_ext; - rte_event_crypto_adapter_event_port_get; - rte_event_crypto_adapter_free; - rte_event_crypto_adapter_queue_pair_add; - rte_event_crypto_adapter_queue_pair_del; - rte_event_crypto_adapter_service_id_get; - rte_event_crypto_adapter_start; - rte_event_crypto_adapter_stats_get; - rte_event_crypto_adapter_stats_reset; - rte_event_crypto_adapter_stop; }; diff --git a/lib/librte_flow_classify/rte_flow_classify.c b/lib/librte_flow_classify/rte_flow_classify.c index 591d98e2..4c3469da 100644 --- a/lib/librte_flow_classify/rte_flow_classify.c +++ b/lib/librte_flow_classify/rte_flow_classify.c @@ -673,10 +673,7 @@ rte_flow_classifier_query(struct rte_flow_classifier *cls, return ret; } -RTE_INIT(librte_flow_classify_init_log); - -static void -librte_flow_classify_init_log(void) +RTE_INIT(librte_flow_classify_init_log) { librte_flow_classify_logtype = rte_log_register("lib.flow_classify"); diff --git a/lib/librte_gso/Makefile b/lib/librte_gso/Makefile index 3648ec09..1fac53a8 100644 --- a/lib/librte_gso/Makefile +++ b/lib/librte_gso/Makefile @@ -19,6 +19,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_GSO) += rte_gso.c SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_common.c SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_tcp4.c SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_tunnel_tcp4.c +SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_udp4.c # install this header file SYMLINK-$(CONFIG_RTE_LIBRTE_GSO)-include += rte_gso.h diff --git a/lib/librte_gso/gso_common.h b/lib/librte_gso/gso_common.h index 5ca59745..6cd764ff 100644 --- a/lib/librte_gso/gso_common.h +++ b/lib/librte_gso/gso_common.h @@ -31,6 +31,9 @@ (PKT_TX_TCP_SEG | PKT_TX_IPV4 | PKT_TX_OUTER_IPV4 | \ PKT_TX_TUNNEL_GRE)) +#define IS_IPV4_UDP(flag) (((flag) & (PKT_TX_UDP_SEG | PKT_TX_IPV4)) == \ + (PKT_TX_UDP_SEG | PKT_TX_IPV4)) + /** * Internal function which updates the UDP header of a packet, following * 
segmentation. This is required to update the header's datagram length field. diff --git a/lib/librte_gso/gso_udp4.c b/lib/librte_gso/gso_udp4.c new file mode 100644 index 00000000..927dee12 --- /dev/null +++ b/lib/librte_gso/gso_udp4.c @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include "gso_common.h" +#include "gso_udp4.h" + +#define IPV4_HDR_MF_BIT (1U << 13) + +static inline void +update_ipv4_udp_headers(struct rte_mbuf *pkt, struct rte_mbuf **segs, + uint16_t nb_segs) +{ + struct ipv4_hdr *ipv4_hdr; + uint16_t frag_offset = 0, is_mf; + uint16_t l2_hdrlen = pkt->l2_len, l3_hdrlen = pkt->l3_len; + uint16_t tail_idx = nb_segs - 1, length, i; + + /* + * Update IP header fields for output segments. Specifically, + * keep the same IP id, update fragment offset and total + * length. + */ + for (i = 0; i < nb_segs; i++) { + ipv4_hdr = rte_pktmbuf_mtod_offset(segs[i], struct ipv4_hdr *, + l2_hdrlen); + length = segs[i]->pkt_len - l2_hdrlen; + ipv4_hdr->total_length = rte_cpu_to_be_16(length); + + is_mf = i < tail_idx ? IPV4_HDR_MF_BIT : 0; + ipv4_hdr->fragment_offset = + rte_cpu_to_be_16(frag_offset | is_mf); + frag_offset += ((length - l3_hdrlen) >> 3); + } +} + +int +gso_udp4_segment(struct rte_mbuf *pkt, + uint16_t gso_size, + struct rte_mempool *direct_pool, + struct rte_mempool *indirect_pool, + struct rte_mbuf **pkts_out, + uint16_t nb_pkts_out) +{ + struct ipv4_hdr *ipv4_hdr; + uint16_t pyld_unit_size, hdr_offset; + uint16_t frag_off; + int ret; + + /* Don't process the fragmented packet */ + ipv4_hdr = rte_pktmbuf_mtod_offset(pkt, struct ipv4_hdr *, + pkt->l2_len); + frag_off = rte_be_to_cpu_16(ipv4_hdr->fragment_offset); + if (unlikely(IS_FRAGMENTED(frag_off))) { + pkts_out[0] = pkt; + return 1; + } + + /* + * UDP fragmentation is the same as IP fragmentation. + * Except the first one, other output packets just have l2 + * and l3 headers. + */ + hdr_offset = pkt->l2_len + pkt->l3_len; + + /* Don't process the packet without data. */ + if (unlikely(hdr_offset + pkt->l4_len >= pkt->pkt_len)) { + pkts_out[0] = pkt; + return 1; + } + + pyld_unit_size = gso_size - hdr_offset; + + /* Segment the payload */ + ret = gso_do_segment(pkt, hdr_offset, pyld_unit_size, direct_pool, + indirect_pool, pkts_out, nb_pkts_out); + if (ret > 1) + update_ipv4_udp_headers(pkt, pkts_out, ret); + + return ret; +} diff --git a/lib/librte_gso/gso_udp4.h b/lib/librte_gso/gso_udp4.h new file mode 100644 index 00000000..b2a2908e --- /dev/null +++ b/lib/librte_gso/gso_udp4.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#ifndef _GSO_UDP4_H_ +#define _GSO_UDP4_H_ + +#include <stdint.h> +#include <rte_mbuf.h> + +/** + * Segment an UDP/IPv4 packet. This function doesn't check if the input + * packet has correct checksums, and doesn't update checksums for output + * GSO segments. Furthermore, it doesn't process IP fragment packets. + * + * @param pkt + * The packet mbuf to segment. + * @param gso_size + * The max length of a GSO segment, measured in bytes. + * @param direct_pool + * MBUF pool used for allocating direct buffers for output segments. + * @param indirect_pool + * MBUF pool used for allocating indirect buffers for output segments. + * @param pkts_out + * Pointer array used to store the MBUF addresses of output GSO + * segments, when the function succeeds. If the memory space in + * pkts_out is insufficient, it fails and returns -EINVAL. 
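The fragment bookkeeping in update_ipv4_udp_headers() counts offsets in 8-byte units, as IPv4 fragmentation requires. A worked example under assumed sizes:

/* Assumed: l2_len = 14 (Ethernet), l3_len = 20 (IPv4, no options), and
 * 1400 payload bytes per output segment. Every segment then reports
 * total_length = 20 + 1400 = 1420, and the running fragment offset
 * advances by (1420 - 20) >> 3 = 175 eight-byte units per segment,
 * giving offsets 0, 175, 350, ... with the MF bit (1 << 13) set on all
 * segments except the last.
 */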
+ * @param nb_pkts_out + * The max number of items that 'pkts_out' can keep. + * + * @return + * - The number of GSO segments filled in pkts_out on success. + * - Return -ENOMEM if run out of memory in MBUF pools. + * - Return -EINVAL for invalid parameters. + */ +int gso_udp4_segment(struct rte_mbuf *pkt, + uint16_t gso_size, + struct rte_mempool *direct_pool, + struct rte_mempool *indirect_pool, + struct rte_mbuf **pkts_out, + uint16_t nb_pkts_out); +#endif diff --git a/lib/librte_gso/meson.build b/lib/librte_gso/meson.build index 056534fb..ad8dd858 100644 --- a/lib/librte_gso/meson.build +++ b/lib/librte_gso/meson.build @@ -1,7 +1,7 @@ # SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2017 Intel Corporation -sources = files('gso_common.c', 'gso_tcp4.c', +sources = files('gso_common.c', 'gso_tcp4.c', 'gso_udp4.c', 'gso_tunnel_tcp4.c', 'rte_gso.c') headers = files('rte_gso.h') deps += ['ethdev'] diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c index a44e3d43..751b5b62 100644 --- a/lib/librte_gso/rte_gso.c +++ b/lib/librte_gso/rte_gso.c @@ -11,6 +11,17 @@ #include "gso_common.h" #include "gso_tcp4.h" #include "gso_tunnel_tcp4.h" +#include "gso_udp4.h" + +#define ILLEGAL_UDP_GSO_CTX(ctx) \ + ((((ctx)->gso_types & DEV_TX_OFFLOAD_UDP_TSO) == 0) || \ + (ctx)->gso_size < RTE_GSO_UDP_SEG_SIZE_MIN) + +#define ILLEGAL_TCP_GSO_CTX(ctx) \ + ((((ctx)->gso_types & (DEV_TX_OFFLOAD_TCP_TSO | \ + DEV_TX_OFFLOAD_VXLAN_TNL_TSO | \ + DEV_TX_OFFLOAD_GRE_TNL_TSO)) == 0) || \ + (ctx)->gso_size < RTE_GSO_SEG_SIZE_MIN) int rte_gso_segment(struct rte_mbuf *pkt, @@ -27,14 +38,12 @@ rte_gso_segment(struct rte_mbuf *pkt, if (pkt == NULL || pkts_out == NULL || gso_ctx == NULL || nb_pkts_out < 1 || - gso_ctx->gso_size < RTE_GSO_SEG_SIZE_MIN || - ((gso_ctx->gso_types & (DEV_TX_OFFLOAD_TCP_TSO | - DEV_TX_OFFLOAD_VXLAN_TNL_TSO | - DEV_TX_OFFLOAD_GRE_TNL_TSO)) == 0)) + (ILLEGAL_UDP_GSO_CTX(gso_ctx) && + ILLEGAL_TCP_GSO_CTX(gso_ctx))) return -EINVAL; if (gso_ctx->gso_size >= pkt->pkt_len) { - pkt->ol_flags &= (~PKT_TX_TCP_SEG); + pkt->ol_flags &= (~(PKT_TX_TCP_SEG | PKT_TX_UDP_SEG)); pkts_out[0] = pkt; return 1; } @@ -59,6 +68,11 @@ rte_gso_segment(struct rte_mbuf *pkt, ret = gso_tcp4_segment(pkt, gso_size, ipid_delta, direct_pool, indirect_pool, pkts_out, nb_pkts_out); + } else if (IS_IPV4_UDP(pkt->ol_flags) && + (gso_ctx->gso_types & DEV_TX_OFFLOAD_UDP_TSO)) { + pkt->ol_flags &= (~PKT_TX_UDP_SEG); + ret = gso_udp4_segment(pkt, gso_size, direct_pool, + indirect_pool, pkts_out, nb_pkts_out); } else { /* unsupported packet, skip */ pkts_out[0] = pkt; diff --git a/lib/librte_gso/rte_gso.h b/lib/librte_gso/rte_gso.h index f4abd61c..a626a11e 100644 --- a/lib/librte_gso/rte_gso.h +++ b/lib/librte_gso/rte_gso.h @@ -17,10 +17,14 @@ extern "C" { #include <stdint.h> #include <rte_mbuf.h> -/* Minimum GSO segment size. */ +/* Minimum GSO segment size for TCP based packets. */ #define RTE_GSO_SEG_SIZE_MIN (sizeof(struct ether_hdr) + \ sizeof(struct ipv4_hdr) + sizeof(struct tcp_hdr) + 1) +/* Minimum GSO segment size for UDP based packets. */ +#define RTE_GSO_UDP_SEG_SIZE_MIN (sizeof(struct ether_hdr) + \ + sizeof(struct ipv4_hdr) + sizeof(struct udp_hdr) + 1) + /* GSO flags for rte_gso_ctx. */ #define RTE_GSO_FLAG_IPID_FIXED (1ULL << 0) /**< Use fixed IP ids for output GSO segments. 
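A caller-side sketch of the new UDP path; the pool handles and sizes are assumptions, while the DEV_TX_OFFLOAD_UDP_TSO type and PKT_TX_UDP_SEG flag (both bits of which the internal IS_IPV4_UDP() test requires) come from the patch itself:

#include <rte_gso.h>

struct rte_gso_ctx gso_ctx = {
	.direct_pool = direct_pool,	/* assumed mempool */
	.indirect_pool = indirect_pool,	/* assumed mempool */
	.flag = 0,
	.gso_types = DEV_TX_OFFLOAD_UDP_TSO,
	.gso_size = 1400,		/* assumed segment size */
};
struct rte_mbuf *segs[64];
int nb;

pkt->ol_flags |= PKT_TX_IPV4 | PKT_TX_UDP_SEG;
pkt->l2_len = sizeof(struct ether_hdr);
pkt->l3_len = sizeof(struct ipv4_hdr);
pkt->l4_len = sizeof(struct udp_hdr);
nb = rte_gso_segment(pkt, &gso_ctx, segs, RTE_DIM(segs));
if (nb < 0) {
	/* -EINVAL (bad context/params) or -ENOMEM (pools exhausted) */
}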
Setting diff --git a/lib/librte_hash/meson.build b/lib/librte_hash/meson.build index e139e1d7..efc06ede 100644 --- a/lib/librte_hash/meson.build +++ b/lib/librte_hash/meson.build @@ -6,7 +6,6 @@ headers = files('rte_cmp_arm64.h', 'rte_cmp_x86.h', 'rte_crc_arm64.h', 'rte_cuckoo_hash.h', - 'rte_cuckoo_hash_x86.h', 'rte_fbk_hash.h', 'rte_hash_crc.h', 'rte_hash.h', diff --git a/lib/librte_hash/rte_cuckoo_hash.c b/lib/librte_hash/rte_cuckoo_hash.c index a07543a2..f7b86c8c 100644 --- a/lib/librte_hash/rte_cuckoo_hash.c +++ b/lib/librte_hash/rte_cuckoo_hash.c @@ -31,9 +31,6 @@ #include "rte_hash.h" #include "rte_cuckoo_hash.h" -#if defined(RTE_ARCH_X86) -#include "rte_cuckoo_hash_x86.h" -#endif TAILQ_HEAD(rte_hash_list, rte_tailq_entry); @@ -93,8 +90,10 @@ rte_hash_create(const struct rte_hash_parameters *params) void *buckets = NULL; char ring_name[RTE_RING_NAMESIZE]; unsigned num_key_slots; - unsigned hw_trans_mem_support = 0; unsigned i; + unsigned int hw_trans_mem_support = 0, multi_writer_support = 0; + unsigned int readwrite_concur_support = 0; + rte_hash_function default_hash_func = (rte_hash_function)rte_jhash; hash_list = RTE_TAILQ_CAST(rte_hash_tailq.head, rte_hash_list); @@ -107,7 +106,6 @@ rte_hash_create(const struct rte_hash_parameters *params) /* Check for valid parameters */ if ((params->entries > RTE_HASH_ENTRIES_MAX) || (params->entries < RTE_HASH_BUCKET_ENTRIES) || - !rte_is_power_of_2(RTE_HASH_BUCKET_ENTRIES) || (params->key_len == 0)) { rte_errno = EINVAL; RTE_LOG(ERR, HASH, "rte_hash_create has invalid parameters\n"); @@ -118,21 +116,29 @@ rte_hash_create(const struct rte_hash_parameters *params) if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT) hw_trans_mem_support = 1; + if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD) + multi_writer_support = 1; + + if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY) { + readwrite_concur_support = 1; + multi_writer_support = 1; + } + /* Store all keys and leave the first entry as a dummy entry for lookup_bulk */ - if (hw_trans_mem_support) + if (multi_writer_support) /* * Increase number of slots by total number of indices * that can be stored in the lcore caches * except for the first cache */ num_key_slots = params->entries + (RTE_MAX_LCORE - 1) * - LCORE_CACHE_SIZE + 1; + (LCORE_CACHE_SIZE - 1) + 1; else num_key_slots = params->entries + 1; snprintf(ring_name, sizeof(ring_name), "HT_%s", params->name); /* Create ring (Dummy slot index is not enqueued) */ - r = rte_ring_create(ring_name, rte_align32pow2(num_key_slots - 1), + r = rte_ring_create(ring_name, rte_align32pow2(num_key_slots), params->socket_id, 0); if (r == NULL) { RTE_LOG(ERR, HASH, "memory allocation failed\n"); @@ -233,7 +239,7 @@ rte_hash_create(const struct rte_hash_parameters *params) h->cmp_jump_table_idx = KEY_OTHER_BYTES; #endif - if (hw_trans_mem_support) { + if (multi_writer_support) { h->local_free_slots = rte_zmalloc_socket(NULL, sizeof(struct lcore_cache) * RTE_MAX_LCORE, RTE_CACHE_LINE_SIZE, params->socket_id); @@ -261,6 +267,8 @@ rte_hash_create(const struct rte_hash_parameters *params) h->key_store = k; h->free_slots = r; h->hw_trans_mem_support = hw_trans_mem_support; + h->multi_writer_support = multi_writer_support; + h->readwrite_concur_support = readwrite_concur_support; #if defined(RTE_ARCH_X86) if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2)) @@ -271,24 +279,20 @@ rte_hash_create(const struct rte_hash_parameters *params) #endif h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR; - /* Turn on multi-writer only with explicit 
flat from user and TM + /* Turn on multi-writer only with explicit flag from user and TM * support. */ - if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD) { - if (h->hw_trans_mem_support) { - h->add_key = ADD_KEY_MULTIWRITER_TM; - } else { - h->add_key = ADD_KEY_MULTIWRITER; - h->multiwriter_lock = rte_malloc(NULL, - sizeof(rte_spinlock_t), - LCORE_CACHE_SIZE); - rte_spinlock_init(h->multiwriter_lock); - } - } else - h->add_key = ADD_KEY_SINGLEWRITER; + if (h->multi_writer_support) { + h->readwrite_lock = rte_malloc(NULL, sizeof(rte_rwlock_t), + RTE_CACHE_LINE_SIZE); + if (h->readwrite_lock == NULL) + goto err_unlock; + + rte_rwlock_init(h->readwrite_lock); + } /* Populate free slots ring. Entry zero is reserved for key misses. */ - for (i = 1; i < params->entries + 1; i++) + for (i = 1; i < num_key_slots; i++) rte_ring_sp_enqueue(r, (void *)((uintptr_t) i)); te->data = (void *) h; @@ -335,11 +339,10 @@ rte_hash_free(struct rte_hash *h) rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); - if (h->hw_trans_mem_support) + if (h->multi_writer_support) { rte_free(h->local_free_slots); - - if (h->add_key == ADD_KEY_MULTIWRITER) - rte_free(h->multiwriter_lock); + rte_free(h->readwrite_lock); + } rte_ring_free(h->free_slots); rte_free(h->key_store); rte_free(h->buckets); @@ -366,15 +369,78 @@ rte_hash_secondary_hash(const hash_sig_t primary_hash) return primary_hash ^ ((tag + 1) * alt_bits_xor); } +int32_t +rte_hash_count(const struct rte_hash *h) +{ + uint32_t tot_ring_cnt, cached_cnt = 0; + uint32_t i, ret; + + if (h == NULL) + return -EINVAL; + + if (h->multi_writer_support) { + tot_ring_cnt = h->entries + (RTE_MAX_LCORE - 1) * + (LCORE_CACHE_SIZE - 1); + for (i = 0; i < RTE_MAX_LCORE; i++) + cached_cnt += h->local_free_slots[i].len; + + ret = tot_ring_cnt - rte_ring_count(h->free_slots) - + cached_cnt; + } else { + tot_ring_cnt = h->entries; + ret = tot_ring_cnt - rte_ring_count(h->free_slots); + } + return ret; +} + +/* Read write locks implemented using rte_rwlock */ +static inline void +__hash_rw_writer_lock(const struct rte_hash *h) +{ + if (h->multi_writer_support && h->hw_trans_mem_support) + rte_rwlock_write_lock_tm(h->readwrite_lock); + else if (h->multi_writer_support) + rte_rwlock_write_lock(h->readwrite_lock); +} + + +static inline void +__hash_rw_reader_lock(const struct rte_hash *h) +{ + if (h->readwrite_concur_support && h->hw_trans_mem_support) + rte_rwlock_read_lock_tm(h->readwrite_lock); + else if (h->readwrite_concur_support) + rte_rwlock_read_lock(h->readwrite_lock); +} + +static inline void +__hash_rw_writer_unlock(const struct rte_hash *h) +{ + if (h->multi_writer_support && h->hw_trans_mem_support) + rte_rwlock_write_unlock_tm(h->readwrite_lock); + else if (h->multi_writer_support) + rte_rwlock_write_unlock(h->readwrite_lock); +} + +static inline void +__hash_rw_reader_unlock(const struct rte_hash *h) +{ + if (h->readwrite_concur_support && h->hw_trans_mem_support) + rte_rwlock_read_unlock_tm(h->readwrite_lock); + else if (h->readwrite_concur_support) + rte_rwlock_read_unlock(h->readwrite_lock); +} + void rte_hash_reset(struct rte_hash *h) { void *ptr; - unsigned i; + uint32_t tot_ring_cnt, i; if (h == NULL) return; + __hash_rw_writer_lock(h); memset(h->buckets, 0, h->num_buckets * sizeof(struct rte_hash_bucket)); memset(h->key_store, 0, h->key_entry_size * (h->entries + 1)); @@ -383,97 +449,260 @@ rte_hash_reset(struct rte_hash *h) rte_pause(); /* Repopulate the free slots ring. 
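The slot accounting used by rte_hash_create() and mirrored in rte_hash_count() above becomes concrete with assumed build constants (RTE_MAX_LCORE = 128, LCORE_CACHE_SIZE = 64; both are configuration dependent):

/* entries = 1024 with multi_writer_support:
 *	num_key_slots = 1024 + (128 - 1) * (64 - 1) + 1 = 9026
 * Each lcore cache may pin up to LCORE_CACHE_SIZE - 1 free slots
 * between flushes, and index 0 stays reserved as the dummy slot for
 * lookup misses (the trailing + 1). The free-slot ring is then sized
 * to rte_align32pow2(9026) = 16384 entries, and rte_hash_count()
 * recovers the in-use count by subtracting the ring and cache
 * occupancy from the same total.
 */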
Entry zero is reserved for key misses */ - for (i = 1; i < h->entries + 1; i++) + if (h->multi_writer_support) + tot_ring_cnt = h->entries + (RTE_MAX_LCORE - 1) * + (LCORE_CACHE_SIZE - 1); + else + tot_ring_cnt = h->entries; + + for (i = 1; i < tot_ring_cnt + 1; i++) rte_ring_sp_enqueue(h->free_slots, (void *)((uintptr_t) i)); - if (h->hw_trans_mem_support) { + if (h->multi_writer_support) { /* Reset local caches per lcore */ for (i = 0; i < RTE_MAX_LCORE; i++) h->local_free_slots[i].len = 0; } + __hash_rw_writer_unlock(h); } -/* Search for an entry that can be pushed to its alternative location */ -static inline int -make_space_bucket(const struct rte_hash *h, struct rte_hash_bucket *bkt, - unsigned int *nr_pushes) +/* + * Function called to enqueue back an index in the cache/ring, + * as the slot has not been used and it can be used in the + * next addition attempt. + */ +static inline void +enqueue_slot_back(const struct rte_hash *h, + struct lcore_cache *cached_free_slots, + void *slot_id) { - unsigned i, j; - int ret; - uint32_t next_bucket_idx; - struct rte_hash_bucket *next_bkt[RTE_HASH_BUCKET_ENTRIES]; + if (h->multi_writer_support) { + cached_free_slots->objs[cached_free_slots->len] = slot_id; + cached_free_slots->len++; + } else + rte_ring_sp_enqueue(h->free_slots, slot_id); +} + +/* Search for a key in the bucket and update its data */ +static inline int32_t +search_and_update(const struct rte_hash *h, void *data, const void *key, + struct rte_hash_bucket *bkt, hash_sig_t sig, hash_sig_t alt_hash) +{ + int i; + struct rte_hash_key *k, *keys = h->key_store; - /* - * Push existing item (search for bucket with space in - * alternative locations) to its alternative location - */ for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { - /* Search for space in alternative locations */ - next_bucket_idx = bkt->sig_alt[i] & h->bucket_bitmask; - next_bkt[i] = &h->buckets[next_bucket_idx]; - for (j = 0; j < RTE_HASH_BUCKET_ENTRIES; j++) { - if (next_bkt[i]->key_idx[j] == EMPTY_SLOT) - break; + if (bkt->sig_current[i] == sig && + bkt->sig_alt[i] == alt_hash) { + k = (struct rte_hash_key *) ((char *)keys + + bkt->key_idx[i] * h->key_entry_size); + if (rte_hash_cmp_eq(key, k->key, h) == 0) { + /* Update data */ + k->pdata = data; + /* + * Return index where key is stored, + * subtracting the first dummy index + */ + return bkt->key_idx[i] - 1; + } } - - if (j != RTE_HASH_BUCKET_ENTRIES) - break; } + return -1; +} - /* Alternative location has spare room (end of recursive function) */ - if (i != RTE_HASH_BUCKET_ENTRIES) { - next_bkt[i]->sig_alt[j] = bkt->sig_current[i]; - next_bkt[i]->sig_current[j] = bkt->sig_alt[i]; - next_bkt[i]->key_idx[j] = bkt->key_idx[i]; - return i; +/* Only tries to insert at one bucket (@prim_bkt) without trying to push + * buckets around. + * return 1 if it matches an existing key, return 0 on success, return -1 + * if there is no empty entry. + */ +static inline int32_t +rte_hash_cuckoo_insert_mw(const struct rte_hash *h, + struct rte_hash_bucket *prim_bkt, + struct rte_hash_bucket *sec_bkt, + const struct rte_hash_key *key, void *data, + hash_sig_t sig, hash_sig_t alt_hash, uint32_t new_idx, + int32_t *ret_val) +{ + unsigned int i; + struct rte_hash_bucket *cur_bkt = prim_bkt; + int32_t ret; + + __hash_rw_writer_lock(h); + /* Check if key was inserted after last check but before this + * protected region in case of inserting duplicated keys.
+ */ + ret = search_and_update(h, data, key, cur_bkt, sig, alt_hash); + if (ret != -1) { + __hash_rw_writer_unlock(h); + *ret_val = ret; + return 1; + } + ret = search_and_update(h, data, key, sec_bkt, alt_hash, sig); + if (ret != -1) { + __hash_rw_writer_unlock(h); + *ret_val = ret; + return 1; } - /* Pick entry that has not been pushed yet */ - for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) - if (bkt->flag[i] == 0) + /* Insert new entry if there is room in the primary + * bucket. + */ + for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { + /* Check if slot is available */ + if (likely(prim_bkt->key_idx[i] == EMPTY_SLOT)) { + prim_bkt->sig_current[i] = sig; + prim_bkt->sig_alt[i] = alt_hash; + prim_bkt->key_idx[i] = new_idx; break; + } + } + __hash_rw_writer_unlock(h); - /* All entries have been pushed, so entry cannot be added */ - if (i == RTE_HASH_BUCKET_ENTRIES || ++(*nr_pushes) > RTE_HASH_MAX_PUSHES) - return -ENOSPC; + if (i != RTE_HASH_BUCKET_ENTRIES) + return 0; - /* Set flag to indicate that this entry is going to be pushed */ - bkt->flag[i] = 1; + /* no empty entry */ + return -1; +} - /* Need room in alternative bucket to insert the pushed entry */ - ret = make_space_bucket(h, next_bkt[i], nr_pushes); - /* - * After recursive function. - * Clear flags and insert the pushed entry - * in its alternative location if successful, - * or return error +/* Shift buckets along provided cuckoo_path (@leaf and @leaf_slot) and fill + * the path head with new entry (sig, alt_hash, new_idx) + * return 1 if a matching key was found, return -1 if the cuckoo path was + * invalidated, return 0 on success. + */ +static inline int +rte_hash_cuckoo_move_insert_mw(const struct rte_hash *h, + struct rte_hash_bucket *bkt, + struct rte_hash_bucket *alt_bkt, + const struct rte_hash_key *key, void *data, + struct queue_node *leaf, uint32_t leaf_slot, + hash_sig_t sig, hash_sig_t alt_hash, uint32_t new_idx, + int32_t *ret_val) +{ + uint32_t prev_alt_bkt_idx; + struct rte_hash_bucket *cur_bkt = bkt; + struct queue_node *prev_node, *curr_node = leaf; + struct rte_hash_bucket *prev_bkt, *curr_bkt = leaf->bkt; + uint32_t prev_slot, curr_slot = leaf_slot; + int32_t ret; + + __hash_rw_writer_lock(h); + + /* In case the empty slot was taken before entering the protected region */ + if (curr_bkt->key_idx[curr_slot] != EMPTY_SLOT) { + __hash_rw_writer_unlock(h); + return -1; + } + + /* Check if key was inserted after last check but before this + * protected region.
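The repeated search_and_update() calls inside the writer lock guard the classic check-then-act race. A possible interleaving (threads T1/T2 are assumed for illustration):

/* T1: searches outside the lock -> key absent, allocates a slot
 * T2: takes the writer lock, inserts the same key, unlocks
 * T1: takes the writer lock, re-runs search_and_update(), now finds
 *     the key, returns the existing position (and puts its slot back)
 *     instead of inserting a duplicate.
 */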
*/ - bkt->flag[i] = 0; - if (ret >= 0) { - next_bkt[i]->sig_alt[ret] = bkt->sig_current[i]; - next_bkt[i]->sig_current[ret] = bkt->sig_alt[i]; - next_bkt[i]->key_idx[ret] = bkt->key_idx[i]; - return i; - } else - return ret; + ret = search_and_update(h, data, key, cur_bkt, sig, alt_hash); + if (ret != -1) { + __hash_rw_writer_unlock(h); + *ret_val = ret; + return 1; + } + + ret = search_and_update(h, data, key, alt_bkt, alt_hash, sig); + if (ret != -1) { + __hash_rw_writer_unlock(h); + *ret_val = ret; + return 1; + } + + while (likely(curr_node->prev != NULL)) { + prev_node = curr_node->prev; + prev_bkt = prev_node->bkt; + prev_slot = curr_node->prev_slot; + + prev_alt_bkt_idx = + prev_bkt->sig_alt[prev_slot] & h->bucket_bitmask; + + if (unlikely(&h->buckets[prev_alt_bkt_idx] + != curr_bkt)) { + /* revert it to empty, otherwise duplicated keys */ + curr_bkt->key_idx[curr_slot] = EMPTY_SLOT; + __hash_rw_writer_unlock(h); + return -1; + } + + /* Need to swap current/alt sig to allow later + * Cuckoo insert to move elements back to its + * primary bucket if available + */ + curr_bkt->sig_alt[curr_slot] = + prev_bkt->sig_current[prev_slot]; + curr_bkt->sig_current[curr_slot] = + prev_bkt->sig_alt[prev_slot]; + curr_bkt->key_idx[curr_slot] = + prev_bkt->key_idx[prev_slot]; + + curr_slot = prev_slot; + curr_node = prev_node; + curr_bkt = curr_node->bkt; + } + + curr_bkt->sig_current[curr_slot] = sig; + curr_bkt->sig_alt[curr_slot] = alt_hash; + curr_bkt->key_idx[curr_slot] = new_idx; + + __hash_rw_writer_unlock(h); + + return 0; } /* - * Function called to enqueue back an index in the cache/ring, - * as slot has not being used and it can be used in the - * next addition attempt. + * Make space for new key, using bfs Cuckoo Search and Multi-Writer safe + * Cuckoo */ -static inline void -enqueue_slot_back(const struct rte_hash *h, - struct lcore_cache *cached_free_slots, - void *slot_id) +static inline int +rte_hash_cuckoo_make_space_mw(const struct rte_hash *h, + struct rte_hash_bucket *bkt, + struct rte_hash_bucket *sec_bkt, + const struct rte_hash_key *key, void *data, + hash_sig_t sig, hash_sig_t alt_hash, + uint32_t new_idx, int32_t *ret_val) { - if (h->hw_trans_mem_support) { - cached_free_slots->objs[cached_free_slots->len] = slot_id; - cached_free_slots->len++; - } else - rte_ring_sp_enqueue(h->free_slots, slot_id); + unsigned int i; + struct queue_node queue[RTE_HASH_BFS_QUEUE_MAX_LEN]; + struct queue_node *tail, *head; + struct rte_hash_bucket *curr_bkt, *alt_bkt; + + tail = queue; + head = queue + 1; + tail->bkt = bkt; + tail->prev = NULL; + tail->prev_slot = -1; + + /* Cuckoo bfs Search */ + while (likely(tail != head && head < + queue + RTE_HASH_BFS_QUEUE_MAX_LEN - + RTE_HASH_BUCKET_ENTRIES)) { + curr_bkt = tail->bkt; + for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { + if (curr_bkt->key_idx[i] == EMPTY_SLOT) { + int32_t ret = rte_hash_cuckoo_move_insert_mw(h, + bkt, sec_bkt, key, data, + tail, i, sig, alt_hash, + new_idx, ret_val); + if (likely(ret != -1)) + return ret; + } + + /* Enqueue new node and keep prev node info */ + alt_bkt = &(h->buckets[curr_bkt->sig_alt[i] + & h->bucket_bitmask]); + head->bkt = alt_bkt; + head->prev = tail; + head->prev_slot = i; + head++; + } + tail++; + } + + return -ENOSPC; } static inline int32_t @@ -482,19 +711,15 @@ __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key, { hash_sig_t alt_hash; uint32_t prim_bucket_idx, sec_bucket_idx; - unsigned i; struct rte_hash_bucket *prim_bkt, *sec_bkt; - struct rte_hash_key *new_k, *k, *keys = 
h->key_store; + struct rte_hash_key *new_k, *keys = h->key_store; void *slot_id = NULL; uint32_t new_idx; int ret; unsigned n_slots; unsigned lcore_id; struct lcore_cache *cached_free_slots = NULL; - unsigned int nr_pushes = 0; - - if (h->add_key == ADD_KEY_MULTIWRITER) - rte_spinlock_lock(h->multiwriter_lock); + int32_t ret_val; prim_bucket_idx = sig & h->bucket_bitmask; prim_bkt = &h->buckets[prim_bucket_idx]; @@ -505,8 +730,24 @@ __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key, sec_bkt = &h->buckets[sec_bucket_idx]; rte_prefetch0(sec_bkt); - /* Get a new slot for storing the new key */ - if (h->hw_trans_mem_support) { + /* Check if key is already inserted in primary location */ + __hash_rw_writer_lock(h); + ret = search_and_update(h, data, key, prim_bkt, sig, alt_hash); + if (ret != -1) { + __hash_rw_writer_unlock(h); + return ret; + } + + /* Check if key is already inserted in secondary location */ + ret = search_and_update(h, data, key, sec_bkt, alt_hash, sig); + if (ret != -1) { + __hash_rw_writer_unlock(h); + return ret; + } + __hash_rw_writer_unlock(h); + + /* Did not find a match, so get a new slot for storing the new key */ + if (h->multi_writer_support) { lcore_id = rte_lcore_id(); cached_free_slots = &h->local_free_slots[lcore_id]; /* Try to get a free slot from the local cache */ @@ -516,8 +757,7 @@ __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key, cached_free_slots->objs, LCORE_CACHE_SIZE, NULL); if (n_slots == 0) { - ret = -ENOSPC; - goto failure; + return -ENOSPC; } cached_free_slots->len += n_slots; @@ -528,124 +768,50 @@ __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key, slot_id = cached_free_slots->objs[cached_free_slots->len]; } else { if (rte_ring_sc_dequeue(h->free_slots, &slot_id) != 0) { - ret = -ENOSPC; - goto failure; + return -ENOSPC; } } new_k = RTE_PTR_ADD(keys, (uintptr_t)slot_id * h->key_entry_size); - rte_prefetch0(new_k); new_idx = (uint32_t)((uintptr_t) slot_id); - - /* Check if key is already inserted in primary location */ - for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { - if (prim_bkt->sig_current[i] == sig && - prim_bkt->sig_alt[i] == alt_hash) { - k = (struct rte_hash_key *) ((char *)keys + - prim_bkt->key_idx[i] * h->key_entry_size); - if (rte_hash_cmp_eq(key, k->key, h) == 0) { - /* Enqueue index of free slot back in the ring. */ - enqueue_slot_back(h, cached_free_slots, slot_id); - /* Update data */ - k->pdata = data; - /* - * Return index where key is stored, - * subtracting the first dummy index - */ - ret = prim_bkt->key_idx[i] - 1; - goto failure; - } - } - } - - /* Check if key is already inserted in secondary location */ - for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { - if (sec_bkt->sig_alt[i] == sig && - sec_bkt->sig_current[i] == alt_hash) { - k = (struct rte_hash_key *) ((char *)keys + - sec_bkt->key_idx[i] * h->key_entry_size); - if (rte_hash_cmp_eq(key, k->key, h) == 0) { - /* Enqueue index of free slot back in the ring. 
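The add path above ultimately returns new_idx - 1, a dense position that stays stable for the key's lifetime, so applications commonly use it to index a side array of per-key state. A sketch with invented names (flow_stats, flow_key):

/* flow_stats[] is an application array sized to the table capacity. */
int32_t pos = rte_hash_add_key(h, &flow_key);
if (pos >= 0)
	flow_stats[pos].packets++;	/* same pos on later lookups */
else if (pos == -ENOSPC)
	;	/* no empty slot and no cuckoo path could free one */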
*/ - enqueue_slot_back(h, cached_free_slots, slot_id); - /* Update data */ - k->pdata = data; - /* - * Return index where key is stored, - * subtracting the first dummy index - */ - ret = sec_bkt->key_idx[i] - 1; - goto failure; - } - } - } - /* Copy key */ rte_memcpy(new_k->key, key, h->key_len); new_k->pdata = data; -#if defined(RTE_ARCH_X86) /* currently only x86 support HTM */ - if (h->add_key == ADD_KEY_MULTIWRITER_TM) { - ret = rte_hash_cuckoo_insert_mw_tm(prim_bkt, - sig, alt_hash, new_idx); - if (ret >= 0) - return new_idx - 1; - /* Primary bucket full, need to make space for new entry */ - ret = rte_hash_cuckoo_make_space_mw_tm(h, prim_bkt, sig, - alt_hash, new_idx); + /* Find an empty slot and insert */ + ret = rte_hash_cuckoo_insert_mw(h, prim_bkt, sec_bkt, key, data, + sig, alt_hash, new_idx, &ret_val); + if (ret == 0) + return new_idx - 1; + else if (ret == 1) { + enqueue_slot_back(h, cached_free_slots, slot_id); + return ret_val; + } - if (ret >= 0) - return new_idx - 1; + /* Primary bucket full, need to make space for new entry */ + ret = rte_hash_cuckoo_make_space_mw(h, prim_bkt, sec_bkt, key, data, + sig, alt_hash, new_idx, &ret_val); + if (ret == 0) + return new_idx - 1; + else if (ret == 1) { + enqueue_slot_back(h, cached_free_slots, slot_id); + return ret_val; + } - /* Also search secondary bucket to get better occupancy */ - ret = rte_hash_cuckoo_make_space_mw_tm(h, sec_bkt, sig, - alt_hash, new_idx); + /* Also search secondary bucket to get better occupancy */ + ret = rte_hash_cuckoo_make_space_mw(h, sec_bkt, prim_bkt, key, data, + alt_hash, sig, new_idx, &ret_val); - if (ret >= 0) - return new_idx - 1; + if (ret == 0) + return new_idx - 1; + else if (ret == 1) { + enqueue_slot_back(h, cached_free_slots, slot_id); + return ret_val; } else { -#endif - for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { - /* Check if slot is available */ - if (likely(prim_bkt->key_idx[i] == EMPTY_SLOT)) { - prim_bkt->sig_current[i] = sig; - prim_bkt->sig_alt[i] = alt_hash; - prim_bkt->key_idx[i] = new_idx; - break; - } - } - - if (i != RTE_HASH_BUCKET_ENTRIES) { - if (h->add_key == ADD_KEY_MULTIWRITER) - rte_spinlock_unlock(h->multiwriter_lock); - return new_idx - 1; - } - - /* Primary bucket full, need to make space for new entry - * After recursive function. 
- * Insert the new entry in the position of the pushed entry - * if successful or return error and - * store the new slot back in the ring - */ - ret = make_space_bucket(h, prim_bkt, &nr_pushes); - if (ret >= 0) { - prim_bkt->sig_current[ret] = sig; - prim_bkt->sig_alt[ret] = alt_hash; - prim_bkt->key_idx[ret] = new_idx; - if (h->add_key == ADD_KEY_MULTIWRITER) - rte_spinlock_unlock(h->multiwriter_lock); - return new_idx - 1; - } -#if defined(RTE_ARCH_X86) + enqueue_slot_back(h, cached_free_slots, slot_id); + return ret; } -#endif - /* Error in addition, store new slot back in the ring and return error */ - enqueue_slot_back(h, cached_free_slots, (void *)((uintptr_t) new_idx)); - -failure: - if (h->add_key == ADD_KEY_MULTIWRITER) - rte_spinlock_unlock(h->multiwriter_lock); - return ret; } int32_t @@ -690,20 +856,15 @@ rte_hash_add_key_data(const struct rte_hash *h, const void *key, void *data) else return ret; } + +/* Search one bucket to find the match key */ static inline int32_t -__rte_hash_lookup_with_hash(const struct rte_hash *h, const void *key, - hash_sig_t sig, void **data) +search_one_bucket(const struct rte_hash *h, const void *key, hash_sig_t sig, + void **data, const struct rte_hash_bucket *bkt) { - uint32_t bucket_idx; - hash_sig_t alt_hash; - unsigned i; - struct rte_hash_bucket *bkt; + int i; struct rte_hash_key *k, *keys = h->key_store; - bucket_idx = sig & h->bucket_bitmask; - bkt = &h->buckets[bucket_idx]; - - /* Check if key is in primary location */ for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { if (bkt->sig_current[i] == sig && bkt->key_idx[i] != EMPTY_SLOT) { @@ -720,30 +881,41 @@ __rte_hash_lookup_with_hash(const struct rte_hash *h, const void *key, } } } + return -1; +} + +static inline int32_t +__rte_hash_lookup_with_hash(const struct rte_hash *h, const void *key, + hash_sig_t sig, void **data) +{ + uint32_t bucket_idx; + hash_sig_t alt_hash; + struct rte_hash_bucket *bkt; + int ret; + + bucket_idx = sig & h->bucket_bitmask; + bkt = &h->buckets[bucket_idx]; + __hash_rw_reader_lock(h); + + /* Check if key is in primary location */ + ret = search_one_bucket(h, key, sig, data, bkt); + if (ret != -1) { + __hash_rw_reader_unlock(h); + return ret; + } /* Calculate secondary hash */ alt_hash = rte_hash_secondary_hash(sig); bucket_idx = alt_hash & h->bucket_bitmask; bkt = &h->buckets[bucket_idx]; /* Check if key is in secondary location */ - for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { - if (bkt->sig_current[i] == alt_hash && - bkt->sig_alt[i] == sig) { - k = (struct rte_hash_key *) ((char *)keys + - bkt->key_idx[i] * h->key_entry_size); - if (rte_hash_cmp_eq(key, k->key, h) == 0) { - if (data != NULL) - *data = k->pdata; - /* - * Return index where key is stored, - * subtracting the first dummy index - */ - return bkt->key_idx[i] - 1; - } - } + ret = search_one_bucket(h, key, alt_hash, data, bkt); + if (ret != -1) { + __hash_rw_reader_unlock(h); + return ret; } - + __hash_rw_reader_unlock(h); return -ENOENT; } @@ -785,7 +957,7 @@ remove_entry(const struct rte_hash *h, struct rte_hash_bucket *bkt, unsigned i) bkt->sig_current[i] = NULL_SIGNATURE; bkt->sig_alt[i] = NULL_SIGNATURE; - if (h->hw_trans_mem_support) { + if (h->multi_writer_support) { lcore_id = rte_lcore_id(); cached_free_slots = &h->local_free_slots[lcore_id]; /* Cache full, need to free it. 
*/ @@ -806,20 +978,15 @@ remove_entry(const struct rte_hash *h, struct rte_hash_bucket *bkt, unsigned i) } } +/* Search one bucket and remove the matched key */ static inline int32_t -__rte_hash_del_key_with_hash(const struct rte_hash *h, const void *key, - hash_sig_t sig) +search_and_remove(const struct rte_hash *h, const void *key, + struct rte_hash_bucket *bkt, hash_sig_t sig) { - uint32_t bucket_idx; - hash_sig_t alt_hash; - unsigned i; - struct rte_hash_bucket *bkt; struct rte_hash_key *k, *keys = h->key_store; + unsigned int i; int32_t ret; - bucket_idx = sig & h->bucket_bitmask; - bkt = &h->buckets[bucket_idx]; - /* Check if key is in primary location */ for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { if (bkt->sig_current[i] == sig && @@ -839,32 +1006,42 @@ __rte_hash_del_key_with_hash(const struct rte_hash *h, const void *key, } } } + return -1; +} + +static inline int32_t +__rte_hash_del_key_with_hash(const struct rte_hash *h, const void *key, + hash_sig_t sig) +{ + uint32_t bucket_idx; + hash_sig_t alt_hash; + struct rte_hash_bucket *bkt; + int32_t ret; + + bucket_idx = sig & h->bucket_bitmask; + bkt = &h->buckets[bucket_idx]; + + __hash_rw_writer_lock(h); + /* look for key in primary bucket */ + ret = search_and_remove(h, key, bkt, sig); + if (ret != -1) { + __hash_rw_writer_unlock(h); + return ret; + } /* Calculate secondary hash */ alt_hash = rte_hash_secondary_hash(sig); bucket_idx = alt_hash & h->bucket_bitmask; bkt = &h->buckets[bucket_idx]; - /* Check if key is in secondary location */ - for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { - if (bkt->sig_current[i] == alt_hash && - bkt->key_idx[i] != EMPTY_SLOT) { - k = (struct rte_hash_key *) ((char *)keys + - bkt->key_idx[i] * h->key_entry_size); - if (rte_hash_cmp_eq(key, k->key, h) == 0) { - remove_entry(h, bkt, i); - - /* - * Return index where key is stored, - * subtracting the first dummy index - */ - ret = bkt->key_idx[i] - 1; - bkt->key_idx[i] = EMPTY_SLOT; - return ret; - } - } + /* look for key in secondary bucket */ + ret = search_and_remove(h, key, bkt, alt_hash); + if (ret != -1) { + __hash_rw_writer_unlock(h); + return ret; } + __hash_rw_writer_unlock(h); return -ENOENT; } @@ -1006,6 +1183,7 @@ __rte_hash_lookup_bulk(const struct rte_hash *h, const void **keys, rte_prefetch0(secondary_bkt[i]); } + __hash_rw_reader_lock(h); /* Compare signatures and prefetch key slot of first hit */ for (i = 0; i < num_keys; i++) { compare_signatures(&prim_hitmask[i], &sec_hitmask[i], @@ -1088,6 +1266,8 @@ next_key: continue; } + __hash_rw_reader_unlock(h); + if (hit_mask != NULL) *hit_mask = hits; } @@ -1146,7 +1326,7 @@ rte_hash_iterate(const struct rte_hash *h, const void **key, void **data, uint32 bucket_idx = *next / RTE_HASH_BUCKET_ENTRIES; idx = *next % RTE_HASH_BUCKET_ENTRIES; } - + __hash_rw_reader_lock(h); /* Get position of entry in key table */ position = h->buckets[bucket_idx].key_idx[idx]; next_key = (struct rte_hash_key *) ((char *)h->key_store + @@ -1155,6 +1335,8 @@ rte_hash_iterate(const struct rte_hash *h, const void **key, void **data, uint32 *key = next_key->key; *data = next_key->pdata; + __hash_rw_reader_unlock(h); + /* Increment iterator */ (*next)++; diff --git a/lib/librte_hash/rte_cuckoo_hash.h b/lib/librte_hash/rte_cuckoo_hash.h index 7a54e555..b43f467d 100644 --- a/lib/librte_hash/rte_cuckoo_hash.h +++ b/lib/librte_hash/rte_cuckoo_hash.h @@ -88,15 +88,14 @@ const rte_hash_cmp_eq_t cmp_jump_table[NUM_KEY_CMP_CASES] = { #endif -enum add_key_case { - ADD_KEY_SINGLEWRITER = 0, - ADD_KEY_MULTIWRITER, - 
ADD_KEY_MULTIWRITER_TM, -}; /** Number of items per bucket. */ #define RTE_HASH_BUCKET_ENTRIES 8 +#if !RTE_IS_POWER_OF_2(RTE_HASH_BUCKET_ENTRIES) +#error RTE_HASH_BUCKET_ENTRIES must be a power of 2 +#endif + #define NULL_SIGNATURE 0 #define EMPTY_SLOT 0 @@ -155,18 +154,20 @@ struct rte_hash { struct rte_ring *free_slots; /**< Ring that stores all indexes of the free slots in the key table */ - uint8_t hw_trans_mem_support; - /**< Hardware transactional memory support */ + struct lcore_cache *local_free_slots; /**< Local cache per lcore, storing some indexes of the free slots */ - enum add_key_case add_key; /**< Multi-writer hash add behavior */ - - rte_spinlock_t *multiwriter_lock; /**< Multi-writer spinlock for w/o TM */ /* Fields used in lookup */ uint32_t key_len __rte_cache_aligned; /**< Length of hash key. */ + uint8_t hw_trans_mem_support; + /**< If hardware transactional memory is used. */ + uint8_t multi_writer_support; + /**< If multi-writer support is enabled. */ + uint8_t readwrite_concur_support; + /**< If read-write concurrency support is enabled */ rte_hash_function hash_func; /**< Function used to calculate hash. */ uint32_t hash_func_init_val; /**< Init value used by hash_func. */ rte_hash_cmp_eq_t rte_hash_custom_cmp_eq; @@ -184,6 +185,7 @@ struct rte_hash { /**< Table with buckets storing all the hash values and key indexes * to the key table. */ + rte_rwlock_t *readwrite_lock; /**< Read-write lock thread-safety. */ } __rte_cache_aligned; struct queue_node { diff --git a/lib/librte_hash/rte_cuckoo_hash_x86.h b/lib/librte_hash/rte_cuckoo_hash_x86.h deleted file mode 100644 index 2c5b017e..00000000 --- a/lib/librte_hash/rte_cuckoo_hash_x86.h +++ /dev/null @@ -1,164 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2016 Intel Corporation - */ - -/* rte_cuckoo_hash_x86.h - * This file holds all x86 specific Cuckoo Hash functions - */ - -/* Only tries to insert at one bucket (@prim_bkt) without trying to push - * buckets around - */ -static inline unsigned -rte_hash_cuckoo_insert_mw_tm(struct rte_hash_bucket *prim_bkt, - hash_sig_t sig, hash_sig_t alt_hash, uint32_t new_idx) -{ - unsigned i, status; - unsigned try = 0; - - while (try < RTE_HASH_TSX_MAX_RETRY) { - status = rte_xbegin(); - if (likely(status == RTE_XBEGIN_STARTED)) { - /* Insert new entry if there is room in the primary - * bucket. - */ - for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { - /* Check if slot is available */ - if (likely(prim_bkt->key_idx[i] == EMPTY_SLOT)) { - prim_bkt->sig_current[i] = sig; - prim_bkt->sig_alt[i] = alt_hash; - prim_bkt->key_idx[i] = new_idx; - break; - } - } - rte_xend(); - - if (i != RTE_HASH_BUCKET_ENTRIES) - return 0; - - break; /* break off try loop if transaction commits */ - } else { - /* If we abort we give up this cuckoo path. 
*/ - try++; - rte_pause(); - } - } - - return -1; -} - -/* Shift buckets along provided cuckoo_path (@leaf and @leaf_slot) and fill - * the path head with new entry (sig, alt_hash, new_idx) - */ -static inline int -rte_hash_cuckoo_move_insert_mw_tm(const struct rte_hash *h, - struct queue_node *leaf, uint32_t leaf_slot, - hash_sig_t sig, hash_sig_t alt_hash, uint32_t new_idx) -{ - unsigned try = 0; - unsigned status; - uint32_t prev_alt_bkt_idx; - - struct queue_node *prev_node, *curr_node = leaf; - struct rte_hash_bucket *prev_bkt, *curr_bkt = leaf->bkt; - uint32_t prev_slot, curr_slot = leaf_slot; - - while (try < RTE_HASH_TSX_MAX_RETRY) { - status = rte_xbegin(); - if (likely(status == RTE_XBEGIN_STARTED)) { - while (likely(curr_node->prev != NULL)) { - prev_node = curr_node->prev; - prev_bkt = prev_node->bkt; - prev_slot = curr_node->prev_slot; - - prev_alt_bkt_idx - = prev_bkt->sig_alt[prev_slot] - & h->bucket_bitmask; - - if (unlikely(&h->buckets[prev_alt_bkt_idx] - != curr_bkt)) { - rte_xabort(RTE_XABORT_CUCKOO_PATH_INVALIDED); - } - - /* Need to swap current/alt sig to allow later - * Cuckoo insert to move elements back to its - * primary bucket if available - */ - curr_bkt->sig_alt[curr_slot] = - prev_bkt->sig_current[prev_slot]; - curr_bkt->sig_current[curr_slot] = - prev_bkt->sig_alt[prev_slot]; - curr_bkt->key_idx[curr_slot] - = prev_bkt->key_idx[prev_slot]; - - curr_slot = prev_slot; - curr_node = prev_node; - curr_bkt = curr_node->bkt; - } - - curr_bkt->sig_current[curr_slot] = sig; - curr_bkt->sig_alt[curr_slot] = alt_hash; - curr_bkt->key_idx[curr_slot] = new_idx; - - rte_xend(); - - return 0; - } - - /* If we abort we give up this cuckoo path, since most likely it's - * no longer valid as TSX detected data conflict - */ - try++; - rte_pause(); - } - - return -1; -} - -/* - * Make space for new key, using bfs Cuckoo Search and Multi-Writer safe - * Cuckoo - */ -static inline int -rte_hash_cuckoo_make_space_mw_tm(const struct rte_hash *h, - struct rte_hash_bucket *bkt, - hash_sig_t sig, hash_sig_t alt_hash, - uint32_t new_idx) -{ - unsigned i; - struct queue_node queue[RTE_HASH_BFS_QUEUE_MAX_LEN]; - struct queue_node *tail, *head; - struct rte_hash_bucket *curr_bkt, *alt_bkt; - - tail = queue; - head = queue + 1; - tail->bkt = bkt; - tail->prev = NULL; - tail->prev_slot = -1; - - /* Cuckoo bfs Search */ - while (likely(tail != head && head < - queue + RTE_HASH_BFS_QUEUE_MAX_LEN - - RTE_HASH_BUCKET_ENTRIES)) { - curr_bkt = tail->bkt; - for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { - if (curr_bkt->key_idx[i] == EMPTY_SLOT) { - if (likely(rte_hash_cuckoo_move_insert_mw_tm(h, - tail, i, sig, - alt_hash, new_idx) == 0)) - return 0; - } - - /* Enqueue new node and keep prev node info */ - alt_bkt = &(h->buckets[curr_bkt->sig_alt[i] - & h->bucket_bitmask]); - head->bkt = alt_bkt; - head->prev = tail; - head->prev_slot = i; - head++; - } - tail++; - } - - return -ENOSPC; -} diff --git a/lib/librte_hash/rte_hash.h b/lib/librte_hash/rte_hash.h index f71ca9fb..9e7d9315 100644 --- a/lib/librte_hash/rte_hash.h +++ b/lib/librte_hash/rte_hash.h @@ -34,6 +34,9 @@ extern "C" { /** Default behavior of insertion, single writer/multi writer */ #define RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD 0x02 +/** Flag to support reader writer concurrency */ +#define RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY 0x04 + /** Signature of key that is stored internally. 
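The new RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY flag is consumed by the create path shown earlier (it implicitly turns on multi-writer support). A minimal creation sketch; parameter values are illustrative:

#include <rte_hash.h>
#include <rte_jhash.h>
#include <rte_lcore.h>

struct rte_hash_parameters params = {
	.name = "concurrent_table",
	.entries = 1024,
	.key_len = sizeof(uint32_t),
	.hash_func = rte_jhash,
	.socket_id = rte_socket_id(),
	.extra_flag = RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY,
};
struct rte_hash *h = rte_hash_create(&params);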
*/ typedef uint32_t hash_sig_t; @@ -124,9 +127,22 @@ void rte_hash_reset(struct rte_hash *h); /** + * Return the number of keys in the hash table. + * @param h + * Hash table to query from + * @return + * - -EINVAL if parameters are invalid + * - A value indicating how many keys were inserted in the table. + */ +int32_t +rte_hash_count(const struct rte_hash *h); + +/** * Add a key-value pair to an existing hash table. * This operation is not multi-thread safe - * and should only be called from one thread. + * and should only be called from one thread by default. + * Thread safety can be enabled by setting a flag during + * table creation. * * @param h * Hash table to add the key to. @@ -146,7 +162,9 @@ rte_hash_add_key_data(const struct rte_hash *h, const void *key, void *data); * Add a key-value pair with a pre-computed hash value * to an existing hash table. * This operation is not multi-thread safe - * and should only be called from one thread. + * and should only be called from one thread by default. + * Thread safety can be enabled by setting a flag during + * table creation. * * @param h * Hash table to add the key to. @@ -167,7 +185,9 @@ rte_hash_add_key_with_hash_data(const struct rte_hash *h, const void *key, /** * Add a key to an existing hash table. This operation is not multi-thread safe - * and should only be called from one thread. + * and should only be called from one thread by default. + * Thread safety can be enabled by setting a flag during + * table creation. * * @param h * Hash table to add the key to. @@ -185,7 +205,9 @@ rte_hash_add_key(const struct rte_hash *h, const void *key); /** * Add a key to an existing hash table. * This operation is not multi-thread safe - * and should only be called from one thread. + * and should only be called from one thread by default. + * Thread safety can be enabled by setting a flag during + * table creation. * * @param h * Hash table to add the key to. @@ -205,7 +227,9 @@ rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key, hash_sig_t /** * Remove a key from an existing hash table. * This operation is not multi-thread safe - * and should only be called from one thread. + * and should only be called from one thread by default. + * Thread safety can be enabled by setting a flag during + * table creation. * * @param h * Hash table to remove the key from. @@ -224,7 +248,9 @@ rte_hash_del_key(const struct rte_hash *h, const void *key); /** * Remove a key from an existing hash table. * This operation is not multi-thread safe - * and should only be called from one thread. + * and should only be called from one thread by default. + * Thread safety can be enabled by setting a flag during + * table creation. * * @param h * Hash table to remove the key from. @@ -244,7 +270,9 @@ rte_hash_del_key_with_hash(const struct rte_hash *h, const void *key, hash_sig_t /** * Find a key in the hash table given the position. - * This operation is multi-thread safe. + * This operation is multi-thread safe with regard to other lookup threads. + * Read-write concurrency can be enabled by setting a flag during + * table creation. * * @param h * Hash table to get the key from. @@ -254,8 +282,8 @@ rte_hash_del_key_with_hash(const struct rte_hash *h, const void *key, hash_sig_t * Output containing a pointer to the key * @return * - 0 if retrieved successfully - * - EINVAL if the parameters are invalid. - * - ENOENT if no valid key is found in the given position. + * - -EINVAL if the parameters are invalid.
+ * - -ENOENT if no valid key is found in the given position. */ int rte_hash_get_key_with_position(const struct rte_hash *h, const int32_t position, @@ -263,7 +291,9 @@ rte_hash_get_key_with_position(const struct rte_hash *h, const int32_t position, /** * Find a key-value pair in the hash table. - * This operation is multi-thread safe. + * This operation is multi-thread safe with regard to other lookup threads. + * Read-write concurrency can be enabled by setting a flag during + * table creation. * * @param h * Hash table to look in. @@ -272,9 +302,11 @@ rte_hash_get_key_with_position(const struct rte_hash *h, const int32_t position, * @param data * Output with pointer to data returned from the hash table. * @return - * 0 if successful lookup - * - EINVAL if the parameters are invalid. - * - ENOENT if the key is not found. + * - A positive value that can be used by the caller as an offset into an + * array of user data. This value is unique for this key, and is the same + * value that was returned when the key was added. + * - -EINVAL if the parameters are invalid. + * - -ENOENT if the key is not found. */ int rte_hash_lookup_data(const struct rte_hash *h, const void *key, void **data); @@ -282,7 +314,9 @@ rte_hash_lookup_data(const struct rte_hash *h, const void *key, void **data); /** * Find a key-value pair with a pre-computed hash value * to an existing hash table. - * This operation is multi-thread safe. + * This operation is multi-thread safe with regard to other lookup threads. + * Read-write concurrency can be enabled by setting a flag during + * table creation. * * @param h * Hash table to look in. @@ -293,9 +327,11 @@ rte_hash_lookup_data(const struct rte_hash *h, const void *key, void **data); * @param data * Output with pointer to data returned from the hash table. * @return - * 0 if successful lookup - * - EINVAL if the parameters are invalid. - * - ENOENT if the key is not found. + * - A positive value that can be used by the caller as an offset into an + * array of user data. This value is unique for this key, and is the same + * value that was returned when the key was added. + * - -EINVAL if the parameters are invalid. + * - -ENOENT if the key is not found. */ int rte_hash_lookup_with_hash_data(const struct rte_hash *h, const void *key, @@ -303,7 +339,9 @@ rte_hash_lookup_with_hash_data(const struct rte_hash *h, const void *key, /** * Find a key in the hash table. - * This operation is multi-thread safe. + * This operation is multi-thread safe with regard to other lookup threads. + * Read-write concurrency can be enabled by setting a flag during + * table creation. * * @param h * Hash table to look in. @@ -321,7 +359,9 @@ rte_hash_lookup(const struct rte_hash *h, const void *key); /** * Find a key in the hash table. - * This operation is multi-thread safe. + * This operation is multi-thread safe with regard to other lookup threads. + * Read-write concurrency can be enabled by setting a flag during + * table creation. * * @param h * Hash table to look in. @@ -356,7 +396,9 @@ rte_hash_hash(const struct rte_hash *h, const void *key); /** * Find multiple keys in the hash table. - * This operation is multi-thread safe. + * This operation is multi-thread safe with regard to other lookup threads. + * Read-write concurrency can be enabled by setting a flag during + * table creation. * * @param h * Hash table to look in. @@ -377,7 +419,9 @@ rte_hash_lookup_bulk_data(const struct rte_hash *h, const void **keys, /** * Find multiple keys in the hash table.
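Matching the clarified return contract above, a lookup can distinguish the stored position from a miss; use_flow() is a hypothetical consumer:

void *data;
int ret = rte_hash_lookup_data(h, &flow_key, &data);
if (ret >= 0)
	use_flow(ret, data);	/* ret equals the position add returned */
else if (ret == -ENOENT)
	;	/* key not present */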
- * This operation is multi-thread safe. + * This operation is multi-thread safe with regard to other lookup threads. + * Read-write concurrency can be enabled by setting a flag during + * table creation. * * @param h * Hash table to look in. diff --git a/lib/librte_hash/rte_hash_version.map b/lib/librte_hash/rte_hash_version.map index 52a2576f..e216ac8e 100644 --- a/lib/librte_hash/rte_hash_version.map +++ b/lib/librte_hash/rte_hash_version.map @@ -45,3 +45,11 @@ DPDK_16.07 { rte_hash_get_key_with_position; } DPDK_2.2; + + +DPDK_18.08 { + global: + + rte_hash_count; + +} DPDK_16.07; diff --git a/lib/librte_kni/meson.build b/lib/librte_kni/meson.build index c4b21961..a738a033 100644 --- a/lib/librte_kni/meson.build +++ b/lib/librte_kni/meson.build @@ -1,7 +1,7 @@ # SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2017 Intel Corporation -if host_machine.system() != 'linux' +if host_machine.system() != 'linux' or cc.sizeof('void *') == 4 build = false endif version = 2 diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c index 8a8f6c1c..65f6a2b0 100644 --- a/lib/librte_kni/rte_kni.c +++ b/lib/librte_kni/rte_kni.c @@ -715,6 +715,9 @@ rte_kni_get(const char *name) struct rte_kni_memzone_slot *it; struct rte_kni *kni; + if (name == NULL || name[0] == '\0') + return NULL; + /* Note: could be improved perf-wise if necessary */ for (i = 0; i < kni_memzone_pool.max_ifaces; i++) { it = &kni_memzone_pool.slots[i]; diff --git a/lib/librte_kvargs/Makefile b/lib/librte_kvargs/Makefile index 39d5ac33..87593954 100644 --- a/lib/librte_kvargs/Makefile +++ b/lib/librte_kvargs/Makefile @@ -7,7 +7,7 @@ include $(RTE_SDK)/mk/rte.vars.mk LIB = librte_kvargs.a CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -LDLIBS += -lrte_eal +CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common/include EXPORT_MAP := rte_kvargs_version.map diff --git a/lib/librte_kvargs/meson.build b/lib/librte_kvargs/meson.build index 0c5b9cb2..acd3e543 100644 --- a/lib/librte_kvargs/meson.build +++ b/lib/librte_kvargs/meson.build @@ -1,6 +1,11 @@ # SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2017 Intel Corporation +includes = [global_inc] +includes += include_directories('../librte_eal/common/include') + version = 1 sources = files('rte_kvargs.c') headers = files('rte_kvargs.h') + +deps += 'compat' diff --git a/lib/librte_kvargs/rte_kvargs.c b/lib/librte_kvargs/rte_kvargs.c index d92a5f9d..a28f7694 100644 --- a/lib/librte_kvargs/rte_kvargs.c +++ b/lib/librte_kvargs/rte_kvargs.c @@ -6,7 +6,6 @@ #include <string.h> #include <stdlib.h> -#include <rte_log.h> #include <rte_string_fns.h> #include "rte_kvargs.h" @@ -28,29 +27,22 @@ rte_kvargs_tokenize(struct rte_kvargs *kvlist, const char *params) * to pass to rte_strsplit */ kvlist->str = strdup(params); - if (kvlist->str == NULL) { - RTE_LOG(ERR, PMD, "Cannot parse arguments: not enough memory\n"); + if (kvlist->str == NULL) return -1; - } /* browse each key/value pair and add it in kvlist */ str = kvlist->str; while ((str = strtok_r(str, RTE_KVARGS_PAIRS_DELIM, &ctx1)) != NULL) { i = kvlist->count; - if (i >= RTE_KVARGS_MAX) { - RTE_LOG(ERR, PMD, "Cannot parse arguments: list full\n"); + if (i >= RTE_KVARGS_MAX) return -1; - } kvlist->pairs[i].key = strtok_r(str, RTE_KVARGS_KV_DELIM, &ctx2); kvlist->pairs[i].value = strtok_r(NULL, RTE_KVARGS_KV_DELIM, &ctx2); - if (kvlist->pairs[i].key == NULL || kvlist->pairs[i].value == NULL) { - RTE_LOG(ERR, PMD, - "Cannot parse arguments: wrong key or value\n" - "params=<%s>\n", params); + if (kvlist->pairs[i].key == NULL ||
kvlist->pairs[i].value == NULL) return -1; - } kvlist->count++; str = NULL; @@ -89,12 +81,8 @@ check_for_valid_keys(struct rte_kvargs *kvlist, for (i = 0; i < kvlist->count; i++) { pair = &kvlist->pairs[i]; ret = is_valid_key(valid, pair->key); - if (!ret) { - RTE_LOG(ERR, PMD, - "Error parsing device, invalid key <%s>\n", - pair->key); + if (!ret) return -1; - } } return 0; } @@ -180,3 +168,38 @@ rte_kvargs_parse(const char *args, const char * const valid_keys[]) return kvlist; } + +__rte_experimental +struct rte_kvargs * +rte_kvargs_parse_delim(const char *args, const char * const valid_keys[], + const char *valid_ends) +{ + struct rte_kvargs *kvlist = NULL; + char *copy; + size_t len; + + if (valid_ends == NULL) + return rte_kvargs_parse(args, valid_keys); + + copy = strdup(args); + if (copy == NULL) + return NULL; + + len = strcspn(copy, valid_ends); + copy[len] = '\0'; + + kvlist = rte_kvargs_parse(copy, valid_keys); + + free(copy); + return kvlist; +} + +__rte_experimental +int +rte_kvargs_strcmp(const char *key __rte_unused, + const char *value, void *opaque) +{ + const char *str = opaque; + + return -abs(strcmp(str, value)); +} diff --git a/lib/librte_kvargs/rte_kvargs.h b/lib/librte_kvargs/rte_kvargs.h index 51b8120b..fc041956 100644 --- a/lib/librte_kvargs/rte_kvargs.h +++ b/lib/librte_kvargs/rte_kvargs.h @@ -25,6 +25,8 @@ extern "C" { #endif +#include <rte_compat.h> + /** Maximum number of key/value associations */ #define RTE_KVARGS_MAX 32 @@ -72,6 +74,36 @@ struct rte_kvargs *rte_kvargs_parse(const char *args, const char *const valid_keys[]); /** + * Allocate a rte_kvargs and store key/value associations from a string. + * This version will consider any byte from valid_ends as a possible + * terminating character, and will not parse beyond the first occurrence + * of any of them. + * + * The function allocates and fills an rte_kvargs structure from a given + * string whose format is key1=value1,key2=value2,... + * + * The structure can be freed with rte_kvargs_free(). + * + * @param args + * The input string containing the key/value associations + * + * @param valid_keys + * A list of valid keys (table of const char *, the last must be NULL). + * This argument is ignored if NULL + * + * @param valid_ends + * Acceptable terminating characters. + * If NULL, the behavior is the same as ``rte_kvargs_parse``. + * + * @return + * - A pointer to an allocated rte_kvargs structure on success + * - NULL on error + */ +__rte_experimental +struct rte_kvargs *rte_kvargs_parse_delim(const char *args, + const char *const valid_keys[], + const char *valid_ends); + +/** * Free a rte_kvargs structure * * Free a rte_kvargs structure previously allocated with @@ -121,6 +153,32 @@ int rte_kvargs_process(const struct rte_kvargs *kvlist, unsigned rte_kvargs_count(const struct rte_kvargs *kvlist, const char *key_match); +/** + * Generic kvarg handler for string comparison. + * + * This function can be used for generic string comparison processing + * on a list of kvargs. + * + * @param key + * kvarg pair key. + * + * @param value + * kvarg pair value. + * + * @param opaque + * Opaque pointer to a string. + * + * @return + * 0 if the strings match. + * !0 otherwise or on error. + * + * Unlike strcmp, comparison ordering is not kept. + * In order for rte_kvargs_process to stop processing on a match error, + * a negative value is returned even if strcmp had returned a positive one.
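The two new kvargs helpers compose with rte_kvargs_process(); the argument string (devargs_str), keys, and match value below are invented for illustration:

static const char * const keys[] = { "class", "name", NULL };
char match[] = "eth";

/* Stop tokenizing at the first '/', e.g. "class=eth,name=x/ignored". */
struct rte_kvargs *kv = rte_kvargs_parse_delim(devargs_str, keys, "/");

if (kv != NULL &&
		rte_kvargs_process(kv, "class", rte_kvargs_strcmp, match) == 0) {
	/* every "class" value compared equal to "eth" */
}
rte_kvargs_free(kv);	/* safe on NULL */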
+ */ +__rte_experimental +int rte_kvargs_strcmp(const char *key, const char *value, void *opaque); + #ifdef __cplusplus } #endif diff --git a/lib/librte_kvargs/rte_kvargs_version.map b/lib/librte_kvargs/rte_kvargs_version.map index 2030ec46..8f4b4e3f 100644 --- a/lib/librte_kvargs/rte_kvargs_version.map +++ b/lib/librte_kvargs/rte_kvargs_version.map @@ -8,3 +8,11 @@ DPDK_2.0 { local: *; }; + +EXPERIMENTAL { + global: + + rte_kvargs_parse_delim; + rte_kvargs_strcmp; + +} DPDK_2.0; diff --git a/lib/librte_latencystats/rte_latencystats.c b/lib/librte_latencystats/rte_latencystats.c index 46c69bf0..1fdec68e 100644 --- a/lib/librte_latencystats/rte_latencystats.c +++ b/lib/librte_latencystats/rte_latencystats.c @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2017 Intel Corporation + * Copyright(c) 2018 Intel Corporation */ #include <unistd.h> @@ -265,6 +265,7 @@ rte_latencystats_uninit(void) uint16_t qid; int ret = 0; struct rxtx_cbs *cbs = NULL; + const struct rte_memzone *mz = NULL; /** De register Rx/Tx callbacks */ RTE_ETH_FOREACH_DEV(pid) { @@ -288,6 +289,11 @@ rte_latencystats_uninit(void) } } + /* free up the memzone */ + mz = rte_memzone_lookup(MZ_RTE_LATENCY_STATS); + if (mz) + rte_memzone_free(mz); + return 0; } diff --git a/lib/librte_mbuf/Makefile b/lib/librte_mbuf/Makefile index 8749a00f..e2b98a25 100644 --- a/lib/librte_mbuf/Makefile +++ b/lib/librte_mbuf/Makefile @@ -6,7 +6,6 @@ include $(RTE_SDK)/mk/rte.vars.mk # library name LIB = librte_mbuf.a -CFLAGS += -DALLOW_EXPERIMENTAL_API CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 LDLIBS += -lrte_eal -lrte_mempool diff --git a/lib/librte_mbuf/meson.build b/lib/librte_mbuf/meson.build index 869c17c1..45ffb0db 100644 --- a/lib/librte_mbuf/meson.build +++ b/lib/librte_mbuf/meson.build @@ -2,7 +2,6 @@ # Copyright(c) 2017 Intel Corporation version = 3 -allow_experimental_apis = true sources = files('rte_mbuf.c', 'rte_mbuf_ptype.c', 'rte_mbuf_pool_ops.c') headers = files('rte_mbuf.h', 'rte_mbuf_ptype.h', 'rte_mbuf_pool_ops.h') deps += ['mempool'] diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c index fca580ef..e714c5a5 100644 --- a/lib/librte_mbuf/rte_mbuf.c +++ b/lib/librte_mbuf/rte_mbuf.c @@ -107,7 +107,7 @@ rte_pktmbuf_init(struct rte_mempool *mp, } /* Helper to create a mbuf pool with given mempool ops name*/ -struct rte_mempool * __rte_experimental +struct rte_mempool * rte_pktmbuf_pool_create_by_ops(const char *name, unsigned int n, unsigned int cache_size, uint16_t priv_size, uint16_t data_room_size, int socket_id, const char *ops_name) diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h index 8e6b4d29..9ce5d76d 100644 --- a/lib/librte_mbuf/rte_mbuf.h +++ b/lib/librte_mbuf/rte_mbuf.h @@ -729,6 +729,24 @@ rte_mbuf_to_baddr(struct rte_mbuf *md) } /** + * Return the starting address of the private data area embedded in + * the given mbuf. + * + * Note that no check is made to ensure that a private data area + * actually exists in the supplied mbuf. + * + * @param m + * The pointer to the mbuf. + * @return + * The starting address of the private data area of the given mbuf. + */ +static inline void * __rte_experimental +rte_mbuf_to_priv(struct rte_mbuf *m) +{ + return RTE_PTR_ADD(m, sizeof(struct rte_mbuf)); +} + +/** * Returns TRUE if given mbuf is cloned by mbuf indirection, or FALSE * otherwise. 
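A usage note on the new rte_mbuf_to_priv() helper added above: it only makes sense for pools created with a non-zero priv_size, which, as its doc comment warns, the function does not verify. A minimal sketch, assuming a hypothetical my_meta structure and pool name:

	struct my_meta { uint64_t flow_id; };	/* hypothetical per-mbuf metadata */

	struct rte_mempool *mp = rte_pktmbuf_pool_create("meta_pool", 8192, 256,
		sizeof(struct my_meta), RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
	struct rte_mbuf *m = (mp != NULL) ? rte_pktmbuf_alloc(mp) : NULL;

	if (m != NULL) {
		struct my_meta *meta = rte_mbuf_to_priv(m);

		meta->flow_id = 42;	/* valid only because priv_size > 0 above */
		rte_pktmbuf_free(m);
	}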
* @@ -1146,7 +1164,7 @@ rte_pktmbuf_pool_create(const char *name, unsigned n, * - EEXIST - a memzone with the same name already exists * - ENOMEM - no appropriate memory area found in which to create memzone */ -struct rte_mempool * __rte_experimental +struct rte_mempool * rte_pktmbuf_pool_create_by_ops(const char *name, unsigned int n, unsigned int cache_size, uint16_t priv_size, uint16_t data_room_size, int socket_id, const char *ops_name); diff --git a/lib/librte_mbuf/rte_mbuf_pool_ops.c b/lib/librte_mbuf/rte_mbuf_pool_ops.c index a1d4699f..5722976f 100644 --- a/lib/librte_mbuf/rte_mbuf_pool_ops.c +++ b/lib/librte_mbuf/rte_mbuf_pool_ops.c @@ -9,7 +9,7 @@ #include <rte_errno.h> #include <rte_mbuf_pool_ops.h> -int __rte_experimental +int rte_mbuf_set_platform_mempool_ops(const char *ops_name) { const struct rte_memzone *mz; @@ -35,7 +35,7 @@ rte_mbuf_set_platform_mempool_ops(const char *ops_name) return -EEXIST; } -const char * __rte_experimental +const char * rte_mbuf_platform_mempool_ops(void) { const struct rte_memzone *mz; @@ -46,7 +46,7 @@ rte_mbuf_platform_mempool_ops(void) return mz->addr; } -int __rte_experimental +int rte_mbuf_set_user_mempool_ops(const char *ops_name) { const struct rte_memzone *mz; @@ -67,7 +67,7 @@ rte_mbuf_set_user_mempool_ops(const char *ops_name) } -const char * __rte_experimental +const char * rte_mbuf_user_mempool_ops(void) { const struct rte_memzone *mz; @@ -79,7 +79,7 @@ rte_mbuf_user_mempool_ops(void) } /* Return mbuf pool ops name */ -const char * __rte_experimental +const char * rte_mbuf_best_mempool_ops(void) { /* User defined mempool ops takes the priority */ diff --git a/lib/librte_mbuf/rte_mbuf_pool_ops.h b/lib/librte_mbuf/rte_mbuf_pool_ops.h index ebf5bf0f..7ed95a49 100644 --- a/lib/librte_mbuf/rte_mbuf_pool_ops.h +++ b/lib/librte_mbuf/rte_mbuf_pool_ops.h @@ -12,9 +12,6 @@ * These APIs are for configuring the mbuf pool ops names to be largely used by * rte_pktmbuf_pool_create(). However, this can also be used to set and inquire * the best mempool ops available. - * - * @warning - * @b EXPERIMENTAL: this API may change without prior notice */ #include <rte_compat.h> @@ -34,7 +31,7 @@ extern "C" { * - On success, zero. * - On failure, a negative value. */ -int __rte_experimental +int rte_mbuf_set_platform_mempool_ops(const char *ops_name); /** @@ -46,7 +43,7 @@ rte_mbuf_set_platform_mempool_ops(const char *ops_name); * - On success, platform pool ops name. * - On failure, NULL. */ -const char * __rte_experimental +const char * rte_mbuf_platform_mempool_ops(void); /** @@ -60,7 +57,7 @@ rte_mbuf_platform_mempool_ops(void); * - On success, zero. * - On failure, a negative value. */ -int __rte_experimental +int rte_mbuf_set_user_mempool_ops(const char *ops_name); /** @@ -72,7 +69,7 @@ rte_mbuf_set_user_mempool_ops(const char *ops_name); * - On success, user pool ops name.. * - On failure, NULL. */ -const char * __rte_experimental +const char * rte_mbuf_user_mempool_ops(void); /** @@ -87,7 +84,7 @@ rte_mbuf_user_mempool_ops(void); * @return * returns preferred mbuf pool ops name */ -const char * __rte_experimental +const char * rte_mbuf_best_mempool_ops(void); diff --git a/lib/librte_mbuf/rte_mbuf_ptype.h b/lib/librte_mbuf/rte_mbuf_ptype.h index 79ea3142..01acc66e 100644 --- a/lib/librte_mbuf/rte_mbuf_ptype.h +++ b/lib/librte_mbuf/rte_mbuf_ptype.h @@ -653,9 +653,9 @@ extern "C" { #define RTE_ETH_IS_IPV4_HDR(ptype) ((ptype) & RTE_PTYPE_L3_IPV4) /** - * Check if the (outer) L3 header is IPv4. 
To avoid comparing IPv4 types one by - * one, bit 6 is selected to be used for IPv4 only. Then checking bit 6 can - * determine if it is an IPV4 packet. + * Check if the (outer) L3 header is IPv6. To avoid comparing IPv6 types one by + * one, bit 6 is selected to be used for IPv6 only. Then checking bit 6 can + * determine if it is an IPV6 packet. */ #define RTE_ETH_IS_IPV6_HDR(ptype) ((ptype) & RTE_PTYPE_L3_IPV6) diff --git a/lib/librte_mbuf/rte_mbuf_version.map b/lib/librte_mbuf/rte_mbuf_version.map index 1bb9538d..cae68db8 100644 --- a/lib/librte_mbuf/rte_mbuf_version.map +++ b/lib/librte_mbuf/rte_mbuf_version.map @@ -35,7 +35,7 @@ DPDK_16.11 { } DPDK_2.1; -EXPERIMENTAL { +DPDK_18.08 { global: rte_mbuf_best_mempool_ops; @@ -44,4 +44,4 @@ EXPERIMENTAL { rte_mbuf_set_user_mempool_ops; rte_mbuf_user_mempool_ops; rte_pktmbuf_pool_create_by_ops; -}; +} DPDK_16.11; diff --git a/lib/librte_member/rte_member.c b/lib/librte_member/rte_member.c index e147dd1f..702c01d3 100644 --- a/lib/librte_member/rte_member.c +++ b/lib/librte_member/rte_member.c @@ -297,10 +297,7 @@ rte_member_reset(const struct rte_member_setsum *setsum) } } -RTE_INIT(librte_member_init_log); - -static void -librte_member_init_log(void) +RTE_INIT(librte_member_init_log) { librte_member_logtype = rte_log_register("lib.member"); if (librte_member_logtype >= 0) diff --git a/lib/librte_mempool/Makefile b/lib/librte_mempool/Makefile index e3c32b14..20bf63fb 100644 --- a/lib/librte_mempool/Makefile +++ b/lib/librte_mempool/Makefile @@ -7,15 +7,12 @@ include $(RTE_SDK)/mk/rte.vars.mk LIB = librte_mempool.a CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -# Allow deprecated symbol to use deprecated rte_mempool_populate_iova_tab() -# from earlier deprecated rte_mempool_populate_phys_tab() -CFLAGS += -Wno-deprecated-declarations CFLAGS += -DALLOW_EXPERIMENTAL_API LDLIBS += -lrte_eal -lrte_ring EXPORT_MAP := rte_mempool_version.map -LIBABIVER := 4 +LIBABIVER := 5 # memseg walk is not yet part of stable API CFLAGS += -DALLOW_EXPERIMENTAL_API diff --git a/lib/librte_mempool/meson.build b/lib/librte_mempool/meson.build index d507e551..38d7ae89 100644 --- a/lib/librte_mempool/meson.build +++ b/lib/librte_mempool/meson.build @@ -5,17 +5,13 @@ allow_experimental_apis = true extra_flags = [] -# Allow deprecated symbol to use deprecated rte_mempool_populate_iova_tab() -# from earlier deprecated rte_mempool_populate_phys_tab() -extra_flags += '-Wno-deprecated-declarations' - foreach flag: extra_flags if cc.has_argument(flag) cflags += flag endif endforeach -version = 4 +version = 5 sources = files('rte_mempool.c', 'rte_mempool_ops.c', 'rte_mempool_ops_default.c') headers = files('rte_mempool.h') diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c index 8c8b9f80..03e6b5f7 100644 --- a/lib/librte_mempool/rte_mempool.c +++ b/lib/librte_mempool/rte_mempool.c @@ -225,93 +225,6 @@ rte_mempool_calc_obj_size(uint32_t elt_size, uint32_t flags, return sz->total_size; } - -/* - * Internal function to calculate required memory chunk size shared - * by default implementation of the corresponding callback and - * deprecated external function. 
- */ -size_t -rte_mempool_calc_mem_size_helper(uint32_t elt_num, size_t total_elt_sz, - uint32_t pg_shift) -{ - size_t obj_per_page, pg_num, pg_sz; - - if (total_elt_sz == 0) - return 0; - - if (pg_shift == 0) - return total_elt_sz * elt_num; - - pg_sz = (size_t)1 << pg_shift; - obj_per_page = pg_sz / total_elt_sz; - if (obj_per_page == 0) - return RTE_ALIGN_CEIL(total_elt_sz, pg_sz) * elt_num; - - pg_num = (elt_num + obj_per_page - 1) / obj_per_page; - return pg_num << pg_shift; -} - -/* - * Calculate maximum amount of memory required to store given number of objects. - */ -size_t -rte_mempool_xmem_size(uint32_t elt_num, size_t total_elt_sz, uint32_t pg_shift, - __rte_unused unsigned int flags) -{ - return rte_mempool_calc_mem_size_helper(elt_num, total_elt_sz, - pg_shift); -} - -/* - * Calculate how much memory would be actually required with the - * given memory footprint to store required number of elements. - */ -ssize_t -rte_mempool_xmem_usage(__rte_unused void *vaddr, uint32_t elt_num, - size_t total_elt_sz, const rte_iova_t iova[], uint32_t pg_num, - uint32_t pg_shift, __rte_unused unsigned int flags) -{ - uint32_t elt_cnt = 0; - rte_iova_t start, end; - uint32_t iova_idx; - size_t pg_sz = (size_t)1 << pg_shift; - - /* if iova is NULL, assume contiguous memory */ - if (iova == NULL) { - start = 0; - end = pg_sz * pg_num; - iova_idx = pg_num; - } else { - start = iova[0]; - end = iova[0] + pg_sz; - iova_idx = 1; - } - while (elt_cnt < elt_num) { - - if (end - start >= total_elt_sz) { - /* enough contiguous memory, add an object */ - start += total_elt_sz; - elt_cnt++; - } else if (iova_idx < pg_num) { - /* no room to store one obj, add a page */ - if (end == iova[iova_idx]) { - end += pg_sz; - } else { - start = iova[iova_idx]; - end = iova[iova_idx] + pg_sz; - } - iova_idx++; - - } else { - /* no more page, return how many elements fit */ - return -(size_t)elt_cnt; - } - } - - return (size_t)iova_idx << pg_shift; -} - /* free a memchunk allocated with rte_memzone_reserve() */ static void rte_mempool_memchunk_mz_free(__rte_unused struct rte_mempool_memhdr *memhdr, @@ -423,63 +336,6 @@ fail: return ret; } -int -rte_mempool_populate_phys(struct rte_mempool *mp, char *vaddr, - phys_addr_t paddr, size_t len, rte_mempool_memchunk_free_cb_t *free_cb, - void *opaque) -{ - return rte_mempool_populate_iova(mp, vaddr, paddr, len, free_cb, opaque); -} - -/* Add objects in the pool, using a table of physical pages. Return the - * number of objects added, or a negative value on error. 
- */ -int -rte_mempool_populate_iova_tab(struct rte_mempool *mp, char *vaddr, - const rte_iova_t iova[], uint32_t pg_num, uint32_t pg_shift, - rte_mempool_memchunk_free_cb_t *free_cb, void *opaque) -{ - uint32_t i, n; - int ret, cnt = 0; - size_t pg_sz = (size_t)1 << pg_shift; - - /* mempool must not be populated */ - if (mp->nb_mem_chunks != 0) - return -EEXIST; - - if (mp->flags & MEMPOOL_F_NO_IOVA_CONTIG) - return rte_mempool_populate_iova(mp, vaddr, RTE_BAD_IOVA, - pg_num * pg_sz, free_cb, opaque); - - for (i = 0; i < pg_num && mp->populated_size < mp->size; i += n) { - - /* populate with the largest group of contiguous pages */ - for (n = 1; (i + n) < pg_num && - iova[i + n - 1] + pg_sz == iova[i + n]; n++) - ; - - ret = rte_mempool_populate_iova(mp, vaddr + i * pg_sz, - iova[i], n * pg_sz, free_cb, opaque); - if (ret < 0) { - rte_mempool_free_memchunks(mp); - return ret; - } - /* no need to call the free callback for next chunks */ - free_cb = NULL; - cnt += ret; - } - return cnt; -} - -int -rte_mempool_populate_phys_tab(struct rte_mempool *mp, char *vaddr, - const phys_addr_t paddr[], uint32_t pg_num, uint32_t pg_shift, - rte_mempool_memchunk_free_cb_t *free_cb, void *opaque) -{ - return rte_mempool_populate_iova_tab(mp, vaddr, paddr, pg_num, pg_shift, - free_cb, opaque); -} - /* Populate the mempool with a virtual area. Return the number of * objects added, or a negative value on error. */ @@ -916,6 +772,12 @@ rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size, mempool_list = RTE_TAILQ_CAST(rte_mempool_tailq.head, rte_mempool_list); + /* asked for zero items */ + if (n == 0) { + rte_errno = EINVAL; + return NULL; + } + /* asked cache too big */ if (cache_size > RTE_MEMPOOL_CACHE_MAX_SIZE || CALC_CACHE_FLUSHTHRESH(cache_size) > n) { @@ -1065,66 +927,6 @@ rte_mempool_create(const char *name, unsigned n, unsigned elt_size, return NULL; } -/* - * Create the mempool over already allocated chunk of memory. - * That external memory buffer can consists of physically disjoint pages. - * Setting vaddr to NULL, makes mempool to fallback to rte_mempool_create() - * behavior. - */ -struct rte_mempool * -rte_mempool_xmem_create(const char *name, unsigned n, unsigned elt_size, - unsigned cache_size, unsigned private_data_size, - rte_mempool_ctor_t *mp_init, void *mp_init_arg, - rte_mempool_obj_cb_t *obj_init, void *obj_init_arg, - int socket_id, unsigned flags, void *vaddr, - const rte_iova_t iova[], uint32_t pg_num, uint32_t pg_shift) -{ - struct rte_mempool *mp = NULL; - int ret; - - /* no virtual address supplied, use rte_mempool_create() */ - if (vaddr == NULL) - return rte_mempool_create(name, n, elt_size, cache_size, - private_data_size, mp_init, mp_init_arg, - obj_init, obj_init_arg, socket_id, flags); - - /* check that we have both VA and PA */ - if (iova == NULL) { - rte_errno = EINVAL; - return NULL; - } - - /* Check that pg_shift parameter is valid. 
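As an aside, the new zero-count guard in rte_mempool_create_empty() shown above changes what a caller observes for a degenerate request; a sketch (pool name illustrative):

	struct rte_mempool *mp = rte_mempool_create_empty("bad_pool", 0, 64,
		0, 0, rte_socket_id(), 0);

	/* mp is NULL here and rte_errno is EINVAL: zero objects were requested */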
*/ - if (pg_shift > MEMPOOL_PG_SHIFT_MAX) { - rte_errno = EINVAL; - return NULL; - } - - mp = rte_mempool_create_empty(name, n, elt_size, cache_size, - private_data_size, socket_id, flags); - if (mp == NULL) - return NULL; - - /* call the mempool priv initializer */ - if (mp_init) - mp_init(mp, mp_init_arg); - - ret = rte_mempool_populate_iova_tab(mp, vaddr, iova, pg_num, pg_shift, - NULL, NULL); - if (ret < 0 || ret != (int)mp->size) - goto fail; - - /* call the object initializers */ - if (obj_init) - rte_mempool_obj_iter(mp, obj_init, obj_init_arg); - - return mp; - - fail: - rte_mempool_free(mp); - return NULL; -} - /* Return the number of entries in the mempool */ unsigned int rte_mempool_avail_count(const struct rte_mempool *mp) diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h index 1f59553b..7c9cd9a2 100644 --- a/lib/librte_mempool/rte_mempool.h +++ b/lib/librte_mempool/rte_mempool.h @@ -488,28 +488,6 @@ ssize_t rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp, size_t *min_chunk_size, size_t *align); /** - * @internal Helper function to calculate memory size required to store - * specified number of objects in assumption that the memory buffer will - * be aligned at page boundary. - * - * Note that if object size is bigger than page size, then it assumes - * that pages are grouped in subsets of physically continuous pages big - * enough to store at least one object. - * - * @param elt_num - * Number of elements. - * @param total_elt_sz - * The size of each element, including header and trailer, as returned - * by rte_mempool_calc_obj_size(). - * @param pg_shift - * LOG2 of the physical pages size. If set to 0, ignore page boundaries. - * @return - * Required memory size aligned at page boundary. - */ -size_t rte_mempool_calc_mem_size_helper(uint32_t elt_num, size_t total_elt_sz, - uint32_t pg_shift); - -/** * Function to be called for each populated object. * * @param[in] mp @@ -974,74 +952,6 @@ rte_mempool_create(const char *name, unsigned n, unsigned elt_size, int socket_id, unsigned flags); /** - * @deprecated - * Create a new mempool named *name* in memory. - * - * The pool contains n elements of elt_size. Its size is set to n. - * This function uses ``memzone_reserve()`` to allocate the mempool header - * (and the objects if vaddr is NULL). - * Depending on the input parameters, mempool elements can be either allocated - * together with the mempool header, or an externally provided memory buffer - * could be used to store mempool objects. In later case, that external - * memory buffer can consist of set of disjoint physical pages. - * - * @param name - * The name of the mempool. - * @param n - * The number of elements in the mempool. The optimum size (in terms of - * memory usage) for a mempool is when n is a power of two minus one: - * n = (2^q - 1). - * @param elt_size - * The size of each element. - * @param cache_size - * Size of the cache. See rte_mempool_create() for details. - * @param private_data_size - * The size of the private data appended after the mempool - * structure. This is useful for storing some private data after the - * mempool structure, as is done for rte_mbuf_pool for example. - * @param mp_init - * A function pointer that is called for initialization of the pool, - * before object initialization. The user can initialize the private - * data in this function if needed. This parameter can be NULL if - * not needed. 
- * @param mp_init_arg - * An opaque pointer to data that can be used in the mempool - * constructor function. - * @param obj_init - * A function called for each object at initialization of the pool. - * See rte_mempool_create() for details. - * @param obj_init_arg - * An opaque pointer passed to the object constructor function. - * @param socket_id - * The *socket_id* argument is the socket identifier in the case of - * NUMA. The value can be *SOCKET_ID_ANY* if there is no NUMA - * constraint for the reserved zone. - * @param flags - * Flags controlling the behavior of the mempool. See - * rte_mempool_create() for details. - * @param vaddr - * Virtual address of the externally allocated memory buffer. - * Will be used to store mempool objects. - * @param iova - * Array of IO addresses of the pages that comprises given memory buffer. - * @param pg_num - * Number of elements in the iova array. - * @param pg_shift - * LOG2 of the physical pages size. - * @return - * The pointer to the new allocated mempool, on success. NULL on error - * with rte_errno set appropriately. See rte_mempool_create() for details. - */ -__rte_deprecated -struct rte_mempool * -rte_mempool_xmem_create(const char *name, unsigned n, unsigned elt_size, - unsigned cache_size, unsigned private_data_size, - rte_mempool_ctor_t *mp_init, void *mp_init_arg, - rte_mempool_obj_cb_t *obj_init, void *obj_init_arg, - int socket_id, unsigned flags, void *vaddr, - const rte_iova_t iova[], uint32_t pg_num, uint32_t pg_shift); - -/** * Create an empty mempool * * The mempool is allocated and initialized, but it is not populated: no @@ -1123,48 +1033,6 @@ int rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr, rte_iova_t iova, size_t len, rte_mempool_memchunk_free_cb_t *free_cb, void *opaque); -__rte_deprecated -int rte_mempool_populate_phys(struct rte_mempool *mp, char *vaddr, - phys_addr_t paddr, size_t len, rte_mempool_memchunk_free_cb_t *free_cb, - void *opaque); - -/** - * @deprecated - * Add physical memory for objects in the pool at init - * - * Add a virtually contiguous memory chunk in the pool where objects can - * be instantiated. The IO addresses corresponding to the virtual - * area are described in iova[], pg_num, pg_shift. - * - * @param mp - * A pointer to the mempool structure. - * @param vaddr - * The virtual address of memory that should be used to store objects. - * @param iova - * An array of IO addresses of each page composing the virtual area. - * @param pg_num - * Number of elements in the iova array. - * @param pg_shift - * LOG2 of the physical pages size. - * @param free_cb - * The callback used to free this chunk when destroying the mempool. - * @param opaque - * An opaque argument passed to free_cb. - * @return - * The number of objects added on success. - * On error, the chunks are not added in the memory list of the - * mempool and a negative errno is returned. 
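For code that used the now-removed rte_mempool_xmem_create() and rte_mempool_populate_*_tab() family, the surviving create-empty-then-populate sequence covers the common case; a hedged migration sketch (sizes and ops name illustrative, not prescribed by this patch):

	struct rte_mempool *mp = rte_mempool_create_empty("pool", 4096, 2048,
		256, 0, rte_socket_id(), 0);

	if (mp != NULL) {
		rte_mempool_set_ops_byname(mp, "ring_mp_mc", NULL);
		if (rte_mempool_populate_default(mp) < 0) {
			rte_mempool_free(mp);
			mp = NULL;
		}
	}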
- */ -__rte_deprecated -int rte_mempool_populate_iova_tab(struct rte_mempool *mp, char *vaddr, - const rte_iova_t iova[], uint32_t pg_num, uint32_t pg_shift, - rte_mempool_memchunk_free_cb_t *free_cb, void *opaque); - -__rte_deprecated -int rte_mempool_populate_phys_tab(struct rte_mempool *mp, char *vaddr, - const phys_addr_t paddr[], uint32_t pg_num, uint32_t pg_shift, - rte_mempool_memchunk_free_cb_t *free_cb, void *opaque); - /** * Add virtually contiguous memory for objects in the pool at init * @@ -1746,13 +1614,6 @@ rte_mempool_virt2iova(const void *elt) return hdr->iova; } -__rte_deprecated -static inline phys_addr_t -rte_mempool_virt2phy(__rte_unused const struct rte_mempool *mp, const void *elt) -{ - return rte_mempool_virt2iova(elt); -} - /** * Check the consistency of mempool objects. * @@ -1822,68 +1683,6 @@ uint32_t rte_mempool_calc_obj_size(uint32_t elt_size, uint32_t flags, struct rte_mempool_objsz *sz); /** - * @deprecated - * Get the size of memory required to store mempool elements. - * - * Calculate the maximum amount of memory required to store given number - * of objects. Assume that the memory buffer will be aligned at page - * boundary. - * - * Note that if object size is bigger than page size, then it assumes - * that pages are grouped in subsets of physically continuous pages big - * enough to store at least one object. - * - * @param elt_num - * Number of elements. - * @param total_elt_sz - * The size of each element, including header and trailer, as returned - * by rte_mempool_calc_obj_size(). - * @param pg_shift - * LOG2 of the physical pages size. If set to 0, ignore page boundaries. - * @param flags - * The mempool flags. - * @return - * Required memory size aligned at page boundary. - */ -__rte_deprecated -size_t rte_mempool_xmem_size(uint32_t elt_num, size_t total_elt_sz, - uint32_t pg_shift, unsigned int flags); - -/** - * @deprecated - * Get the size of memory required to store mempool elements. - * - * Calculate how much memory would be actually required with the given - * memory footprint to store required number of objects. - * - * @param vaddr - * Virtual address of the externally allocated memory buffer. - * Will be used to store mempool objects. - * @param elt_num - * Number of elements. - * @param total_elt_sz - * The size of each element, including header and trailer, as returned - * by rte_mempool_calc_obj_size(). - * @param iova - * Array of IO addresses of the pages that comprises given memory buffer. - * @param pg_num - * Number of elements in the iova array. - * @param pg_shift - * LOG2 of the physical pages size. - * @param flags - * The mempool flags. - * @return - * On success, the number of bytes needed to store given number of - * objects, aligned to the given page size. If the provided memory - * buffer is too small, return a negative value whose absolute value - * is the actual number of elements that can be stored in that buffer. 
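The sizing rule implemented by the removed rte_mempool_xmem_size() survives only inside rte_mempool_op_calc_mem_size_default(), whose new body appears below in this diff; restated as a standalone sketch:

	static size_t
	mem_size_sketch(size_t obj_num, size_t total_elt_sz, unsigned int pg_shift)
	{
		size_t pg_sz, obj_per_page, pg_num;

		if (total_elt_sz == 0)
			return 0;
		if (pg_shift == 0)		/* no page-boundary constraint */
			return total_elt_sz * obj_num;
		pg_sz = (size_t)1 << pg_shift;
		obj_per_page = pg_sz / total_elt_sz;
		if (obj_per_page == 0)		/* object bigger than one page */
			return RTE_ALIGN_CEIL(total_elt_sz, pg_sz) * obj_num;
		pg_num = (obj_num + obj_per_page - 1) / obj_per_page;
		return pg_num << pg_shift;	/* objects never straddle a page */
	}

For example, 1000 objects of total_elt_sz 2500 on 4 KiB pages (pg_shift 12) pack one per page, so the rule reserves 1000 pages.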
- */ -__rte_deprecated -ssize_t rte_mempool_xmem_usage(void *vaddr, uint32_t elt_num, - size_t total_elt_sz, const rte_iova_t iova[], uint32_t pg_num, - uint32_t pg_shift, unsigned int flags); - -/** * Walk list of all memory pools * * @param func diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c index fd63ca13..4e2bfc82 100644 --- a/lib/librte_mempool/rte_mempool_ops_default.c +++ b/lib/librte_mempool/rte_mempool_ops_default.c @@ -12,12 +12,31 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp, size_t *min_chunk_size, size_t *align) { size_t total_elt_sz; + size_t obj_per_page, pg_num, pg_sz; size_t mem_size; total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size; - - mem_size = rte_mempool_calc_mem_size_helper(obj_num, total_elt_sz, - pg_shift); + if (total_elt_sz == 0) { + mem_size = 0; + } else if (pg_shift == 0) { + mem_size = total_elt_sz * obj_num; + } else { + pg_sz = (size_t)1 << pg_shift; + obj_per_page = pg_sz / total_elt_sz; + if (obj_per_page == 0) { + /* + * Note that if object size is bigger than page size, + * then it is assumed that pages are grouped in subsets + * of physically continuous pages big enough to store + * at least one object. + */ + mem_size = + RTE_ALIGN_CEIL(total_elt_sz, pg_sz) * obj_num; + } else { + pg_num = (obj_num + obj_per_page - 1) / obj_per_page; + mem_size = pg_num << pg_shift; + } + } *min_chunk_size = RTE_MAX((size_t)1 << pg_shift, total_elt_sz); diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map index 7091b954..17cbca46 100644 --- a/lib/librte_mempool/rte_mempool_version.map +++ b/lib/librte_mempool/rte_mempool_version.map @@ -8,9 +8,6 @@ DPDK_2.0 { rte_mempool_list_dump; rte_mempool_lookup; rte_mempool_walk; - rte_mempool_xmem_create; - rte_mempool_xmem_size; - rte_mempool_xmem_usage; local: *; }; @@ -34,8 +31,6 @@ DPDK_16.07 { rte_mempool_ops_table; rte_mempool_populate_anon; rte_mempool_populate_default; - rte_mempool_populate_phys; - rte_mempool_populate_phys_tab; rte_mempool_populate_virt; rte_mempool_register_ops; rte_mempool_set_ops_byname; @@ -46,7 +41,6 @@ DPDK_17.11 { global: rte_mempool_populate_iova; - rte_mempool_populate_iova_tab; } DPDK_16.07; diff --git a/lib/librte_meter/rte_meter.c b/lib/librte_meter/rte_meter.c index 59af5ef2..473f69ab 100644 --- a/lib/librte_meter/rte_meter.c +++ b/lib/librte_meter/rte_meter.c @@ -30,7 +30,7 @@ rte_meter_get_tb_params(uint64_t hz, uint64_t rate, uint64_t *tb_period, uint64_ } } -int __rte_experimental +int rte_meter_srtcm_profile_config(struct rte_meter_srtcm_profile *p, struct rte_meter_srtcm_params *params) { @@ -68,7 +68,7 @@ rte_meter_srtcm_config(struct rte_meter_srtcm *m, return 0; } -int __rte_experimental +int rte_meter_trtcm_profile_config(struct rte_meter_trtcm_profile *p, struct rte_meter_trtcm_params *params) { diff --git a/lib/librte_meter/rte_meter.h b/lib/librte_meter/rte_meter.h index 03d80566..58a05158 100644 --- a/lib/librte_meter/rte_meter.h +++ b/lib/librte_meter/rte_meter.h @@ -20,7 +20,6 @@ extern "C" { ***/ #include <stdint.h> -#include <rte_compat.h> /* * Application Programmer's Interface (API) @@ -82,7 +81,7 @@ struct rte_meter_trtcm; * @return * 0 upon success, error code otherwise */ -int __rte_experimental +int rte_meter_srtcm_profile_config(struct rte_meter_srtcm_profile *p, struct rte_meter_srtcm_params *params); @@ -96,7 +95,7 @@ rte_meter_srtcm_profile_config(struct rte_meter_srtcm_profile *p, * @return * 0 upon success, error 
code otherwise */ -int __rte_experimental +int rte_meter_trtcm_profile_config(struct rte_meter_trtcm_profile *p, struct rte_meter_trtcm_params *params); diff --git a/lib/librte_meter/rte_meter_version.map b/lib/librte_meter/rte_meter_version.map index 9215d4cb..cb79f0c2 100644 --- a/lib/librte_meter/rte_meter_version.map +++ b/lib/librte_meter/rte_meter_version.map @@ -11,7 +11,7 @@ DPDK_2.0 { local: *; }; -EXPERIMENTAL { +DPDK_18.08 { global: rte_meter_srtcm_profile_config; diff --git a/lib/librte_metrics/rte_metrics.c b/lib/librte_metrics/rte_metrics.c index 258f0582..99a96b65 100644 --- a/lib/librte_metrics/rte_metrics.c +++ b/lib/librte_metrics/rte_metrics.c @@ -96,6 +96,9 @@ rte_metrics_reg_names(const char * const *names, uint16_t cnt_names) /* Some sanity checks */ if (cnt_names < 1 || names == NULL) return -EINVAL; + for (idx_name = 0; idx_name < cnt_names; idx_name++) + if (names[idx_name] == NULL) + return -EINVAL; memzone = rte_memzone_lookup(RTE_METRICS_MEMZONE_NAME); if (memzone == NULL) @@ -159,6 +162,11 @@ rte_metrics_update_values(int port_id, stats = memzone->addr; rte_spinlock_lock(&stats->lock); + + if (key >= stats->cnt_stats) { + rte_spinlock_unlock(&stats->lock); + return -EINVAL; + } idx_metric = key; cnt_setsize = 1; while (idx_metric < stats->cnt_stats) { @@ -200,9 +208,8 @@ rte_metrics_get_names(struct rte_metric_name *names, int return_value; memzone = rte_memzone_lookup(RTE_METRICS_MEMZONE_NAME); - /* If not allocated, fail silently */ if (memzone == NULL) - return 0; + return -EIO; stats = memzone->addr; rte_spinlock_lock(&stats->lock); @@ -238,9 +245,9 @@ rte_metrics_get_values(int port_id, return -EINVAL; memzone = rte_memzone_lookup(RTE_METRICS_MEMZONE_NAME); - /* If not allocated, fail silently */ if (memzone == NULL) - return 0; + return -EIO; + stats = memzone->addr; rte_spinlock_lock(&stats->lock); diff --git a/lib/librte_net/rte_ip.h b/lib/librte_net/rte_ip.h index 72dc2456..f2a8904a 100644 --- a/lib/librte_net/rte_ip.h +++ b/lib/librte_net/rte_ip.h @@ -108,25 +108,25 @@ __rte_raw_cksum(const void *buf, size_t len, uint32_t sum) /* workaround gcc strict-aliasing warning */ uintptr_t ptr = (uintptr_t)buf; typedef uint16_t __attribute__((__may_alias__)) u16_p; - const u16_p *u16 = (const u16_p *)ptr; - - while (len >= (sizeof(*u16) * 4)) { - sum += u16[0]; - sum += u16[1]; - sum += u16[2]; - sum += u16[3]; - len -= sizeof(*u16) * 4; - u16 += 4; + const u16_p *u16_buf = (const u16_p *)ptr; + + while (len >= (sizeof(*u16_buf) * 4)) { + sum += u16_buf[0]; + sum += u16_buf[1]; + sum += u16_buf[2]; + sum += u16_buf[3]; + len -= sizeof(*u16_buf) * 4; + u16_buf += 4; } - while (len >= sizeof(*u16)) { - sum += *u16; - len -= sizeof(*u16); - u16 += 1; + while (len >= sizeof(*u16_buf)) { + sum += *u16_buf; + len -= sizeof(*u16_buf); + u16_buf += 1; } /* if length is in odd bytes */ if (len == 1) - sum += *((const uint8_t *)u16); + sum += *((const uint8_t *)u16_buf); return sum; } diff --git a/lib/librte_power/channel_commands.h b/lib/librte_power/channel_commands.h index 5e8b4ab5..ee638eef 100644 --- a/lib/librte_power/channel_commands.h +++ b/lib/librte_power/channel_commands.h @@ -48,7 +48,8 @@ enum workload {HIGH, MEDIUM, LOW}; enum policy_to_use { TRAFFIC, TIME, - WORKLOAD + WORKLOAD, + BRANCH_RATIO }; struct traffic { diff --git a/lib/librte_power/power_acpi_cpufreq.c b/lib/librte_power/power_acpi_cpufreq.c index bce933e9..cd5978d5 100644 --- a/lib/librte_power/power_acpi_cpufreq.c +++ b/lib/librte_power/power_acpi_cpufreq.c @@ -623,3 +623,24 @@ 
power_acpi_disable_turbo(unsigned int lcore_id) return 0; } + +int power_acpi_get_capabilities(unsigned int lcore_id, + struct rte_power_core_capabilities *caps) +{ + struct rte_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + if (caps == NULL) { + RTE_LOG(ERR, POWER, "Invalid argument\n"); + return -1; + } + + pi = &lcore_power_info[lcore_id]; + caps->capabilities = 0; + caps->turbo = !!(pi->turbo_available); + + return 0; +} diff --git a/lib/librte_power/power_acpi_cpufreq.h b/lib/librte_power/power_acpi_cpufreq.h index edeeb27a..1af74160 100644 --- a/lib/librte_power/power_acpi_cpufreq.h +++ b/lib/librte_power/power_acpi_cpufreq.h @@ -14,6 +14,7 @@ #include <rte_byteorder.h> #include <rte_log.h> #include <rte_string_fns.h> +#include "rte_power.h" #ifdef __cplusplus extern "C" { @@ -196,6 +197,21 @@ int power_acpi_enable_turbo(unsigned int lcore_id); */ int power_acpi_disable_turbo(unsigned int lcore_id); +/** + * Returns power capabilities for a specific lcore. + * + * @param lcore_id + * lcore id. + * @param caps + * pointer to rte_power_core_capabilities object. + * + * @return + * - 0 on success. + * - Negative on error. + */ +int power_acpi_get_capabilities(unsigned int lcore_id, + struct rte_power_core_capabilities *caps); + #ifdef __cplusplus } #endif diff --git a/lib/librte_power/power_kvm_vm.c b/lib/librte_power/power_kvm_vm.c index 38e9066f..20659b72 100644 --- a/lib/librte_power/power_kvm_vm.c +++ b/lib/librte_power/power_kvm_vm.c @@ -124,3 +124,11 @@ power_kvm_vm_disable_turbo(unsigned int lcore_id) { return send_msg(lcore_id, CPU_POWER_DISABLE_TURBO); } + +struct rte_power_core_capabilities; +int power_kvm_vm_get_capabilities(__rte_unused unsigned int lcore_id, + __rte_unused struct rte_power_core_capabilities *caps) +{ + RTE_LOG(ERR, POWER, "rte_power_get_capabilities is not implemented for Virtual Machine Power Management\n"); + return -ENOTSUP; +} diff --git a/lib/librte_power/power_kvm_vm.h b/lib/librte_power/power_kvm_vm.h index 446d6997..94d4aa12 100644 --- a/lib/librte_power/power_kvm_vm.h +++ b/lib/librte_power/power_kvm_vm.h @@ -14,6 +14,7 @@ #include <rte_byteorder.h> #include <rte_log.h> #include <rte_string_fns.h> +#include "rte_power.h" #ifdef __cplusplus extern "C" { @@ -177,6 +178,22 @@ int power_kvm_vm_enable_turbo(unsigned int lcore_id); * - Negative on error. */ int power_kvm_vm_disable_turbo(unsigned int lcore_id); + +/** + * Returns power capabilities for a specific lcore. + * + * @param lcore_id + * lcore id. + * @param caps + * pointer to rte_power_core_capabilities object. + * + * @return + * - 0 on success. + * - Negative on error. 
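 + *
 + * Caller-side sketch of the generic hook this backs (illustrative only;
 + * under KVM the call simply reports -ENOTSUP, as implemented above):
 + * @code
 + * struct rte_power_core_capabilities caps;
 + *
 + * if (rte_power_get_capabilities(lcore_id, &caps) == 0 && caps.turbo)
 + *     rte_power_freq_enable_turbo(lcore_id);
 + * @endcode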
+ */ +int power_kvm_vm_get_capabilities(unsigned int lcore_id, + struct rte_power_core_capabilities *caps); + #ifdef __cplusplus } #endif diff --git a/lib/librte_power/rte_power.c b/lib/librte_power/rte_power.c index 6c8fb403..208b7919 100644 --- a/lib/librte_power/rte_power.c +++ b/lib/librte_power/rte_power.c @@ -24,6 +24,7 @@ rte_power_freq_change_t rte_power_freq_min = NULL; rte_power_freq_change_t rte_power_turbo_status; rte_power_freq_change_t rte_power_freq_enable_turbo; rte_power_freq_change_t rte_power_freq_disable_turbo; +rte_power_get_capabilities_t rte_power_get_capabilities; int rte_power_set_env(enum power_management_env env) @@ -42,6 +43,7 @@ rte_power_set_env(enum power_management_env env) rte_power_turbo_status = power_acpi_turbo_status; rte_power_freq_enable_turbo = power_acpi_enable_turbo; rte_power_freq_disable_turbo = power_acpi_disable_turbo; + rte_power_get_capabilities = power_acpi_get_capabilities; } else if (env == PM_ENV_KVM_VM) { rte_power_freqs = power_kvm_vm_freqs; rte_power_get_freq = power_kvm_vm_get_freq; @@ -53,6 +55,7 @@ rte_power_set_env(enum power_management_env env) rte_power_turbo_status = power_kvm_vm_turbo_status; rte_power_freq_enable_turbo = power_kvm_vm_enable_turbo; rte_power_freq_disable_turbo = power_kvm_vm_disable_turbo; + rte_power_get_capabilities = power_kvm_vm_get_capabilities; } else { RTE_LOG(ERR, POWER, "Invalid Power Management Environment(%d) set\n", env); diff --git a/lib/librte_power/rte_power.h b/lib/librte_power/rte_power.h index b4b7357b..d70bc0b3 100644 --- a/lib/librte_power/rte_power.h +++ b/lib/librte_power/rte_power.h @@ -247,6 +247,38 @@ extern rte_power_freq_change_t rte_power_freq_enable_turbo; */ extern rte_power_freq_change_t rte_power_freq_disable_turbo; +/** + * Power capabilities summary. + */ +struct rte_power_core_capabilities { + RTE_STD_C11 + union { + uint64_t capabilities; + RTE_STD_C11 + struct { + uint64_t turbo:1; /**< Turbo can be enabled. */ + }; + }; +}; + +/** + * Returns power capabilities for a specific lcore. + * Function pointer definition. Review each environment's + * specific documentation for usage. + * + * @param lcore_id + * lcore id. + * @param caps + * pointer to rte_power_core_capabilities object. + * + * @return + * - 0 on success. + * - Negative on error. + */ +typedef int (*rte_power_get_capabilities_t)(unsigned int lcore_id, + struct rte_power_core_capabilities *caps); + +extern rte_power_get_capabilities_t rte_power_get_capabilities; #ifdef __cplusplus } diff --git a/lib/librte_power/rte_power_version.map b/lib/librte_power/rte_power_version.map index 96dc42ec..dd587dfb 100644 --- a/lib/librte_power/rte_power_version.map +++ b/lib/librte_power/rte_power_version.map @@ -25,4 +25,11 @@ DPDK_17.11 { rte_power_freq_enable_turbo; rte_power_turbo_status; -} DPDK_2.0;
\ No newline at end of file +} DPDK_2.0; + +DPDK_18.08 { + global: + + rte_power_get_capabilities; + +} DPDK_17.11; diff --git a/lib/librte_rawdev/Makefile b/lib/librte_rawdev/Makefile index b9105b06..addb288d 100644 --- a/lib/librte_rawdev/Makefile +++ b/lib/librte_rawdev/Makefile @@ -10,7 +10,6 @@ LIB = librte_rawdev.a LIBABIVER := 1 # build flags -CFLAGS += -DALLOW_EXPERIMENTAL_API CFLAGS += -O3 CFLAGS += $(WERROR_FLAGS) LDLIBS += -lrte_eal diff --git a/lib/librte_rawdev/meson.build b/lib/librte_rawdev/meson.build index dcd37ad4..a20fbdc0 100644 --- a/lib/librte_rawdev/meson.build +++ b/lib/librte_rawdev/meson.build @@ -1,6 +1,5 @@ # SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2018 Intel Corporation -allow_experimental_apis = true sources = files('rte_rawdev.c') headers = files('rte_rawdev.h', 'rte_rawdev_pmd.h') diff --git a/lib/librte_rawdev/rte_rawdev.c b/lib/librte_rawdev/rte_rawdev.c index 284e6aec..62b6b97e 100644 --- a/lib/librte_rawdev/rte_rawdev.c +++ b/lib/librte_rawdev/rte_rawdev.c @@ -46,13 +46,13 @@ static struct rte_rawdev_global rawdev_globals = { struct rte_rawdev_global *rte_rawdev_globals = &rawdev_globals; /* Raw device, northbound API implementation */ -uint8_t __rte_experimental +uint8_t rte_rawdev_count(void) { return rte_rawdev_globals->nb_devs; } -uint16_t __rte_experimental +uint16_t rte_rawdev_get_dev_id(const char *name) { uint16_t i; @@ -69,7 +69,7 @@ rte_rawdev_get_dev_id(const char *name) return -ENODEV; } -int __rte_experimental +int rte_rawdev_socket_id(uint16_t dev_id) { struct rte_rawdev *dev; @@ -80,7 +80,7 @@ rte_rawdev_socket_id(uint16_t dev_id) return dev->socket_id; } -int __rte_experimental +int rte_rawdev_info_get(uint16_t dev_id, struct rte_rawdev_info *dev_info) { struct rte_rawdev *rawdev; @@ -102,7 +102,7 @@ rte_rawdev_info_get(uint16_t dev_id, struct rte_rawdev_info *dev_info) return 0; } -int __rte_experimental +int rte_rawdev_configure(uint16_t dev_id, struct rte_rawdev_info *dev_conf) { struct rte_rawdev *dev; @@ -131,7 +131,7 @@ rte_rawdev_configure(uint16_t dev_id, struct rte_rawdev_info *dev_conf) return diag; } -int __rte_experimental +int rte_rawdev_queue_conf_get(uint16_t dev_id, uint16_t queue_id, rte_rawdev_obj_t queue_conf) @@ -146,7 +146,7 @@ rte_rawdev_queue_conf_get(uint16_t dev_id, return 0; } -int __rte_experimental +int rte_rawdev_queue_setup(uint16_t dev_id, uint16_t queue_id, rte_rawdev_obj_t queue_conf) @@ -160,7 +160,7 @@ rte_rawdev_queue_setup(uint16_t dev_id, return (*dev->dev_ops->queue_setup)(dev, queue_id, queue_conf); } -int __rte_experimental +int rte_rawdev_queue_release(uint16_t dev_id, uint16_t queue_id) { struct rte_rawdev *dev; @@ -172,7 +172,19 @@ rte_rawdev_queue_release(uint16_t dev_id, uint16_t queue_id) return (*dev->dev_ops->queue_release)(dev, queue_id); } -int __rte_experimental +uint16_t +rte_rawdev_queue_count(uint16_t dev_id) +{ + struct rte_rawdev *dev; + + RTE_RAWDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_rawdevs[dev_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_count, -ENOTSUP); + return (*dev->dev_ops->queue_count)(dev); +} + +int rte_rawdev_get_attr(uint16_t dev_id, const char *attr_name, uint64_t *attr_value) @@ -186,7 +198,7 @@ rte_rawdev_get_attr(uint16_t dev_id, return (*dev->dev_ops->attr_get)(dev, attr_name, attr_value); } -int __rte_experimental +int rte_rawdev_set_attr(uint16_t dev_id, const char *attr_name, const uint64_t attr_value) @@ -200,7 +212,7 @@ rte_rawdev_set_attr(uint16_t dev_id, return (*dev->dev_ops->attr_set)(dev, attr_name, attr_value); } 
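The new rte_rawdev_queue_count() above enables a simple application-side iteration over a device's queues; an illustrative sketch (the config type is PMD-specific, my_queue_conf is hypothetical):

	struct my_queue_conf qconf;	/* hypothetical PMD-defined config */
	uint16_t q, nb_queues = rte_rawdev_queue_count(dev_id);

	for (q = 0; q < nb_queues; q++)
		if (rte_rawdev_queue_conf_get(dev_id, q, &qconf) == 0)
			; /* inspect qconf for queue q */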
-int __rte_experimental +int rte_rawdev_enqueue_buffers(uint16_t dev_id, struct rte_rawdev_buf **buffers, unsigned int count, @@ -215,7 +227,7 @@ rte_rawdev_enqueue_buffers(uint16_t dev_id, return (*dev->dev_ops->enqueue_bufs)(dev, buffers, count, context); } -int __rte_experimental +int rte_rawdev_dequeue_buffers(uint16_t dev_id, struct rte_rawdev_buf **buffers, unsigned int count, @@ -230,7 +242,7 @@ rte_rawdev_dequeue_buffers(uint16_t dev_id, return (*dev->dev_ops->dequeue_bufs)(dev, buffers, count, context); } -int __rte_experimental +int rte_rawdev_dump(uint16_t dev_id, FILE *f) { struct rte_rawdev *dev; @@ -251,7 +263,7 @@ xstats_get_count(uint16_t dev_id) return (*dev->dev_ops->xstats_get_names)(dev, NULL, 0); } -int __rte_experimental +int rte_rawdev_xstats_names_get(uint16_t dev_id, struct rte_rawdev_xstats_name *xstats_names, unsigned int size) @@ -274,7 +286,7 @@ rte_rawdev_xstats_names_get(uint16_t dev_id, } /* retrieve rawdev extended statistics */ -int __rte_experimental +int rte_rawdev_xstats_get(uint16_t dev_id, const unsigned int ids[], uint64_t values[], @@ -287,7 +299,7 @@ rte_rawdev_xstats_get(uint16_t dev_id, return (*dev->dev_ops->xstats_get)(dev, ids, values, n); } -uint64_t __rte_experimental +uint64_t rte_rawdev_xstats_by_name_get(uint16_t dev_id, const char *name, unsigned int *id) @@ -306,7 +318,7 @@ rte_rawdev_xstats_by_name_get(uint16_t dev_id, return (*dev->dev_ops->xstats_get_by_name)(dev, name, id); } -int __rte_experimental +int rte_rawdev_xstats_reset(uint16_t dev_id, const uint32_t ids[], uint32_t nb_ids) { @@ -317,7 +329,7 @@ rte_rawdev_xstats_reset(uint16_t dev_id, return (*dev->dev_ops->xstats_reset)(dev, ids, nb_ids); } -int __rte_experimental +int rte_rawdev_firmware_status_get(uint16_t dev_id, rte_rawdev_obj_t status_info) { RTE_RAWDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); @@ -327,7 +339,7 @@ rte_rawdev_firmware_status_get(uint16_t dev_id, rte_rawdev_obj_t status_info) return (*dev->dev_ops->firmware_status_get)(dev, status_info); } -int __rte_experimental +int rte_rawdev_firmware_version_get(uint16_t dev_id, rte_rawdev_obj_t version_info) { RTE_RAWDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); @@ -337,7 +349,7 @@ rte_rawdev_firmware_version_get(uint16_t dev_id, rte_rawdev_obj_t version_info) return (*dev->dev_ops->firmware_version_get)(dev, version_info); } -int __rte_experimental +int rte_rawdev_firmware_load(uint16_t dev_id, rte_rawdev_obj_t firmware_image) { RTE_RAWDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); @@ -350,7 +362,7 @@ rte_rawdev_firmware_load(uint16_t dev_id, rte_rawdev_obj_t firmware_image) return (*dev->dev_ops->firmware_load)(dev, firmware_image); } -int __rte_experimental +int rte_rawdev_firmware_unload(uint16_t dev_id) { RTE_RAWDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); @@ -360,7 +372,7 @@ rte_rawdev_firmware_unload(uint16_t dev_id) return (*dev->dev_ops->firmware_unload)(dev); } -int __rte_experimental +int rte_rawdev_selftest(uint16_t dev_id) { RTE_RAWDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); @@ -370,7 +382,7 @@ rte_rawdev_selftest(uint16_t dev_id) return (*dev->dev_ops->dev_selftest)(); } -int __rte_experimental +int rte_rawdev_start(uint16_t dev_id) { struct rte_rawdev *dev; @@ -397,7 +409,7 @@ rte_rawdev_start(uint16_t dev_id) return 0; } -void __rte_experimental +void rte_rawdev_stop(uint16_t dev_id) { struct rte_rawdev *dev; @@ -419,7 +431,7 @@ rte_rawdev_stop(uint16_t dev_id) dev->started = 0; } -int __rte_experimental +int rte_rawdev_close(uint16_t dev_id) { struct rte_rawdev *dev; @@ -438,7 +450,7 @@ 
rte_rawdev_close(uint16_t dev_id) return (*dev->dev_ops->dev_close)(dev); } -int __rte_experimental +int rte_rawdev_reset(uint16_t dev_id) { struct rte_rawdev *dev; @@ -465,7 +477,7 @@ rte_rawdev_find_free_device_index(void) return RTE_RAWDEV_MAX_DEVS; } -struct rte_rawdev * __rte_experimental +struct rte_rawdev * rte_rawdev_pmd_allocate(const char *name, size_t dev_priv_size, int socket_id) { struct rte_rawdev *rawdev; @@ -506,7 +518,7 @@ rte_rawdev_pmd_allocate(const char *name, size_t dev_priv_size, int socket_id) return rawdev; } -int __rte_experimental +int rte_rawdev_pmd_release(struct rte_rawdev *rawdev) { int ret; @@ -532,10 +544,7 @@ rte_rawdev_pmd_release(struct rte_rawdev *rawdev) return 0; } -RTE_INIT(librawdev_init_log); - -static void -librawdev_init_log(void) +RTE_INIT(librawdev_init_log) { librawdev_logtype = rte_log_register("lib.rawdev"); if (librawdev_logtype >= 0) diff --git a/lib/librte_rawdev/rte_rawdev.h b/lib/librte_rawdev/rte_rawdev.h index 2e14919b..684bfdb8 100644 --- a/lib/librte_rawdev/rte_rawdev.h +++ b/lib/librte_rawdev/rte_rawdev.h @@ -35,7 +35,7 @@ typedef void *rte_rawdev_obj_t; * @return * The total number of usable raw devices. */ -uint8_t __rte_experimental +uint8_t rte_rawdev_count(void); /** @@ -48,7 +48,7 @@ rte_rawdev_count(void); * Returns raw device identifier on success. * - <0: Failure to find named raw device. */ -uint16_t __rte_experimental +uint16_t rte_rawdev_get_dev_id(const char *name); /** @@ -61,7 +61,7 @@ rte_rawdev_get_dev_id(const char *name); * a default of zero if the socket could not be determined. * -(-EINVAL) dev_id value is out of range. */ -int __rte_experimental +int rte_rawdev_socket_id(uint16_t dev_id); /** @@ -84,7 +84,7 @@ struct rte_rawdev_info; * - <0: Error code returned by the driver info get function. * */ -int __rte_experimental +int rte_rawdev_info_get(uint16_t dev_id, struct rte_rawdev_info *dev_info); /** @@ -111,7 +111,7 @@ rte_rawdev_info_get(uint16_t dev_id, struct rte_rawdev_info *dev_info); * - 0: Success, device configured. * - <0: Error code returned by the driver configuration function. */ -int __rte_experimental +int rte_rawdev_configure(uint16_t dev_id, struct rte_rawdev_info *dev_conf); @@ -137,7 +137,7 @@ rte_rawdev_configure(uint16_t dev_id, struct rte_rawdev_info *dev_conf); * @see rte_raw_queue_setup() * */ -int __rte_experimental +int rte_rawdev_queue_conf_get(uint16_t dev_id, uint16_t queue_id, rte_rawdev_obj_t queue_conf); @@ -160,7 +160,7 @@ rte_rawdev_queue_conf_get(uint16_t dev_id, * - 0: Success, raw queue correctly set up. * - <0: raw queue configuration failed */ -int __rte_experimental +int rte_rawdev_queue_setup(uint16_t dev_id, uint16_t queue_id, rte_rawdev_obj_t queue_conf); @@ -180,8 +180,9 @@ rte_rawdev_queue_setup(uint16_t dev_id, * - 0: Success, raw queue released. * - <0: raw queue configuration failed */ -int __rte_experimental +int rte_rawdev_queue_release(uint16_t dev_id, uint16_t queue_id); + /** * Get the number of raw queues on a specific raw device * @@ -190,7 +191,7 @@ rte_rawdev_queue_release(uint16_t dev_id, uint16_t queue_id); * @return * - The number of configured raw queues */ -uint16_t __rte_experimental +uint16_t rte_rawdev_queue_count(uint16_t dev_id); /** @@ -208,7 +209,7 @@ rte_rawdev_queue_count(uint16_t dev_id); * - 0: Success, device started. * < 0: Failure */ -int __rte_experimental +int rte_rawdev_start(uint16_t dev_id); /** @@ -218,7 +219,7 @@ rte_rawdev_start(uint16_t dev_id); * @param dev_id * Raw device identifier. 
*/ -void __rte_experimental +void rte_rawdev_stop(uint16_t dev_id); /** @@ -232,7 +233,7 @@ rte_rawdev_stop(uint16_t dev_id); * - <0 on failure to close device * - (-EAGAIN) if device is busy */ -int __rte_experimental +int rte_rawdev_close(uint16_t dev_id); /** @@ -246,7 +247,7 @@ rte_rawdev_close(uint16_t dev_id); * 0 for sucessful reset, * !0 for failure in resetting */ -int __rte_experimental +int rte_rawdev_reset(uint16_t dev_id); #define RTE_RAWDEV_NAME_MAX_LEN (64) @@ -316,7 +317,7 @@ struct rte_rawdev_buf { * - 0: on success * - <0: on failure. */ -int __rte_experimental +int rte_rawdev_dump(uint16_t dev_id, FILE *f); /** @@ -338,7 +339,7 @@ rte_rawdev_dump(uint16_t dev_id, FILE *f); * 0 for success * !0 Error; attr_value remains untouched in case of error. */ -int __rte_experimental +int rte_rawdev_get_attr(uint16_t dev_id, const char *attr_name, uint64_t *attr_value); @@ -357,7 +358,7 @@ rte_rawdev_get_attr(uint16_t dev_id, * 0 for success * !0 Error */ -int __rte_experimental +int rte_rawdev_set_attr(uint16_t dev_id, const char *attr_name, const uint64_t attr_value); @@ -383,7 +384,7 @@ rte_rawdev_set_attr(uint16_t dev_id, * Whether partial enqueue is failure or success is defined between app * and driver implementation. */ -int __rte_experimental +int rte_rawdev_enqueue_buffers(uint16_t dev_id, struct rte_rawdev_buf **buffers, unsigned int count, @@ -414,7 +415,7 @@ rte_rawdev_enqueue_buffers(uint16_t dev_id, * Whether partial enqueue is failure or success is defined between app * and driver implementation. */ -int __rte_experimental +int rte_rawdev_dequeue_buffers(uint16_t dev_id, struct rte_rawdev_buf **buffers, unsigned int count, @@ -454,7 +455,7 @@ struct rte_rawdev_xstats_name { * -ENODEV for invalid *dev_id* * -ENOTSUP if the device doesn't support this function. */ -int __rte_experimental +int rte_rawdev_xstats_names_get(uint16_t dev_id, struct rte_rawdev_xstats_name *xstats_names, unsigned int size); @@ -478,7 +479,7 @@ rte_rawdev_xstats_names_get(uint16_t dev_id, * -ENODEV for invalid *dev_id* * -ENOTSUP if the device doesn't support this function. */ -int __rte_experimental +int rte_rawdev_xstats_get(uint16_t dev_id, const unsigned int ids[], uint64_t values[], @@ -500,7 +501,7 @@ rte_rawdev_xstats_get(uint16_t dev_id, * - positive value or zero: the stat value * - negative value: -EINVAL if stat not found, -ENOTSUP if not supported. */ -uint64_t __rte_experimental +uint64_t rte_rawdev_xstats_by_name_get(uint16_t dev_id, const char *name, unsigned int *id); @@ -520,7 +521,7 @@ rte_rawdev_xstats_by_name_get(uint16_t dev_id, * - zero: successfully reset the statistics to zero * - negative value: -EINVAL invalid parameters, -ENOTSUP if not supported. */ -int __rte_experimental +int rte_rawdev_xstats_reset(uint16_t dev_id, const uint32_t ids[], uint32_t nb_ids); @@ -539,7 +540,7 @@ rte_rawdev_xstats_reset(uint16_t dev_id, * 0 for success, * !0 for failure, `status_info` argument state is undefined */ -int __rte_experimental +int rte_rawdev_firmware_status_get(uint16_t dev_id, rte_rawdev_obj_t status_info); @@ -557,7 +558,7 @@ rte_rawdev_firmware_status_get(uint16_t dev_id, * 0 for success, * !0 for failure, `version_info` argument state is undefined */ -int __rte_experimental +int rte_rawdev_firmware_version_get(uint16_t dev_id, rte_rawdev_obj_t version_info); @@ -574,7 +575,7 @@ rte_rawdev_firmware_version_get(uint16_t dev_id, * 0 for successful load * !0 for failure to load the provided image, or image incorrect. 
*/ -int __rte_experimental +int rte_rawdev_firmware_load(uint16_t dev_id, rte_rawdev_obj_t firmware_image); /** @@ -586,7 +587,7 @@ rte_rawdev_firmware_load(uint16_t dev_id, rte_rawdev_obj_t firmware_image); * 0 for successful Unload * !0 for failure in unloading */ -int __rte_experimental +int rte_rawdev_firmware_unload(uint16_t dev_id); /** @@ -599,7 +600,7 @@ rte_rawdev_firmware_unload(uint16_t dev_id); * - -ENOTSUP if the device doesn't support selftest * - other values < 0 on failure. */ -int __rte_experimental +int rte_rawdev_selftest(uint16_t dev_id); #ifdef __cplusplus diff --git a/lib/librte_rawdev/rte_rawdev_pmd.h b/lib/librte_rawdev/rte_rawdev_pmd.h index 408adf0f..bb9bbc35 100644 --- a/lib/librte_rawdev/rte_rawdev_pmd.h +++ b/lib/librte_rawdev/rte_rawdev_pmd.h @@ -251,6 +251,24 @@ typedef int (*rawdev_queue_release_t)(struct rte_rawdev *dev, uint16_t queue_id); /** + * Get the number of queues configured on this device. + * + * Another way to fetch this information is to fetch the device configuration. + * But that assumes the device configuration managed by the driver carries + * this information. + * + * This function returns the queue count independently of the configuration, + * which is useful when an iterator over queues needs to be implemented. + * + * @param dev + * Raw device pointer + * @return + * Number of queues; 0 is assumed to be a valid response. + * + */ +typedef uint16_t (*rawdev_queue_count_t)(struct rte_rawdev *dev); + +/** * Enqueue an array of raw buffers to the device. * * Buffer being used is opaque - it can be obtained from mempool or from @@ -506,6 +524,8 @@ struct rte_rawdev_ops { rawdev_queue_setup_t queue_setup; /**< Release an raw queue. */ rawdev_queue_release_t queue_release; + /**< Get the number of queues attached to the device */ + rawdev_queue_count_t queue_count; /**< Enqueue an array of raw buffers to device. */ rawdev_enqueue_bufs_t enqueue_bufs; @@ -556,7 +576,7 @@ struct rte_rawdev_ops { * @return * - Slot in the rte_dev_devices array for a new device; */ -struct rte_rawdev * __rte_experimental +struct rte_rawdev * rte_rawdev_pmd_allocate(const char *name, size_t dev_private_size, int socket_id); @@ -568,7 +588,7 @@ rte_rawdev_pmd_allocate(const char *name, size_t dev_private_size, * @return * - 0 on success, negative on error */ -int __rte_experimental +int rte_rawdev_pmd_release(struct rte_rawdev *rawdev); /** @@ -585,7 +605,7 @@ rte_rawdev_pmd_release(struct rte_rawdev *rawdev); * - Raw device pointer if device is successfully created. * - NULL if device cannot be created.
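On the driver side, the new rawdev_queue_count_t callback is typically trivial; a sketch with a hypothetical private structure:

	static uint16_t
	my_rawdev_queue_count(struct rte_rawdev *dev)
	{
		/* my_rawdev_private and nb_queues are hypothetical PMD state */
		struct my_rawdev_private *priv = dev->dev_private;

		return priv->nb_queues;
	}

It is then wired into the ops table alongside the existing queue callbacks, e.g. .queue_count = my_rawdev_queue_count.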
*/ -struct rte_rawdev * __rte_experimental +struct rte_rawdev * rte_rawdev_pmd_init(const char *name, size_t dev_private_size, int socket_id); @@ -597,7 +617,7 @@ rte_rawdev_pmd_init(const char *name, size_t dev_private_size, * @return * - 0 on success, negative on error */ -int __rte_experimental +int rte_rawdev_pmd_uninit(const char *name); #ifdef __cplusplus diff --git a/lib/librte_rawdev/rte_rawdev_version.map b/lib/librte_rawdev/rte_rawdev_version.map index af4465e2..b61dbff1 100644 --- a/lib/librte_rawdev/rte_rawdev_version.map +++ b/lib/librte_rawdev/rte_rawdev_version.map @@ -1,4 +1,4 @@ -EXPERIMENTAL { +DPDK_18.08 { global: rte_rawdev_close; @@ -16,6 +16,7 @@ EXPERIMENTAL { rte_rawdev_pmd_allocate; rte_rawdev_pmd_release; rte_rawdev_queue_conf_get; + rte_rawdev_queue_count; rte_rawdev_queue_setup; rte_rawdev_queue_release; rte_rawdev_reset; diff --git a/lib/librte_ring/rte_ring.h b/lib/librte_ring/rte_ring.h index 12458225..7a731d07 100644 --- a/lib/librte_ring/rte_ring.h +++ b/lib/librte_ring/rte_ring.h @@ -26,8 +26,9 @@ * - Bulk dequeue. * - Bulk enqueue. * - * Note: the ring implementation is not preemptable. A lcore must not - * be interrupted by another task that uses the same ring. + * Note: the ring implementation is not preemptible. Refer to Programmer's + * guide/Environment Abstraction Layer/Multiple pthread/Known Issues/rte_ring + * for more information. * */ @@ -382,7 +383,7 @@ __rte_ring_do_dequeue(struct rte_ring *r, void **obj_table, uint32_t cons_head, cons_next; uint32_t entries; - n = __rte_ring_move_cons_head(r, is_sc, n, behavior, + n = __rte_ring_move_cons_head(r, (int)is_sc, n, behavior, &cons_head, &cons_next, &entries); if (n == 0) goto end; diff --git a/lib/librte_ring/rte_ring_c11_mem.h b/lib/librte_ring/rte_ring_c11_mem.h index cb3f82b1..94df3c4a 100644 --- a/lib/librte_ring/rte_ring_c11_mem.h +++ b/lib/librte_ring/rte_ring_c11_mem.h @@ -66,14 +66,14 @@ __rte_ring_move_prod_head(struct rte_ring *r, unsigned int is_sp, *old_head = __atomic_load_n(&r->prod.head, __ATOMIC_ACQUIRE); - const uint32_t cons_tail = r->cons.tail; + /* * The subtraction is done between two unsigned 32bits value * (the result is always modulo 32 bits even if we have * *old_head > cons_tail). So 'free_entries' is always between 0 * and capacity (which is < size). */ - *free_entries = (capacity + cons_tail - *old_head); + *free_entries = (capacity + r->cons.tail - *old_head); /* check that we have enough room in ring */ if (unlikely(n > *free_entries)) @@ -133,13 +133,13 @@ __rte_ring_move_cons_head(struct rte_ring *r, int is_sc, n = max; *old_head = __atomic_load_n(&r->cons.head, __ATOMIC_ACQUIRE); - const uint32_t prod_tail = r->prod.tail; + /* The subtraction is done between two unsigned 32bits value * (the result is always modulo 32 bits even if we have * cons_head > prod_tail). So 'entries' is always between 0 * and size(ring)-1. */ - *entries = (prod_tail - *old_head); + *entries = (r->prod.tail - *old_head); /* Set the actual entries for dequeue */ if (n > *entries) diff --git a/lib/librte_security/rte_security.c b/lib/librte_security/rte_security.c index 1e559c99..1954960a 100644 --- a/lib/librte_security/rte_security.c +++ b/lib/librte_security/rte_security.c @@ -1,34 +1,6 @@ -/*- - * BSD LICENSE - * - * Copyright 2017 NXP. - * Copyright(c) 2017 Intel Corporation. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of NXP nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 NXP. + * Copyright(c) 2017 Intel Corporation. */ #include <rte_malloc.h> @@ -91,7 +63,6 @@ rte_security_session_destroy(struct rte_security_ctx *instance, struct rte_security_session *sess) { int ret; - struct rte_mempool *mp = rte_mempool_from_obj(sess); RTE_FUNC_PTR_OR_ERR_RET(*instance->ops->session_destroy, -ENOTSUP); @@ -100,7 +71,7 @@ rte_security_session_destroy(struct rte_security_ctx *instance, ret = instance->ops->session_destroy(instance->device, sess); if (!ret) - rte_mempool_put(mp, (void *)sess); + rte_mempool_put(rte_mempool_from_obj(sess), (void *)sess); return ret; } diff --git a/lib/librte_security/rte_security.h b/lib/librte_security/rte_security.h index afa2861f..b0d1b97e 100644 --- a/lib/librte_security/rte_security.h +++ b/lib/librte_security/rte_security.h @@ -1,34 +1,6 @@ -/*- - * BSD LICENSE - * - * Copyright 2017 NXP. - * Copyright(c) 2017 Intel Corporation. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of NXP nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 NXP. + * Copyright(c) 2017 Intel Corporation. */ #ifndef _RTE_SECURITY_H_ diff --git a/lib/librte_security/rte_security_driver.h b/lib/librte_security/rte_security_driver.h index 0583f889..42f42ffe 100644 --- a/lib/librte_security/rte_security_driver.h +++ b/lib/librte_security/rte_security_driver.h @@ -1,34 +1,6 @@ -/*- - * BSD LICENSE - * - * Copyright(c) 2017 Intel Corporation. All rights reserved. - * Copyright 2017 NXP. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 NXP. + * Copyright(c) 2017 Intel Corporation. */ #ifndef _RTE_SECURITY_DRIVER_H_ diff --git a/lib/librte_vhost/iotlb.c b/lib/librte_vhost/iotlb.c index c11ebcaa..c6354fef 100644 --- a/lib/librte_vhost/iotlb.c +++ b/lib/librte_vhost/iotlb.c @@ -303,6 +303,13 @@ out: return vva; } +void +vhost_user_iotlb_flush_all(struct vhost_virtqueue *vq) +{ + vhost_user_iotlb_cache_remove_all(vq); + vhost_user_iotlb_pending_remove_all(vq); +} + int vhost_user_iotlb_init(struct virtio_net *dev, int vq_index) { @@ -315,8 +322,7 @@ vhost_user_iotlb_init(struct virtio_net *dev, int vq_index) * The cache has already been initialized, * just drop all cached and pending entries. 
*/ - vhost_user_iotlb_cache_remove_all(vq); - vhost_user_iotlb_pending_remove_all(vq); + vhost_user_iotlb_flush_all(vq); } #ifdef RTE_LIBRTE_VHOST_NUMA diff --git a/lib/librte_vhost/iotlb.h b/lib/librte_vhost/iotlb.h index e7083e37..60b9e4c5 100644 --- a/lib/librte_vhost/iotlb.h +++ b/lib/librte_vhost/iotlb.h @@ -73,7 +73,7 @@ void vhost_user_iotlb_pending_insert(struct vhost_virtqueue *vq, uint64_t iova, uint8_t perm); void vhost_user_iotlb_pending_remove(struct vhost_virtqueue *vq, uint64_t iova, uint64_t size, uint8_t perm); - +void vhost_user_iotlb_flush_all(struct vhost_virtqueue *vq); int vhost_user_iotlb_init(struct virtio_net *dev, int vq_index); #endif /* _VHOST_IOTLB_H_ */ diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h index 7f0cb9bc..b02673d4 100644 --- a/lib/librte_vhost/rte_vhost.h +++ b/lib/librte_vhost/rte_vhost.h @@ -58,6 +58,14 @@ extern "C" { #define VHOST_USER_PROTOCOL_F_CRYPTO_SESSION 7 #endif +#ifndef VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD +#define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD 10 +#endif + +#ifndef VHOST_USER_PROTOCOL_F_HOST_NOTIFIER +#define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11 +#endif + /** Indicate whether protocol features negotiation is supported. */ #ifndef VHOST_USER_F_PROTOCOL_FEATURES #define VHOST_USER_F_PROTOCOL_FEATURES 30 diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c index 0399c37b..d6303174 100644 --- a/lib/librte_vhost/socket.c +++ b/lib/librte_vhost/socket.c @@ -853,6 +853,12 @@ rte_vhost_driver_register(const char *path, uint64_t flags) vsocket->supported_features = VIRTIO_NET_SUPPORTED_FEATURES; vsocket->features = VIRTIO_NET_SUPPORTED_FEATURES; + /* Dequeue zero copy can't assure descriptors returned in order */ + if (vsocket->dequeue_zero_copy) { + vsocket->supported_features &= ~(1ULL << VIRTIO_F_IN_ORDER); + vsocket->features &= ~(1ULL << VIRTIO_F_IN_ORDER); + } + if (!(flags & RTE_VHOST_USER_IOMMU_SUPPORT)) { vsocket->supported_features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM); vsocket->features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM); diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c index afded495..3c9be10a 100644 --- a/lib/librte_vhost/vhost.c +++ b/lib/librte_vhost/vhost.c @@ -93,9 +93,12 @@ cleanup_device(struct virtio_net *dev, int destroy) } void -free_vq(struct vhost_virtqueue *vq) +free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq) { - rte_free(vq->shadow_used_ring); + if (vq_is_packed(dev)) + rte_free(vq->shadow_used_packed); + else + rte_free(vq->shadow_used_split); rte_free(vq->batch_copy_elems); rte_mempool_free(vq->iotlb_pool); rte_free(vq); @@ -110,19 +113,16 @@ free_device(struct virtio_net *dev) uint32_t i; for (i = 0; i < dev->nr_vring; i++) - free_vq(dev->virtqueue[i]); + free_vq(dev, dev->virtqueue[i]); rte_free(dev); } -int -vring_translate(struct virtio_net *dev, struct vhost_virtqueue *vq) +static int +vring_translate_split(struct virtio_net *dev, struct vhost_virtqueue *vq) { uint64_t req_size, size; - if (!(dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) - goto out; - req_size = sizeof(struct vring_desc) * vq->size; size = req_size; vq->desc = (struct vring_desc *)(uintptr_t)vhost_iova_to_vva(dev, vq, @@ -153,6 +153,55 @@ vring_translate(struct virtio_net *dev, struct vhost_virtqueue *vq) if (!vq->used || size != req_size) return -1; + return 0; +} + +static int +vring_translate_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + uint64_t req_size, size; + + req_size = sizeof(struct vring_packed_desc) * vq->size; + size = req_size; + 
vq->desc_packed = (struct vring_packed_desc *)(uintptr_t) + vhost_iova_to_vva(dev, vq, vq->ring_addrs.desc_user_addr, + &size, VHOST_ACCESS_RW); + if (!vq->desc_packed || size != req_size) + return -1; + + req_size = sizeof(struct vring_packed_desc_event); + size = req_size; + vq->driver_event = (struct vring_packed_desc_event *)(uintptr_t) + vhost_iova_to_vva(dev, vq, vq->ring_addrs.avail_user_addr, + &size, VHOST_ACCESS_RW); + if (!vq->driver_event || size != req_size) + return -1; + + req_size = sizeof(struct vring_packed_desc_event); + size = req_size; + vq->device_event = (struct vring_packed_desc_event *)(uintptr_t) + vhost_iova_to_vva(dev, vq, vq->ring_addrs.used_user_addr, + &size, VHOST_ACCESS_RW); + if (!vq->device_event || size != req_size) + return -1; + + return 0; +} + +int +vring_translate(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + + if (!(dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) + goto out; + + if (vq_is_packed(dev)) { + if (vring_translate_packed(dev, vq) < 0) + return -1; + } else { + if (vring_translate_split(dev, vq) < 0) + return -1; + } out: vq->access_ok = 1; @@ -234,6 +283,9 @@ alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx) dev->virtqueue[vring_idx] = vq; init_vring_queue(dev, vring_idx); rte_spinlock_init(&vq->access_lock); + vq->avail_wrap_counter = 1; + vq->used_wrap_counter = 1; + vq->signalled_used_valid = false; dev->nr_vring += 1; @@ -268,21 +320,21 @@ vhost_new_device(void) struct virtio_net *dev; int i; - dev = rte_zmalloc(NULL, sizeof(struct virtio_net), 0); - if (dev == NULL) { - RTE_LOG(ERR, VHOST_CONFIG, - "Failed to allocate memory for new dev.\n"); - return -1; - } - for (i = 0; i < MAX_VHOST_DEVICE; i++) { if (vhost_devices[i] == NULL) break; } + if (i == MAX_VHOST_DEVICE) { RTE_LOG(ERR, VHOST_CONFIG, "Failed to find a free slot for new device.\n"); - rte_free(dev); + return -1; + } + + dev = rte_zmalloc(NULL, sizeof(struct virtio_net), 0); + if (dev == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to allocate memory for new dev.\n"); return -1; } @@ -291,10 +343,27 @@ vhost_new_device(void) dev->flags = VIRTIO_DEV_BUILTIN_VIRTIO_NET; dev->slave_req_fd = -1; dev->vdpa_dev_id = -1; + rte_spinlock_init(&dev->slave_req_lock); return i; } +void +vhost_destroy_device_notify(struct virtio_net *dev) +{ + struct rte_vdpa_device *vdpa_dev; + int did; + + if (dev->flags & VIRTIO_DEV_RUNNING) { + did = dev->vdpa_dev_id; + vdpa_dev = rte_vdpa_get_device(did); + if (vdpa_dev && vdpa_dev->ops->dev_close) + vdpa_dev->ops->dev_close(dev->vid); + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } +} + /* * Invoked when there is the vhost-user connection is broken (when * the virtio device is being detached). 
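Two details in the vhost.c hunks above are easy to miss: vhost_new_device() now scans vhost_devices[] for a free slot before calling rte_zmalloc(), so a full device table no longer pays for an allocate-then-free round trip, and the new vhost_destroy_device_notify() factors out the close/notify sequence that several callers previously duplicated. Below is a minimal sketch of the reordered allocation pattern; slot_table, MAX_SLOTS and the calloc() payload are hypothetical stand-ins for vhost_devices, MAX_VHOST_DEVICE and the virtio_net allocation, not DPDK APIs.

#include <stdlib.h>

#define MAX_SLOTS 1024                /* stand-in for MAX_VHOST_DEVICE */

static void *slot_table[MAX_SLOTS];   /* stand-in for vhost_devices[] */

static int
slot_alloc(void)
{
	int i;

	/* Reserve a slot first: a full table fails cheaply, with
	 * nothing to free on the error path. */
	for (i = 0; i < MAX_SLOTS; i++) {
		if (slot_table[i] == NULL)
			break;
	}
	if (i == MAX_SLOTS)
		return -1;

	/* Only now pay for the allocation. */
	slot_table[i] = calloc(1, sizeof(long));
	if (slot_table[i] == NULL)
		return -1;

	return i;
}

Checking capacity before allocating also keeps the error path trivial, which is exactly what the reordering in vhost_new_device() buys.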
@@ -303,20 +372,11 @@ void vhost_destroy_device(int vid) { struct virtio_net *dev = get_device(vid); - struct rte_vdpa_device *vdpa_dev; - int did = -1; if (dev == NULL) return; - if (dev->flags & VIRTIO_DEV_RUNNING) { - did = dev->vdpa_dev_id; - vdpa_dev = rte_vdpa_get_device(did); - if (vdpa_dev && vdpa_dev->ops->dev_close) - vdpa_dev->ops->dev_close(dev->vid); - dev->flags &= ~VIRTIO_DEV_RUNNING; - dev->notify_ops->destroy_device(vid); - } + vhost_destroy_device_notify(dev); cleanup_device(dev, 1); free_device(dev); @@ -346,6 +406,8 @@ vhost_detach_vdpa_device(int vid) if (dev == NULL) return; + vhost_user_host_notifier_ctrl(vid, false); + dev->vdpa_dev_id = -1; } @@ -558,7 +620,11 @@ rte_vhost_vring_call(int vid, uint16_t vring_idx) if (!vq) return -1; - vhost_vring_call(dev, vq); + if (vq_is_packed(dev)) + vhost_vring_call_packed(dev, vq); + else + vhost_vring_call_split(dev, vq); + return 0; } @@ -579,19 +645,52 @@ rte_vhost_avail_entries(int vid, uint16_t queue_id) return *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx; } +static inline void +vhost_enable_notify_split(struct vhost_virtqueue *vq, int enable) +{ + if (enable) + vq->used->flags &= ~VRING_USED_F_NO_NOTIFY; + else + vq->used->flags |= VRING_USED_F_NO_NOTIFY; +} + +static inline void +vhost_enable_notify_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, int enable) +{ + uint16_t flags; + + if (!enable) + vq->device_event->flags = VRING_EVENT_F_DISABLE; + + flags = VRING_EVENT_F_ENABLE; + if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) { + flags = VRING_EVENT_F_DESC; + vq->device_event->off_wrap = vq->last_avail_idx | + vq->avail_wrap_counter << 15; + } + + rte_smp_wmb(); + + vq->device_event->flags = flags; +} + int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable) { struct virtio_net *dev = get_device(vid); + struct vhost_virtqueue *vq; if (!dev) return -1; - if (enable) - dev->virtqueue[queue_id]->used->flags &= - ~VRING_USED_F_NO_NOTIFY; + vq = dev->virtqueue[queue_id]; + + if (vq_is_packed(dev)) + vhost_enable_notify_packed(dev, vq, enable); else - dev->virtqueue[queue_id]->used->flags |= VRING_USED_F_NO_NOTIFY; + vhost_enable_notify_split(vq, enable); + return 0; } diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index 58c425a5..760a09c0 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -43,6 +43,7 @@ * from vring to do scatter RX. */ struct buf_vector { + uint64_t buf_iova; uint64_t buf_addr; uint32_t buf_len; uint32_t desc_idx; @@ -55,6 +56,7 @@ struct buf_vector { struct zcopy_mbuf { struct rte_mbuf *mbuf; uint32_t desc_idx; + uint16_t desc_count; uint16_t in_use; TAILQ_ENTRY(zcopy_mbuf) next; @@ -79,19 +81,35 @@ struct log_cache_entry { unsigned long val; }; +struct vring_used_elem_packed { + uint16_t id; + uint32_t len; + uint32_t count; +}; + /** * Structure contains variables relevant to RX/TX virtqueues. */ struct vhost_virtqueue { - struct vring_desc *desc; - struct vring_avail *avail; - struct vring_used *used; + union { + struct vring_desc *desc; + struct vring_packed_desc *desc_packed; + }; + union { + struct vring_avail *avail; + struct vring_packed_desc_event *driver_event; + }; + union { + struct vring_used *used; + struct vring_packed_desc_event *device_event; + }; uint32_t size; uint16_t last_avail_idx; uint16_t last_used_idx; /* Last used index we notify to front end. 
*/ uint16_t signalled_used; + bool signalled_used_valid; #define VIRTIO_INVALID_EVENTFD (-1) #define VIRTIO_UNINITIALIZED_EVENTFD (-2) @@ -115,12 +133,17 @@ struct vhost_virtqueue { struct zcopy_mbuf *zmbufs; struct zcopy_mbuf_list zmbuf_list; - struct vring_used_elem *shadow_used_ring; + union { + struct vring_used_elem *shadow_used_split; + struct vring_used_elem_packed *shadow_used_packed; + }; uint16_t shadow_used_idx; struct vhost_vring_addr ring_addrs; struct batch_copy_elem *batch_copy_elems; uint16_t batch_copy_nb_elems; + bool used_wrap_counter; + bool avail_wrap_counter; struct log_cache_entry log_cache[VHOST_LOG_CACHE_NR]; uint16_t log_cache_nb_elem; @@ -191,6 +214,42 @@ struct vhost_msg { #define VIRTIO_F_VERSION_1 32 #endif +/* Declare packed ring related bits for older kernels */ +#ifndef VIRTIO_F_RING_PACKED + +#define VIRTIO_F_RING_PACKED 34 + +#define VRING_DESC_F_NEXT 1 +#define VRING_DESC_F_WRITE 2 +#define VRING_DESC_F_INDIRECT 4 + +#define VRING_DESC_F_AVAIL (1ULL << 7) +#define VRING_DESC_F_USED (1ULL << 15) + +struct vring_packed_desc { + uint64_t addr; + uint32_t len; + uint16_t id; + uint16_t flags; +}; + +#define VRING_EVENT_F_ENABLE 0x0 +#define VRING_EVENT_F_DISABLE 0x1 +#define VRING_EVENT_F_DESC 0x2 + +struct vring_packed_desc_event { + uint16_t off_wrap; + uint16_t flags; +}; +#endif + +/* + * Available and used descs are in same order + */ +#ifndef VIRTIO_F_IN_ORDER +#define VIRTIO_F_IN_ORDER 35 +#endif + /* Features supported by this builtin vhost-user net driver. */ #define VIRTIO_NET_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \ (1ULL << VIRTIO_F_ANY_LAYOUT) | \ @@ -214,7 +273,8 @@ struct vhost_msg { (1ULL << VIRTIO_NET_F_GUEST_ECN) | \ (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | \ (1ULL << VIRTIO_RING_F_EVENT_IDX) | \ - (1ULL << VIRTIO_NET_F_MTU) | \ + (1ULL << VIRTIO_NET_F_MTU) | \ + (1ULL << VIRTIO_F_IN_ORDER) | \ (1ULL << VIRTIO_F_IOMMU_PLATFORM)) @@ -301,6 +361,7 @@ struct virtio_net { struct guest_page *guest_pages; int slave_req_fd; + rte_spinlock_t slave_req_lock; /* * Device id to identify a specific backend device. 
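The packed-ring notification code introduced in this file (vhost_enable_notify_packed() earlier, vhost_vring_call_packed() just below) multiplexes a descriptor index and the expected wrap counter into the single 16-bit off_wrap field: bits 0-14 hold the index and bit 15 the wrap bit, matching "vq->last_avail_idx | vq->avail_wrap_counter << 15" on the write side and "off_wrap & ~(1 << 15)" on the read side. Here is a self-contained sketch of that encoding; the helper names are illustrative, not DPDK APIs.

#include <stdbool.h>
#include <stdint.h>

/* Pack a descriptor index and wrap counter into one 16-bit field. */
static inline uint16_t
off_wrap_encode(uint16_t desc_idx, bool wrap_counter)
{
	return (uint16_t)((desc_idx & 0x7fff) |
			  ((uint16_t)wrap_counter << 15));
}

/* Unpack them again; mirrors the "off = off_wrap & ~(1 << 15)" and
 * "off_wrap >> 15" reads in vhost_vring_call_packed(). */
static inline void
off_wrap_decode(uint16_t off_wrap, uint16_t *desc_idx, bool *wrap_counter)
{
	*desc_idx = off_wrap & 0x7fff;
	*wrap_counter = (off_wrap >> 15) != 0;
}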
@@ -314,6 +375,19 @@ struct virtio_net { struct vhost_user_extern_ops extern_ops; } __rte_cache_aligned; +static __rte_always_inline bool +vq_is_packed(struct virtio_net *dev) +{ + return dev->features & (1ull << VIRTIO_F_RING_PACKED); +} + +static inline bool +desc_is_avail(struct vring_packed_desc *desc, bool wrap_counter) +{ + return wrap_counter == !!(desc->flags & VRING_DESC_F_AVAIL) && + wrap_counter != !!(desc->flags & VRING_DESC_F_USED); +} + #define VHOST_LOG_PAGE 4096 /* @@ -428,6 +502,7 @@ vhost_log_cache_page(struct virtio_net *dev, struct vhost_virtqueue *vq, vq->log_cache[i].offset = offset; vq->log_cache[i].val = (1UL << bit_nr); + vq->log_cache_nb_elem++; } static __rte_always_inline void @@ -535,9 +610,10 @@ int vhost_new_device(void); void cleanup_device(struct virtio_net *dev, int destroy); void reset_device(struct virtio_net *dev); void vhost_destroy_device(int); +void vhost_destroy_device_notify(struct virtio_net *dev); void cleanup_vq(struct vhost_virtqueue *vq, int destroy); -void free_vq(struct vhost_virtqueue *vq); +void free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq); int alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx); @@ -588,10 +664,10 @@ vhost_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old) } static __rte_always_inline void -vhost_vring_call(struct virtio_net *dev, struct vhost_virtqueue *vq) +vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq) { /* Flush used->idx update before we read avail->flags. */ - rte_mb(); + rte_smp_mb(); /* Don't kick guest if we don't reach index specified by guest. */ if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) { @@ -615,4 +691,55 @@ vhost_vring_call(struct virtio_net *dev, struct vhost_virtqueue *vq) } } +static __rte_always_inline void +vhost_vring_call_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + uint16_t old, new, off, off_wrap; + bool signalled_used_valid, kick = false; + + /* Flush used desc update. 
*/ + rte_smp_mb(); + + if (!(dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX))) { + if (vq->driver_event->flags != + VRING_EVENT_F_DISABLE) + kick = true; + goto kick; + } + + old = vq->signalled_used; + new = vq->last_used_idx; + vq->signalled_used = new; + signalled_used_valid = vq->signalled_used_valid; + vq->signalled_used_valid = true; + + if (vq->driver_event->flags != VRING_EVENT_F_DESC) { + if (vq->driver_event->flags != VRING_EVENT_F_DISABLE) + kick = true; + goto kick; + } + + if (unlikely(!signalled_used_valid)) { + kick = true; + goto kick; + } + + rte_smp_rmb(); + + off_wrap = vq->driver_event->off_wrap; + off = off_wrap & ~(1 << 15); + + if (new <= old) + old -= vq->size; + + if (vq->used_wrap_counter != off_wrap >> 15) + off -= vq->size; + + if (vhost_need_event(off, new, old)) + kick = true; +kick: + if (kick) + eventfd_write(vq->callfd, (eventfd_t)1); +} + #endif /* _VHOST_NET_CDEV_H_ */ diff --git a/lib/librte_vhost/vhost_crypto.c b/lib/librte_vhost/vhost_crypto.c index f1650738..57341ef8 100644 --- a/lib/librte_vhost/vhost_crypto.c +++ b/lib/librte_vhost/vhost_crypto.c @@ -940,8 +940,7 @@ vhost_crypto_process_one_req(struct vhost_crypto *vcrypto, struct vhost_virtqueue *vq, struct rte_crypto_op *op, struct vring_desc *head, uint16_t desc_idx) { - struct vhost_crypto_data_req *vc_req = RTE_PTR_ADD(op->sym->m_src, - sizeof(struct rte_mbuf)); + struct vhost_crypto_data_req *vc_req = rte_mbuf_to_priv(op->sym->m_src); struct rte_cryptodev_sym_session *session; struct virtio_crypto_op_data_req *req, tmp_req; struct virtio_crypto_inhdr *inhdr; @@ -1062,8 +1061,7 @@ vhost_crypto_finalize_one_request(struct rte_crypto_op *op, { struct rte_mbuf *m_src = op->sym->m_src; struct rte_mbuf *m_dst = op->sym->m_dst; - struct vhost_crypto_data_req *vc_req = RTE_PTR_ADD(m_src, - sizeof(struct rte_mbuf)); + struct vhost_crypto_data_req *vc_req = rte_mbuf_to_priv(m_src); uint16_t desc_idx; int ret = 0; diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c index 947290fc..a2d4c9ff 100644 --- a/lib/librte_vhost/vhost_user.c +++ b/lib/librte_vhost/vhost_user.c @@ -135,17 +135,7 @@ vhost_user_set_owner(void) static int vhost_user_reset_owner(struct virtio_net *dev) { - struct rte_vdpa_device *vdpa_dev; - int did = -1; - - if (dev->flags & VIRTIO_DEV_RUNNING) { - did = dev->vdpa_dev_id; - vdpa_dev = rte_vdpa_get_device(did); - if (vdpa_dev && vdpa_dev->ops->dev_close) - vdpa_dev->ops->dev_close(dev->vid); - dev->flags &= ~VIRTIO_DEV_RUNNING; - dev->notify_ops->destroy_device(dev->vid); - } + vhost_destroy_device_notify(dev); cleanup_device(dev, 0); reset_device(dev); @@ -243,7 +233,7 @@ vhost_user_set_features(struct virtio_net *dev, uint64_t features) dev->virtqueue[dev->nr_vring] = NULL; cleanup_vq(vq, 1); - free_vq(vq); + free_vq(dev, vq); } } @@ -292,13 +282,26 @@ vhost_user_set_vring_num(struct virtio_net *dev, TAILQ_INIT(&vq->zmbuf_list); } - vq->shadow_used_ring = rte_malloc(NULL, + if (vq_is_packed(dev)) { + vq->shadow_used_packed = rte_malloc(NULL, + vq->size * + sizeof(struct vring_used_elem_packed), + RTE_CACHE_LINE_SIZE); + if (!vq->shadow_used_packed) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to allocate memory for shadow used ring.\n"); + return -1; + } + + } else { + vq->shadow_used_split = rte_malloc(NULL, vq->size * sizeof(struct vring_used_elem), RTE_CACHE_LINE_SIZE); - if (!vq->shadow_used_ring) { - RTE_LOG(ERR, VHOST_CONFIG, - "failed to allocate memory for shadow used ring.\n"); - return -1; + if (!vq->shadow_used_split) { + RTE_LOG(ERR, VHOST_CONFIG, + 
"failed to allocate memory for shadow used ring.\n"); + return -1; + } } vq->batch_copy_elems = rte_malloc(NULL, @@ -325,7 +328,8 @@ numa_realloc(struct virtio_net *dev, int index) struct virtio_net *old_dev; struct vhost_virtqueue *old_vq, *vq; struct zcopy_mbuf *new_zmbuf; - struct vring_used_elem *new_shadow_used_ring; + struct vring_used_elem *new_shadow_used_split; + struct vring_used_elem_packed *new_shadow_used_packed; struct batch_copy_elem *new_batch_copy_elems; int ret; @@ -360,13 +364,26 @@ numa_realloc(struct virtio_net *dev, int index) vq->zmbufs = new_zmbuf; } - new_shadow_used_ring = rte_malloc_socket(NULL, - vq->size * sizeof(struct vring_used_elem), - RTE_CACHE_LINE_SIZE, - newnode); - if (new_shadow_used_ring) { - rte_free(vq->shadow_used_ring); - vq->shadow_used_ring = new_shadow_used_ring; + if (vq_is_packed(dev)) { + new_shadow_used_packed = rte_malloc_socket(NULL, + vq->size * + sizeof(struct vring_used_elem_packed), + RTE_CACHE_LINE_SIZE, + newnode); + if (new_shadow_used_packed) { + rte_free(vq->shadow_used_packed); + vq->shadow_used_packed = new_shadow_used_packed; + } + } else { + new_shadow_used_split = rte_malloc_socket(NULL, + vq->size * + sizeof(struct vring_used_elem), + RTE_CACHE_LINE_SIZE, + newnode); + if (new_shadow_used_split) { + rte_free(vq->shadow_used_split); + vq->shadow_used_split = new_shadow_used_split; + } } new_batch_copy_elems = rte_malloc_socket(NULL, @@ -477,6 +494,51 @@ translate_ring_addresses(struct virtio_net *dev, int vq_index) struct vhost_vring_addr *addr = &vq->ring_addrs; uint64_t len; + if (vq_is_packed(dev)) { + len = sizeof(struct vring_packed_desc) * vq->size; + vq->desc_packed = (struct vring_packed_desc *)(uintptr_t) + ring_addr_to_vva(dev, vq, addr->desc_user_addr, &len); + vq->log_guest_addr = 0; + if (vq->desc_packed == NULL || + len != sizeof(struct vring_packed_desc) * + vq->size) { + RTE_LOG(DEBUG, VHOST_CONFIG, + "(%d) failed to map desc_packed ring.\n", + dev->vid); + return dev; + } + + dev = numa_realloc(dev, vq_index); + vq = dev->virtqueue[vq_index]; + addr = &vq->ring_addrs; + + len = sizeof(struct vring_packed_desc_event); + vq->driver_event = (struct vring_packed_desc_event *) + (uintptr_t)ring_addr_to_vva(dev, + vq, addr->avail_user_addr, &len); + if (vq->driver_event == NULL || + len != sizeof(struct vring_packed_desc_event)) { + RTE_LOG(DEBUG, VHOST_CONFIG, + "(%d) failed to find driver area address.\n", + dev->vid); + return dev; + } + + len = sizeof(struct vring_packed_desc_event); + vq->device_event = (struct vring_packed_desc_event *) + (uintptr_t)ring_addr_to_vva(dev, + vq, addr->used_user_addr, &len); + if (vq->device_event == NULL || + len != sizeof(struct vring_packed_desc_event)) { + RTE_LOG(DEBUG, VHOST_CONFIG, + "(%d) failed to find device area address.\n", + dev->vid); + return dev; + } + + return dev; + } + /* The addresses are converted from QEMU virtual to Vhost virtual. 
*/ if (vq->desc && vq->avail && vq->used) return dev; @@ -751,6 +813,11 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *pmsg) dev->mem = NULL; } + /* Flush IOTLB cache as previous HVAs are now invalid */ + if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) + for (i = 0; i < dev->nr_vring; i++) + vhost_user_iotlb_flush_all(dev->virtqueue[i]); + dev->nr_guest_pages = 0; if (!dev->guest_pages) { dev->max_guest_pages = 8; @@ -885,10 +952,20 @@ err_mmap: return -1; } -static int -vq_is_ready(struct vhost_virtqueue *vq) +static bool +vq_is_ready(struct virtio_net *dev, struct vhost_virtqueue *vq) { - return vq && vq->desc && vq->avail && vq->used && + bool rings_ok; + + if (!vq) + return false; + + if (vq_is_packed(dev)) + rings_ok = !!vq->desc_packed; + else + rings_ok = vq->desc && vq->avail && vq->used; + + return rings_ok && vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD && vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD; } @@ -905,7 +982,7 @@ virtio_is_ready(struct virtio_net *dev) for (i = 0; i < dev->nr_vring; i++) { vq = dev->virtqueue[i]; - if (!vq_is_ready(vq)) + if (!vq_is_ready(dev, vq)) return 0; } @@ -996,18 +1073,9 @@ vhost_user_get_vring_base(struct virtio_net *dev, VhostUserMsg *msg) { struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index]; - struct rte_vdpa_device *vdpa_dev; - int did = -1; /* We have to stop the queue (virtio) if it is running. */ - if (dev->flags & VIRTIO_DEV_RUNNING) { - did = dev->vdpa_dev_id; - vdpa_dev = rte_vdpa_get_device(did); - if (vdpa_dev && vdpa_dev->ops->dev_close) - vdpa_dev->ops->dev_close(dev->vid); - dev->flags &= ~VIRTIO_DEV_RUNNING; - dev->notify_ops->destroy_device(dev->vid); - } + vhost_destroy_device_notify(dev); dev->flags &= ~VIRTIO_DEV_READY; dev->flags &= ~VIRTIO_DEV_VDPA_CONFIGURED; @@ -1035,8 +1103,13 @@ vhost_user_get_vring_base(struct virtio_net *dev, if (dev->dequeue_zero_copy) free_zmbufs(vq); - rte_free(vq->shadow_used_ring); - vq->shadow_used_ring = NULL; + if (vq_is_packed(dev)) { + rte_free(vq->shadow_used_packed); + vq->shadow_used_packed = NULL; + } else { + rte_free(vq->shadow_used_split); + vq->shadow_used_split = NULL; + } rte_free(vq->batch_copy_elems); vq->batch_copy_elems = NULL; @@ -1384,6 +1457,22 @@ send_vhost_reply(int sockfd, struct VhostUserMsg *msg) return send_vhost_message(sockfd, msg, NULL, 0); } +static int +send_vhost_slave_message(struct virtio_net *dev, struct VhostUserMsg *msg, + int *fds, int fd_num) +{ + int ret; + + if (msg->flags & VHOST_USER_NEED_REPLY) + rte_spinlock_lock(&dev->slave_req_lock); + + ret = send_vhost_message(dev->slave_req_fd, msg, fds, fd_num); + if (ret < 0 && (msg->flags & VHOST_USER_NEED_REPLY)) + rte_spinlock_unlock(&dev->slave_req_lock); + + return ret; +} + /* * Allocate a queue pair if it hasn't been allocated yet */ @@ -1705,11 +1794,45 @@ skip_to_reply: if (vdpa_dev->ops->dev_conf) vdpa_dev->ops->dev_conf(dev->vid); dev->flags |= VIRTIO_DEV_VDPA_CONFIGURED; + if (vhost_user_host_notifier_ctrl(dev->vid, true) != 0) { + RTE_LOG(INFO, VHOST_CONFIG, + "(%d) software relay is used for vDPA, performance may be low.\n", + dev->vid); + } } return 0; } +static int process_slave_message_reply(struct virtio_net *dev, + const VhostUserMsg *msg) +{ + VhostUserMsg msg_reply; + int ret; + + if ((msg->flags & VHOST_USER_NEED_REPLY) == 0) + return 0; + + if (read_vhost_message(dev->slave_req_fd, &msg_reply) < 0) { + ret = -1; + goto out; + } + + if (msg_reply.request.slave != msg->request.slave) { + RTE_LOG(ERR, VHOST_CONFIG, + "Received unexpected msg 
type (%u), expected %u\n", + msg_reply.request.slave, msg->request.slave); + ret = -1; + goto out; + } + + ret = msg_reply.payload.u64 ? -1 : 0; + +out: + rte_spinlock_unlock(&dev->slave_req_lock); + return ret; +} + int vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm) { @@ -1735,3 +1858,101 @@ vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm) return 0; } + +static int vhost_user_slave_set_vring_host_notifier(struct virtio_net *dev, + int index, int fd, + uint64_t offset, + uint64_t size) +{ + int *fdp = NULL; + size_t fd_num = 0; + int ret; + struct VhostUserMsg msg = { + .request.slave = VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG, + .flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY, + .size = sizeof(msg.payload.area), + .payload.area = { + .u64 = index & VHOST_USER_VRING_IDX_MASK, + .size = size, + .offset = offset, + }, + }; + + if (fd < 0) + msg.payload.area.u64 |= VHOST_USER_VRING_NOFD_MASK; + else { + fdp = &fd; + fd_num = 1; + } + + ret = send_vhost_slave_message(dev, &msg, fdp, fd_num); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to set host notifier (%d)\n", ret); + return ret; + } + + return process_slave_message_reply(dev, &msg); +} + +int vhost_user_host_notifier_ctrl(int vid, bool enable) +{ + struct virtio_net *dev; + struct rte_vdpa_device *vdpa_dev; + int vfio_device_fd, did, ret = 0; + uint64_t offset, size; + unsigned int i; + + dev = get_device(vid); + if (!dev) + return -ENODEV; + + did = dev->vdpa_dev_id; + if (did < 0) + return -EINVAL; + + if (!(dev->features & (1ULL << VIRTIO_F_VERSION_1)) || + !(dev->features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES)) || + !(dev->protocol_features & + (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ)) || + !(dev->protocol_features & + (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD)) || + !(dev->protocol_features & + (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER))) + return -ENOTSUP; + + vdpa_dev = rte_vdpa_get_device(did); + if (!vdpa_dev) + return -ENODEV; + + RTE_FUNC_PTR_OR_ERR_RET(vdpa_dev->ops->get_vfio_device_fd, -ENOTSUP); + RTE_FUNC_PTR_OR_ERR_RET(vdpa_dev->ops->get_notify_area, -ENOTSUP); + + vfio_device_fd = vdpa_dev->ops->get_vfio_device_fd(vid); + if (vfio_device_fd < 0) + return -ENOTSUP; + + if (enable) { + for (i = 0; i < dev->nr_vring; i++) { + if (vdpa_dev->ops->get_notify_area(vid, i, &offset, + &size) < 0) { + ret = -ENOTSUP; + goto disable; + } + + if (vhost_user_slave_set_vring_host_notifier(dev, i, + vfio_device_fd, offset, size) < 0) { + ret = -EFAULT; + goto disable; + } + } + } else { +disable: + for (i = 0; i < dev->nr_vring; i++) { + vhost_user_slave_set_vring_host_notifier(dev, i, -1, + 0, 0); + } + } + + return ret; +} diff --git a/lib/librte_vhost/vhost_user.h b/lib/librte_vhost/vhost_user.h index 1ad5cf46..42166adf 100644 --- a/lib/librte_vhost/vhost_user.h +++ b/lib/librte_vhost/vhost_user.h @@ -20,7 +20,9 @@ (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK) | \ (1ULL << VHOST_USER_PROTOCOL_F_NET_MTU) | \ (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ) | \ - (1ULL << VHOST_USER_PROTOCOL_F_CRYPTO_SESSION)) + (1ULL << VHOST_USER_PROTOCOL_F_CRYPTO_SESSION) | \ + (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD) | \ + (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER)) typedef enum VhostUserRequest { VHOST_USER_NONE = 0, @@ -54,6 +56,7 @@ typedef enum VhostUserRequest { typedef enum VhostUserSlaveRequest { VHOST_USER_SLAVE_NONE = 0, VHOST_USER_SLAVE_IOTLB_MSG = 1, + VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3, VHOST_USER_SLAVE_MAX } VhostUserSlaveRequest; @@ -99,6 +102,12 
@@ typedef struct VhostUserCryptoSessionParam { uint8_t auth_key_buf[VHOST_USER_CRYPTO_MAX_HMAC_KEY_LENGTH]; } VhostUserCryptoSessionParam; +typedef struct VhostUserVringArea { + uint64_t u64; + uint64_t size; + uint64_t offset; +} VhostUserVringArea; + typedef struct VhostUserMsg { union { uint32_t master; /* a VhostUserRequest value */ @@ -120,6 +129,7 @@ typedef struct VhostUserMsg { VhostUserLog log; struct vhost_iotlb_msg iotlb; VhostUserCryptoSessionParam crypto_session; + VhostUserVringArea area; } payload; int fds[VHOST_MEMORY_MAX_NREGIONS]; } __attribute((packed)) VhostUserMsg; @@ -133,6 +143,7 @@ typedef struct VhostUserMsg { /* vhost_user.c */ int vhost_user_msg_handler(int vid, int fd); int vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm); +int vhost_user_host_notifier_ctrl(int vid, bool enable); /* socket.c */ int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num); diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 76ec5f08..99c7afc8 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -25,22 +25,27 @@ #define MAX_BATCH_LEN 256 +static __rte_always_inline bool +rxvq_is_mergeable(struct virtio_net *dev) +{ + return dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF); +} + static bool is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring) { return (is_tx ^ (idx & 1)) == 0 && idx < nr_vring; } -static __rte_always_inline struct vring_desc * +static __rte_always_inline void * alloc_copy_ind_table(struct virtio_net *dev, struct vhost_virtqueue *vq, - struct vring_desc *desc) + uint64_t desc_addr, uint64_t desc_len) { - struct vring_desc *idesc; + void *idesc; uint64_t src, dst; - uint64_t len, remain = desc->len; - uint64_t desc_addr = desc->addr; + uint64_t len, remain = desc_len; - idesc = rte_malloc(__func__, desc->len, 0); + idesc = rte_malloc(__func__, desc_len, 0); if (unlikely(!idesc)) return 0; @@ -66,17 +71,18 @@ alloc_copy_ind_table(struct virtio_net *dev, struct vhost_virtqueue *vq, } static __rte_always_inline void -free_ind_table(struct vring_desc *idesc) +free_ind_table(void *idesc) { rte_free(idesc); } static __rte_always_inline void -do_flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq, - uint16_t to, uint16_t from, uint16_t size) +do_flush_shadow_used_ring_split(struct virtio_net *dev, + struct vhost_virtqueue *vq, + uint16_t to, uint16_t from, uint16_t size) { rte_memcpy(&vq->used->ring[to], - &vq->shadow_used_ring[from], + &vq->shadow_used_split[from], size * sizeof(struct vring_used_elem)); vhost_log_cache_used_vring(dev, vq, offsetof(struct vring_used, ring[to]), @@ -84,22 +90,22 @@ do_flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq, } static __rte_always_inline void -flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq) +flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq) { uint16_t used_idx = vq->last_used_idx & (vq->size - 1); if (used_idx + vq->shadow_used_idx <= vq->size) { - do_flush_shadow_used_ring(dev, vq, used_idx, 0, + do_flush_shadow_used_ring_split(dev, vq, used_idx, 0, vq->shadow_used_idx); } else { uint16_t size; /* update used ring interval [used_idx, vq->size] */ size = vq->size - used_idx; - do_flush_shadow_used_ring(dev, vq, used_idx, 0, size); + do_flush_shadow_used_ring_split(dev, vq, used_idx, 0, size); /* update the left half used ring interval [0, left_size] */ - do_flush_shadow_used_ring(dev, vq, 0, size, + 
do_flush_shadow_used_ring_split(dev, vq, 0, size, vq->shadow_used_idx - size); } vq->last_used_idx += vq->shadow_used_idx; @@ -109,18 +115,84 @@ flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq) vhost_log_cache_sync(dev, vq); *(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx; + vq->shadow_used_idx = 0; vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx), sizeof(vq->used->idx)); } static __rte_always_inline void -update_shadow_used_ring(struct vhost_virtqueue *vq, +update_shadow_used_ring_split(struct vhost_virtqueue *vq, uint16_t desc_idx, uint16_t len) { uint16_t i = vq->shadow_used_idx++; - vq->shadow_used_ring[i].id = desc_idx; - vq->shadow_used_ring[i].len = len; + vq->shadow_used_split[i].id = desc_idx; + vq->shadow_used_split[i].len = len; +} + +static __rte_always_inline void +flush_shadow_used_ring_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq) +{ + int i; + uint16_t used_idx = vq->last_used_idx; + + /* Split loop in two to save memory barriers */ + for (i = 0; i < vq->shadow_used_idx; i++) { + vq->desc_packed[used_idx].id = vq->shadow_used_packed[i].id; + vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len; + + used_idx += vq->shadow_used_packed[i].count; + if (used_idx >= vq->size) + used_idx -= vq->size; + } + + rte_smp_wmb(); + + for (i = 0; i < vq->shadow_used_idx; i++) { + uint16_t flags; + + if (vq->shadow_used_packed[i].len) + flags = VRING_DESC_F_WRITE; + else + flags = 0; + + if (vq->used_wrap_counter) { + flags |= VRING_DESC_F_USED; + flags |= VRING_DESC_F_AVAIL; + } else { + flags &= ~VRING_DESC_F_USED; + flags &= ~VRING_DESC_F_AVAIL; + } + + vq->desc_packed[vq->last_used_idx].flags = flags; + + vhost_log_cache_used_vring(dev, vq, + vq->last_used_idx * + sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc)); + + vq->last_used_idx += vq->shadow_used_packed[i].count; + if (vq->last_used_idx >= vq->size) { + vq->used_wrap_counter ^= 1; + vq->last_used_idx -= vq->size; + } + } + + rte_smp_wmb(); + vq->shadow_used_idx = 0; + vhost_log_cache_sync(dev, vq); +} + +static __rte_always_inline void +update_shadow_used_ring_packed(struct vhost_virtqueue *vq, + uint16_t desc_idx, uint16_t len, uint16_t count) +{ + uint16_t i = vq->shadow_used_idx++; + + vq->shadow_used_packed[i].id = desc_idx; + vq->shadow_used_packed[i].len = len; + vq->shadow_used_packed[i].count = count; } static inline void @@ -135,6 +207,8 @@ do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq) vhost_log_cache_write(dev, vq, elem[i].log_addr, elem[i].len); PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0); } + + vq->batch_copy_nb_elems = 0; } static inline void @@ -146,6 +220,8 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq) for (i = 0; i < count; i++) rte_memcpy(elem[i].dst, elem[i].src, elem[i].len); + + vq->batch_copy_nb_elems = 0; } /* avoid write operation when necessary, to lessen cache issues */ @@ -154,7 +230,7 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq) (var) = (val); \ } while (0) -static void +static __rte_always_inline void virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr) { uint64_t csum_l4 = m_buf->ol_flags & PKT_TX_L4_MASK; @@ -216,324 +292,47 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr) } static __rte_always_inline int -copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, - struct vring_desc *descs, struct rte_mbuf *m, - uint16_t desc_idx, uint32_t size) +map_one_desc(struct virtio_net 
*dev, struct vhost_virtqueue *vq, + struct buf_vector *buf_vec, uint16_t *vec_idx, + uint64_t desc_iova, uint64_t desc_len, uint8_t perm) { - uint32_t desc_avail, desc_offset; - uint32_t mbuf_avail, mbuf_offset; - uint32_t cpy_len; - uint64_t desc_chunck_len; - struct vring_desc *desc; - uint64_t desc_addr, desc_gaddr; - /* A counter to avoid desc dead loop chain */ - uint16_t nr_desc = 1; - struct batch_copy_elem *batch_copy = vq->batch_copy_elems; - uint16_t copy_nb = vq->batch_copy_nb_elems; - int error = 0; + uint16_t vec_id = *vec_idx; - desc = &descs[desc_idx]; - desc_chunck_len = desc->len; - desc_gaddr = desc->addr; - desc_addr = vhost_iova_to_vva(dev, vq, desc_gaddr, - &desc_chunck_len, VHOST_ACCESS_RW); - /* - * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid - * performance issue with some versions of gcc (4.8.4 and 5.3.0) which - * otherwise stores offset on the stack instead of in a register. - */ - if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr) { - error = -1; - goto out; - } + while (desc_len) { + uint64_t desc_addr; + uint64_t desc_chunck_len = desc_len; - rte_prefetch0((void *)(uintptr_t)desc_addr); - - if (likely(desc_chunck_len >= dev->vhost_hlen)) { - virtio_enqueue_offload(m, - (struct virtio_net_hdr *)(uintptr_t)desc_addr); - PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0); - vhost_log_cache_write(dev, vq, desc_gaddr, dev->vhost_hlen); - } else { - struct virtio_net_hdr vnet_hdr; - uint64_t remain = dev->vhost_hlen; - uint64_t len; - uint64_t src = (uint64_t)(uintptr_t)&vnet_hdr, dst; - uint64_t guest_addr = desc_gaddr; - - virtio_enqueue_offload(m, &vnet_hdr); - - while (remain) { - len = remain; - dst = vhost_iova_to_vva(dev, vq, guest_addr, - &len, VHOST_ACCESS_RW); - if (unlikely(!dst || !len)) { - error = -1; - goto out; - } - - rte_memcpy((void *)(uintptr_t)dst, - (void *)(uintptr_t)src, len); - - PRINT_PACKET(dev, (uintptr_t)dst, (uint32_t)len, 0); - vhost_log_cache_write(dev, vq, guest_addr, len); - remain -= len; - guest_addr += len; - src += len; - } - } + if (unlikely(vec_id >= BUF_VECTOR_MAX)) + return -1; - desc_avail = desc->len - dev->vhost_hlen; - if (unlikely(desc_chunck_len < dev->vhost_hlen)) { - desc_chunck_len = desc_avail; - desc_gaddr = desc->addr + dev->vhost_hlen; - desc_addr = vhost_iova_to_vva(dev, - vq, desc_gaddr, + desc_addr = vhost_iova_to_vva(dev, vq, + desc_iova, &desc_chunck_len, - VHOST_ACCESS_RW); - if (unlikely(!desc_addr)) { - error = -1; - goto out; - } - - desc_offset = 0; - } else { - desc_offset = dev->vhost_hlen; - desc_chunck_len -= dev->vhost_hlen; - } - - mbuf_avail = rte_pktmbuf_data_len(m); - mbuf_offset = 0; - while (mbuf_avail != 0 || m->next != NULL) { - /* done with current mbuf, fetch next */ - if (mbuf_avail == 0) { - m = m->next; - - mbuf_offset = 0; - mbuf_avail = rte_pktmbuf_data_len(m); - } - - /* done with current desc buf, fetch next */ - if (desc_avail == 0) { - if ((desc->flags & VRING_DESC_F_NEXT) == 0) { - /* Room in vring buffer is not enough */ - error = -1; - goto out; - } - if (unlikely(desc->next >= size || ++nr_desc > size)) { - error = -1; - goto out; - } - - desc = &descs[desc->next]; - desc_chunck_len = desc->len; - desc_gaddr = desc->addr; - desc_addr = vhost_iova_to_vva(dev, vq, desc_gaddr, - &desc_chunck_len, - VHOST_ACCESS_RW); - if (unlikely(!desc_addr)) { - error = -1; - goto out; - } - - desc_offset = 0; - desc_avail = desc->len; - } else if (unlikely(desc_chunck_len == 0)) { - desc_chunck_len = desc_avail; - desc_gaddr += desc_offset; - desc_addr 
= vhost_iova_to_vva(dev, - vq, desc_gaddr, - &desc_chunck_len, VHOST_ACCESS_RW); - if (unlikely(!desc_addr)) { - error = -1; - goto out; - } - desc_offset = 0; - } - - cpy_len = RTE_MIN(desc_chunck_len, mbuf_avail); - if (likely(cpy_len > MAX_BATCH_LEN || copy_nb >= vq->size)) { - rte_memcpy((void *)((uintptr_t)(desc_addr + - desc_offset)), - rte_pktmbuf_mtod_offset(m, void *, mbuf_offset), - cpy_len); - vhost_log_cache_write(dev, vq, desc_gaddr + desc_offset, - cpy_len); - PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset), - cpy_len, 0); - } else { - batch_copy[copy_nb].dst = - (void *)((uintptr_t)(desc_addr + desc_offset)); - batch_copy[copy_nb].src = - rte_pktmbuf_mtod_offset(m, void *, mbuf_offset); - batch_copy[copy_nb].log_addr = desc_gaddr + desc_offset; - batch_copy[copy_nb].len = cpy_len; - copy_nb++; - } - - mbuf_avail -= cpy_len; - mbuf_offset += cpy_len; - desc_avail -= cpy_len; - desc_offset += cpy_len; - desc_chunck_len -= cpy_len; - } - -out: - vq->batch_copy_nb_elems = copy_nb; - - return error; -} - -/** - * This function adds buffers to the virtio devices RX virtqueue. Buffers can - * be received from the physical port or from another virtio device. A packet - * count is returned to indicate the number of packets that are successfully - * added to the RX queue. This function works when the mbuf is scattered, but - * it doesn't support the mergeable feature. - */ -static __rte_always_inline uint32_t -virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, - struct rte_mbuf **pkts, uint32_t count) -{ - struct vhost_virtqueue *vq; - uint16_t avail_idx, free_entries, start_idx; - uint16_t desc_indexes[MAX_PKT_BURST]; - struct vring_desc *descs; - uint16_t used_idx; - uint32_t i, sz; - - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); - if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) { - RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n", - dev->vid, __func__, queue_id); - return 0; - } - - vq = dev->virtqueue[queue_id]; - - rte_spinlock_lock(&vq->access_lock); - - if (unlikely(vq->enabled == 0)) - goto out_access_unlock; - - if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) - vhost_user_iotlb_rd_lock(vq); - - if (unlikely(vq->access_ok == 0)) { - if (unlikely(vring_translate(dev, vq) < 0)) { - count = 0; - goto out; - } - } - - avail_idx = *((volatile uint16_t *)&vq->avail->idx); - start_idx = vq->last_used_idx; - free_entries = avail_idx - start_idx; - count = RTE_MIN(count, free_entries); - count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST); - if (count == 0) - goto out; - - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n", - dev->vid, start_idx, start_idx + count); - - vq->batch_copy_nb_elems = 0; - - /* Retrieve all of the desc indexes first to avoid caching issues. 
*/ - rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]); - for (i = 0; i < count; i++) { - used_idx = (start_idx + i) & (vq->size - 1); - desc_indexes[i] = vq->avail->ring[used_idx]; - vq->used->ring[used_idx].id = desc_indexes[i]; - vq->used->ring[used_idx].len = pkts[i]->pkt_len + - dev->vhost_hlen; - vhost_log_cache_used_vring(dev, vq, - offsetof(struct vring_used, ring[used_idx]), - sizeof(vq->used->ring[used_idx])); - } - - rte_prefetch0(&vq->desc[desc_indexes[0]]); - for (i = 0; i < count; i++) { - struct vring_desc *idesc = NULL; - uint16_t desc_idx = desc_indexes[i]; - int err; - - if (vq->desc[desc_idx].flags & VRING_DESC_F_INDIRECT) { - uint64_t dlen = vq->desc[desc_idx].len; - descs = (struct vring_desc *)(uintptr_t) - vhost_iova_to_vva(dev, - vq, vq->desc[desc_idx].addr, - &dlen, VHOST_ACCESS_RO); - if (unlikely(!descs)) { - count = i; - break; - } - - if (unlikely(dlen < vq->desc[desc_idx].len)) { - /* - * The indirect desc table is not contiguous - * in process VA space, we have to copy it. - */ - idesc = alloc_copy_ind_table(dev, vq, - &vq->desc[desc_idx]); - if (unlikely(!idesc)) - break; - - descs = idesc; - } - - desc_idx = 0; - sz = vq->desc[desc_idx].len / sizeof(*descs); - } else { - descs = vq->desc; - sz = vq->size; - } - - err = copy_mbuf_to_desc(dev, vq, descs, pkts[i], desc_idx, sz); - if (unlikely(err)) { - count = i; - free_ind_table(idesc); - break; - } + perm); + if (unlikely(!desc_addr)) + return -1; - if (i + 1 < count) - rte_prefetch0(&vq->desc[desc_indexes[i+1]]); + buf_vec[vec_id].buf_iova = desc_iova; + buf_vec[vec_id].buf_addr = desc_addr; + buf_vec[vec_id].buf_len = desc_chunck_len; - if (unlikely(!!idesc)) - free_ind_table(idesc); + desc_len -= desc_chunck_len; + desc_iova += desc_chunck_len; + vec_id++; } + *vec_idx = vec_id; - do_data_copy_enqueue(dev, vq); - - rte_smp_wmb(); - - vhost_log_cache_sync(dev, vq); - - *(volatile uint16_t *)&vq->used->idx += count; - vq->last_used_idx += count; - vhost_log_used_vring(dev, vq, - offsetof(struct vring_used, idx), - sizeof(vq->used->idx)); - - vhost_vring_call(dev, vq); -out: - if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) - vhost_user_iotlb_rd_unlock(vq); - -out_access_unlock: - rte_spinlock_unlock(&vq->access_lock); - - return count; + return 0; } static __rte_always_inline int -fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq, - uint32_t avail_idx, uint32_t *vec_idx, +fill_vec_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint32_t avail_idx, uint16_t *vec_idx, struct buf_vector *buf_vec, uint16_t *desc_chain_head, - uint16_t *desc_chain_len) + uint16_t *desc_chain_len, uint8_t perm) { uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)]; - uint32_t vec_id = *vec_idx; + uint16_t vec_id = *vec_idx; uint32_t len = 0; uint64_t dlen; struct vring_desc *descs = vq->desc; @@ -555,7 +354,8 @@ fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq, * The indirect desc table is not contiguous * in process VA space, we have to copy it. 
*/ - idesc = alloc_copy_ind_table(dev, vq, &vq->desc[idx]); + idesc = alloc_copy_ind_table(dev, vq, + vq->desc[idx].addr, vq->desc[idx].len); if (unlikely(!idesc)) return -1; @@ -566,16 +366,19 @@ fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq, } while (1) { - if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size)) { + if (unlikely(idx >= vq->size)) { free_ind_table(idesc); return -1; } len += descs[idx].len; - buf_vec[vec_id].buf_addr = descs[idx].addr; - buf_vec[vec_id].buf_len = descs[idx].len; - buf_vec[vec_id].desc_idx = idx; - vec_id++; + + if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id, + descs[idx].addr, descs[idx].len, + perm))) { + free_ind_table(idesc); + return -1; + } if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0) break; @@ -596,13 +399,14 @@ fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq, * Returns -1 on fail, 0 on success */ static inline int -reserve_avail_buf_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq, +reserve_avail_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq, uint32_t size, struct buf_vector *buf_vec, - uint16_t *num_buffers, uint16_t avail_head) + uint16_t *num_buffers, uint16_t avail_head, + uint16_t *nr_vec) { uint16_t cur_idx; - uint32_t vec_idx = 0; - uint16_t tries = 0; + uint16_t vec_idx = 0; + uint16_t max_tries, tries = 0; uint16_t head_idx = 0; uint16_t len = 0; @@ -610,49 +414,223 @@ reserve_avail_buf_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq, *num_buffers = 0; cur_idx = vq->last_avail_idx; + if (rxvq_is_mergeable(dev)) + max_tries = vq->size - 1; + else + max_tries = 1; + while (size > 0) { if (unlikely(cur_idx == avail_head)) return -1; + /* + * if we tried all available ring items, and still + * can't get enough buf, it means something abnormal + * happened. + */ + if (unlikely(++tries > max_tries)) + return -1; - if (unlikely(fill_vec_buf(dev, vq, cur_idx, &vec_idx, buf_vec, - &head_idx, &len) < 0)) + if (unlikely(fill_vec_buf_split(dev, vq, cur_idx, + &vec_idx, buf_vec, + &head_idx, &len, + VHOST_ACCESS_RW) < 0)) return -1; len = RTE_MIN(len, size); - update_shadow_used_ring(vq, head_idx, len); + update_shadow_used_ring_split(vq, head_idx, len); size -= len; cur_idx++; - tries++; *num_buffers += 1; + } + + *nr_vec = vec_idx; + + return 0; +} + +static __rte_always_inline int +fill_vec_buf_packed_indirect(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct vring_packed_desc *desc, uint16_t *vec_idx, + struct buf_vector *buf_vec, uint16_t *len, uint8_t perm) +{ + uint16_t i; + uint32_t nr_descs; + uint16_t vec_id = *vec_idx; + uint64_t dlen; + struct vring_packed_desc *descs, *idescs = NULL; + + dlen = desc->len; + descs = (struct vring_packed_desc *)(uintptr_t) + vhost_iova_to_vva(dev, vq, desc->addr, &dlen, VHOST_ACCESS_RO); + if (unlikely(!descs)) + return -1; + + if (unlikely(dlen < desc->len)) { + /* + * The indirect desc table is not contiguous + * in process VA space, we have to copy it. 
+ */ + idescs = alloc_copy_ind_table(dev, vq, desc->addr, desc->len); + if (unlikely(!idescs)) + return -1; + + descs = idescs; + } + + nr_descs = desc->len / sizeof(struct vring_packed_desc); + if (unlikely(nr_descs >= vq->size)) { + free_ind_table(idescs); + return -1; + } + + for (i = 0; i < nr_descs; i++) { + if (unlikely(vec_id >= BUF_VECTOR_MAX)) { + free_ind_table(idescs); + return -1; + } + + *len += descs[i].len; + if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id, + descs[i].addr, descs[i].len, + perm))) + return -1; + } + *vec_idx = vec_id; + + if (unlikely(!!idescs)) + free_ind_table(idescs); + + return 0; +} + +static __rte_always_inline int +fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint16_t avail_idx, uint16_t *desc_count, + struct buf_vector *buf_vec, uint16_t *vec_idx, + uint16_t *buf_id, uint16_t *len, uint8_t perm) +{ + bool wrap_counter = vq->avail_wrap_counter; + struct vring_packed_desc *descs = vq->desc_packed; + uint16_t vec_id = *vec_idx; + + if (avail_idx < vq->last_avail_idx) + wrap_counter ^= 1; + + if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter))) + return -1; + + *desc_count = 0; + + while (1) { + if (unlikely(vec_id >= BUF_VECTOR_MAX)) + return -1; + + *desc_count += 1; + *buf_id = descs[avail_idx].id; + + if (descs[avail_idx].flags & VRING_DESC_F_INDIRECT) { + if (unlikely(fill_vec_buf_packed_indirect(dev, vq, + &descs[avail_idx], + &vec_id, buf_vec, + len, perm) < 0)) + return -1; + } else { + *len += descs[avail_idx].len; + + if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id, + descs[avail_idx].addr, + descs[avail_idx].len, + perm))) + return -1; + } + + if ((descs[avail_idx].flags & VRING_DESC_F_NEXT) == 0) + break; + + if (++avail_idx >= vq->size) { + avail_idx -= vq->size; + wrap_counter ^= 1; + } + } + + *vec_idx = vec_id; + + return 0; +} + +/* + * Returns -1 on fail, 0 on success + */ +static inline int +reserve_avail_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint32_t size, struct buf_vector *buf_vec, + uint16_t *nr_vec, uint16_t *num_buffers, + uint16_t *nr_descs) +{ + uint16_t avail_idx; + uint16_t vec_idx = 0; + uint16_t max_tries, tries = 0; + + uint16_t buf_id = 0; + uint16_t len = 0; + uint16_t desc_count; + + *num_buffers = 0; + avail_idx = vq->last_avail_idx; + if (rxvq_is_mergeable(dev)) + max_tries = vq->size - 1; + else + max_tries = 1; + + while (size > 0) { /* * if we tried all available ring items, and still * can't get enough buf, it means something abnormal * happened. 
*/ - if (unlikely(tries >= vq->size)) + if (unlikely(++tries > max_tries)) return -1; + + if (unlikely(fill_vec_buf_packed(dev, vq, + avail_idx, &desc_count, + buf_vec, &vec_idx, + &buf_id, &len, + VHOST_ACCESS_RO) < 0)) + return -1; + + len = RTE_MIN(len, size); + update_shadow_used_ring_packed(vq, buf_id, len, desc_count); + size -= len; + + avail_idx += desc_count; + if (avail_idx >= vq->size) + avail_idx -= vq->size; + + *nr_descs += desc_count; + *num_buffers += 1; } + *nr_vec = vec_idx; + return 0; } static __rte_always_inline int -copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq, +copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf *m, struct buf_vector *buf_vec, - uint16_t num_buffers) + uint16_t nr_vec, uint16_t num_buffers) { uint32_t vec_idx = 0; - uint64_t desc_addr, desc_gaddr; uint32_t mbuf_offset, mbuf_avail; - uint32_t desc_offset, desc_avail; + uint32_t buf_offset, buf_avail; + uint64_t buf_addr, buf_iova, buf_len; uint32_t cpy_len; - uint64_t desc_chunck_len; - uint64_t hdr_addr, hdr_phys_addr; + uint64_t hdr_addr; struct rte_mbuf *hdr_mbuf; struct batch_copy_elem *batch_copy = vq->batch_copy_elems; struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL; - uint16_t copy_nb = vq->batch_copy_nb_elems; int error = 0; if (unlikely(m == NULL)) { @@ -660,82 +638,61 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq, goto out; } - desc_chunck_len = buf_vec[vec_idx].buf_len; - desc_gaddr = buf_vec[vec_idx].buf_addr; - desc_addr = vhost_iova_to_vva(dev, vq, - desc_gaddr, - &desc_chunck_len, - VHOST_ACCESS_RW); - if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr) { + buf_addr = buf_vec[vec_idx].buf_addr; + buf_iova = buf_vec[vec_idx].buf_iova; + buf_len = buf_vec[vec_idx].buf_len; + + if (nr_vec > 1) + rte_prefetch0((void *)(uintptr_t)buf_vec[1].buf_addr); + + if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) { error = -1; goto out; } hdr_mbuf = m; - hdr_addr = desc_addr; - if (unlikely(desc_chunck_len < dev->vhost_hlen)) + hdr_addr = buf_addr; + if (unlikely(buf_len < dev->vhost_hlen)) hdr = &tmp_hdr; else hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr; - hdr_phys_addr = desc_gaddr; - rte_prefetch0((void *)(uintptr_t)hdr_addr); VHOST_LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n", dev->vid, num_buffers); - desc_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen; - if (unlikely(desc_chunck_len < dev->vhost_hlen)) { - desc_chunck_len = desc_avail; - desc_gaddr += dev->vhost_hlen; - desc_addr = vhost_iova_to_vva(dev, vq, - desc_gaddr, - &desc_chunck_len, - VHOST_ACCESS_RW); - if (unlikely(!desc_addr)) { - error = -1; - goto out; - } - - desc_offset = 0; + if (unlikely(buf_len < dev->vhost_hlen)) { + buf_offset = dev->vhost_hlen - buf_len; + vec_idx++; + buf_addr = buf_vec[vec_idx].buf_addr; + buf_iova = buf_vec[vec_idx].buf_iova; + buf_len = buf_vec[vec_idx].buf_len; + buf_avail = buf_len - buf_offset; } else { - desc_offset = dev->vhost_hlen; - desc_chunck_len -= dev->vhost_hlen; + buf_offset = dev->vhost_hlen; + buf_avail = buf_len - dev->vhost_hlen; } - mbuf_avail = rte_pktmbuf_data_len(m); mbuf_offset = 0; while (mbuf_avail != 0 || m->next != NULL) { - /* done with current desc buf, get the next one */ - if (desc_avail == 0) { + /* done with current buf, get the next one */ + if (buf_avail == 0) { vec_idx++; - desc_chunck_len = buf_vec[vec_idx].buf_len; - desc_gaddr = buf_vec[vec_idx].buf_addr; - desc_addr = - vhost_iova_to_vva(dev, vq, - 
desc_gaddr, - &desc_chunck_len, - VHOST_ACCESS_RW); - if (unlikely(!desc_addr)) { + if (unlikely(vec_idx >= nr_vec)) { error = -1; goto out; } - /* Prefetch buffer address. */ - rte_prefetch0((void *)(uintptr_t)desc_addr); - desc_offset = 0; - desc_avail = buf_vec[vec_idx].buf_len; - } else if (unlikely(desc_chunck_len == 0)) { - desc_chunck_len = desc_avail; - desc_gaddr += desc_offset; - desc_addr = vhost_iova_to_vva(dev, vq, - desc_gaddr, - &desc_chunck_len, VHOST_ACCESS_RW); - if (unlikely(!desc_addr)) { - error = -1; - goto out; - } - desc_offset = 0; + buf_addr = buf_vec[vec_idx].buf_addr; + buf_iova = buf_vec[vec_idx].buf_iova; + buf_len = buf_vec[vec_idx].buf_len; + + /* Prefetch next buffer address. */ + if (vec_idx + 1 < nr_vec) + rte_prefetch0((void *)(uintptr_t) + buf_vec[vec_idx + 1].buf_addr); + buf_offset = 0; + buf_avail = buf_len; } /* done with current mbuf, get the next one */ @@ -748,24 +705,21 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq, if (hdr_addr) { virtio_enqueue_offload(hdr_mbuf, &hdr->hdr); - ASSIGN_UNLESS_EQUAL(hdr->num_buffers, num_buffers); + if (rxvq_is_mergeable(dev)) + ASSIGN_UNLESS_EQUAL(hdr->num_buffers, + num_buffers); if (unlikely(hdr == &tmp_hdr)) { uint64_t len; uint64_t remain = dev->vhost_hlen; uint64_t src = (uint64_t)(uintptr_t)hdr, dst; - uint64_t guest_addr = hdr_phys_addr; + uint64_t iova = buf_vec[0].buf_iova; + uint16_t hdr_vec_idx = 0; while (remain) { - len = remain; - dst = vhost_iova_to_vva(dev, vq, - guest_addr, &len, - VHOST_ACCESS_RW); - if (unlikely(!dst || !len)) { - error = -1; - goto out; - } - + len = RTE_MIN(remain, + buf_vec[hdr_vec_idx].buf_len); + dst = buf_vec[hdr_vec_idx].buf_addr; rte_memcpy((void *)(uintptr_t)dst, (void *)(uintptr_t)src, len); @@ -773,103 +727,125 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq, PRINT_PACKET(dev, (uintptr_t)dst, (uint32_t)len, 0); vhost_log_cache_write(dev, vq, - guest_addr, len); + iova, len); remain -= len; - guest_addr += len; + iova += len; src += len; + hdr_vec_idx++; } } else { PRINT_PACKET(dev, (uintptr_t)hdr_addr, dev->vhost_hlen, 0); - vhost_log_cache_write(dev, vq, hdr_phys_addr, + vhost_log_cache_write(dev, vq, + buf_vec[0].buf_iova, dev->vhost_hlen); } hdr_addr = 0; } - cpy_len = RTE_MIN(desc_chunck_len, mbuf_avail); + cpy_len = RTE_MIN(buf_avail, mbuf_avail); - if (likely(cpy_len > MAX_BATCH_LEN || copy_nb >= vq->size)) { - rte_memcpy((void *)((uintptr_t)(desc_addr + - desc_offset)), + if (likely(cpy_len > MAX_BATCH_LEN || + vq->batch_copy_nb_elems >= vq->size)) { + rte_memcpy((void *)((uintptr_t)(buf_addr + buf_offset)), rte_pktmbuf_mtod_offset(m, void *, mbuf_offset), cpy_len); - vhost_log_cache_write(dev, vq, desc_gaddr + desc_offset, + vhost_log_cache_write(dev, vq, buf_iova + buf_offset, cpy_len); - PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset), + PRINT_PACKET(dev, (uintptr_t)(buf_addr + buf_offset), cpy_len, 0); } else { - batch_copy[copy_nb].dst = - (void *)((uintptr_t)(desc_addr + desc_offset)); - batch_copy[copy_nb].src = + batch_copy[vq->batch_copy_nb_elems].dst = + (void *)((uintptr_t)(buf_addr + buf_offset)); + batch_copy[vq->batch_copy_nb_elems].src = rte_pktmbuf_mtod_offset(m, void *, mbuf_offset); - batch_copy[copy_nb].log_addr = desc_gaddr + desc_offset; - batch_copy[copy_nb].len = cpy_len; - copy_nb++; + batch_copy[vq->batch_copy_nb_elems].log_addr = + buf_iova + buf_offset; + batch_copy[vq->batch_copy_nb_elems].len = cpy_len; + vq->batch_copy_nb_elems++; } mbuf_avail -= cpy_len; 
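The small copies staged into batch_copy above are replayed later by do_data_copy_enqueue(), whose definition is outside this section. A plausible sketch of that flush, assuming only the batch_copy_elem fields (dst, src, len, log_addr) that the enqueue path fills in:

static __rte_always_inline void
do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
{
	struct batch_copy_elem *elem = vq->batch_copy_elems;
	uint16_t count = vq->batch_copy_nb_elems;
	uint16_t i;

	/*
	 * Replay the deferred small copies back to back, then log each
	 * destination range for live migration.
	 */
	for (i = 0; i < count; i++) {
		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
		vhost_log_cache_write(dev, vq, elem[i].log_addr,
				elem[i].len);
	}

	vq->batch_copy_nb_elems = 0;
}

Deferring copies of up to MAX_BATCH_LEN bytes keeps the descriptor walk tight; larger copies are done inline, as the branch above shows.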
mbuf_offset += cpy_len; - desc_avail -= cpy_len; - desc_offset += cpy_len; - desc_chunck_len -= cpy_len; + buf_avail -= cpy_len; + buf_offset += cpy_len; } out: - vq->batch_copy_nb_elems = copy_nb; return error; } static __rte_always_inline uint32_t -virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id, +virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts, uint32_t count) { - struct vhost_virtqueue *vq; uint32_t pkt_idx = 0; uint16_t num_buffers; struct buf_vector buf_vec[BUF_VECTOR_MAX]; uint16_t avail_head; - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); - if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) { - RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n", - dev->vid, __func__, queue_id); - return 0; - } + rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]); + avail_head = *((volatile uint16_t *)&vq->avail->idx); - vq = dev->virtqueue[queue_id]; + for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { + uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen; + uint16_t nr_vec = 0; - rte_spinlock_lock(&vq->access_lock); + if (unlikely(reserve_avail_buf_split(dev, vq, + pkt_len, buf_vec, &num_buffers, + avail_head, &nr_vec) < 0)) { + VHOST_LOG_DEBUG(VHOST_DATA, + "(%d) failed to get enough desc from vring\n", + dev->vid); + vq->shadow_used_idx -= num_buffers; + break; + } - if (unlikely(vq->enabled == 0)) - goto out_access_unlock; + rte_prefetch0((void *)(uintptr_t)buf_vec[0].buf_addr); - if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) - vhost_user_iotlb_rd_lock(vq); + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", + dev->vid, vq->last_avail_idx, + vq->last_avail_idx + num_buffers); - if (unlikely(vq->access_ok == 0)) - if (unlikely(vring_translate(dev, vq) < 0)) - goto out; + if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx], + buf_vec, nr_vec, + num_buffers) < 0) { + vq->shadow_used_idx -= num_buffers; + break; + } - count = RTE_MIN((uint32_t)MAX_PKT_BURST, count); - if (count == 0) - goto out; + vq->last_avail_idx += num_buffers; + } - vq->batch_copy_nb_elems = 0; + do_data_copy_enqueue(dev, vq); - rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]); + if (likely(vq->shadow_used_idx)) { + flush_shadow_used_ring_split(dev, vq); + vhost_vring_call_split(dev, vq); + } + + return pkt_idx; +} + +static __rte_always_inline uint32_t +virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mbuf **pkts, uint32_t count) +{ + uint32_t pkt_idx = 0; + uint16_t num_buffers; + struct buf_vector buf_vec[BUF_VECTOR_MAX]; - vq->shadow_used_idx = 0; - avail_head = *((volatile uint16_t *)&vq->avail->idx); for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen; + uint16_t nr_vec = 0; + uint16_t nr_descs = 0; - if (unlikely(reserve_avail_buf_mergeable(dev, vq, - pkt_len, buf_vec, &num_buffers, - avail_head) < 0)) { + if (unlikely(reserve_avail_buf_packed(dev, vq, + pkt_len, buf_vec, &nr_vec, + &num_buffers, &nr_descs) < 0)) { VHOST_LOG_DEBUG(VHOST_DATA, "(%d) failed to get enough desc from vring\n", dev->vid); @@ -877,26 +853,72 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id, break; } + rte_prefetch0((void *)(uintptr_t)buf_vec[0].buf_addr); + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", dev->vid, vq->last_avail_idx, vq->last_avail_idx + num_buffers); - if (copy_mbuf_to_desc_mergeable(dev, vq, pkts[pkt_idx], - buf_vec, 
num_buffers) < 0) { + if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx], + buf_vec, nr_vec, + num_buffers) < 0) { vq->shadow_used_idx -= num_buffers; break; } - vq->last_avail_idx += num_buffers; + vq->last_avail_idx += nr_descs; + if (vq->last_avail_idx >= vq->size) { + vq->last_avail_idx -= vq->size; + vq->avail_wrap_counter ^= 1; + } } do_data_copy_enqueue(dev, vq); if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring(dev, vq); - vhost_vring_call(dev, vq); + flush_shadow_used_ring_packed(dev, vq); + vhost_vring_call_packed(dev, vq); } + return pkt_idx; +} + +static __rte_always_inline uint32_t +virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, + struct rte_mbuf **pkts, uint32_t count) +{ + struct vhost_virtqueue *vq; + + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); + if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) { + RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n", + dev->vid, __func__, queue_id); + return 0; + } + + vq = dev->virtqueue[queue_id]; + + rte_spinlock_lock(&vq->access_lock); + + if (unlikely(vq->enabled == 0)) + goto out_access_unlock; + + if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) + vhost_user_iotlb_rd_lock(vq); + + if (unlikely(vq->access_ok == 0)) + if (unlikely(vring_translate(dev, vq) < 0)) + goto out; + + count = RTE_MIN((uint32_t)MAX_PKT_BURST, count); + if (count == 0) + goto out; + + if (vq_is_packed(dev)) + count = virtio_dev_rx_packed(dev, vq, pkts, count); + else + count = virtio_dev_rx_split(dev, vq, pkts, count); + out: if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) vhost_user_iotlb_rd_unlock(vq); @@ -904,7 +926,7 @@ out: out_access_unlock: rte_spinlock_unlock(&vq->access_lock); - return pkt_idx; + return count; } uint16_t @@ -923,10 +945,7 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id, return 0; } - if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) - return virtio_dev_merge_rx(dev, queue_id, pkts, count); - else - return virtio_dev_rx(dev, queue_id, pkts, count); + return virtio_dev_rx(dev, queue_id, pkts, count); } static inline bool @@ -1051,76 +1070,60 @@ put_zmbuf(struct zcopy_mbuf *zmbuf) static __rte_always_inline int copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, - struct vring_desc *descs, uint16_t max_desc, - struct rte_mbuf *m, uint16_t desc_idx, - struct rte_mempool *mbuf_pool) + struct buf_vector *buf_vec, uint16_t nr_vec, + struct rte_mbuf *m, struct rte_mempool *mbuf_pool) { - struct vring_desc *desc; - uint64_t desc_addr, desc_gaddr; - uint32_t desc_avail, desc_offset; + uint32_t buf_avail, buf_offset; + uint64_t buf_addr, buf_iova, buf_len; uint32_t mbuf_avail, mbuf_offset; uint32_t cpy_len; - uint64_t desc_chunck_len; struct rte_mbuf *cur = m, *prev = m; struct virtio_net_hdr tmp_hdr; struct virtio_net_hdr *hdr = NULL; /* A counter to avoid desc dead loop chain */ - uint32_t nr_desc = 1; + uint16_t vec_idx = 0; struct batch_copy_elem *batch_copy = vq->batch_copy_elems; - uint16_t copy_nb = vq->batch_copy_nb_elems; int error = 0; - desc = &descs[desc_idx]; - if (unlikely((desc->len < dev->vhost_hlen)) || - (desc->flags & VRING_DESC_F_INDIRECT)) { - error = -1; - goto out; - } + buf_addr = buf_vec[vec_idx].buf_addr; + buf_iova = buf_vec[vec_idx].buf_iova; + buf_len = buf_vec[vec_idx].buf_len; - desc_chunck_len = desc->len; - desc_gaddr = desc->addr; - desc_addr = vhost_iova_to_vva(dev, - vq, desc_gaddr, - &desc_chunck_len, - VHOST_ACCESS_RO); - if (unlikely(!desc_addr)) { + if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) { 
error = -1; goto out; } + if (likely(nr_vec > 1)) + rte_prefetch0((void *)(uintptr_t)buf_vec[1].buf_addr); + if (virtio_net_with_host_offload(dev)) { - if (unlikely(desc_chunck_len < sizeof(struct virtio_net_hdr))) { - uint64_t len = desc_chunck_len; + if (unlikely(buf_len < sizeof(struct virtio_net_hdr))) { + uint64_t len; uint64_t remain = sizeof(struct virtio_net_hdr); - uint64_t src = desc_addr; + uint64_t src; uint64_t dst = (uint64_t)(uintptr_t)&tmp_hdr; - uint64_t guest_addr = desc_gaddr; + uint16_t hdr_vec_idx = 0; /* * No luck, the virtio-net header doesn't fit * in a contiguous virtual area. */ while (remain) { - len = remain; - src = vhost_iova_to_vva(dev, vq, - guest_addr, &len, - VHOST_ACCESS_RO); - if (unlikely(!src || !len)) { - error = -1; - goto out; - } - + len = RTE_MIN(remain, + buf_vec[hdr_vec_idx].buf_len); + src = buf_vec[hdr_vec_idx].buf_addr; rte_memcpy((void *)(uintptr_t)dst, (void *)(uintptr_t)src, len); - guest_addr += len; remain -= len; dst += len; + hdr_vec_idx++; } hdr = &tmp_hdr; } else { - hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr); + hdr = (struct virtio_net_hdr *)((uintptr_t)buf_addr); rte_prefetch0(hdr); } } @@ -1130,61 +1133,40 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, * for Tx: the first for storing the header, and others * for storing the data. */ - if (likely((desc->len == dev->vhost_hlen) && - (desc->flags & VRING_DESC_F_NEXT) != 0)) { - desc = &descs[desc->next]; - if (unlikely(desc->flags & VRING_DESC_F_INDIRECT)) { - error = -1; + if (unlikely(buf_len < dev->vhost_hlen)) { + buf_offset = dev->vhost_hlen - buf_len; + vec_idx++; + buf_addr = buf_vec[vec_idx].buf_addr; + buf_iova = buf_vec[vec_idx].buf_iova; + buf_len = buf_vec[vec_idx].buf_len; + buf_avail = buf_len - buf_offset; + } else if (buf_len == dev->vhost_hlen) { + if (unlikely(++vec_idx >= nr_vec)) goto out; - } - - desc_chunck_len = desc->len; - desc_gaddr = desc->addr; - desc_addr = vhost_iova_to_vva(dev, - vq, desc_gaddr, - &desc_chunck_len, - VHOST_ACCESS_RO); - if (unlikely(!desc_addr)) { - error = -1; - goto out; - } + buf_addr = buf_vec[vec_idx].buf_addr; + buf_iova = buf_vec[vec_idx].buf_iova; + buf_len = buf_vec[vec_idx].buf_len; - desc_offset = 0; - desc_avail = desc->len; - nr_desc += 1; + buf_offset = 0; + buf_avail = buf_len; } else { - desc_avail = desc->len - dev->vhost_hlen; - - if (unlikely(desc_chunck_len < dev->vhost_hlen)) { - desc_chunck_len = desc_avail; - desc_gaddr += dev->vhost_hlen; - desc_addr = vhost_iova_to_vva(dev, - vq, desc_gaddr, - &desc_chunck_len, - VHOST_ACCESS_RO); - if (unlikely(!desc_addr)) { - error = -1; - goto out; - } - - desc_offset = 0; - } else { - desc_offset = dev->vhost_hlen; - desc_chunck_len -= dev->vhost_hlen; - } + buf_offset = dev->vhost_hlen; + buf_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen; } - rte_prefetch0((void *)(uintptr_t)(desc_addr + desc_offset)); + rte_prefetch0((void *)(uintptr_t) + (buf_addr + buf_offset)); - PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset), - (uint32_t)desc_chunck_len, 0); + PRINT_PACKET(dev, + (uintptr_t)(buf_addr + buf_offset), + (uint32_t)buf_avail, 0); mbuf_offset = 0; mbuf_avail = m->buf_len - RTE_PKTMBUF_HEADROOM; while (1) { uint64_t hpa; - cpy_len = RTE_MIN(desc_chunck_len, mbuf_avail); + cpy_len = RTE_MIN(buf_avail, mbuf_avail); /* * A desc buf might across two host physical pages that are @@ -1192,11 +1174,11 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, * will be copied even though zero copy is enabled. 
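In the zero-copy branch that follows, the guest buffer is attached to the mbuf directly instead of being copied, so the guest descriptor can only be recycled once the mbuf's consumer is done with it. The helper that detects this, mbuf_is_consumed() (called from the dequeue functions later in this patch), is not shown here; a sketch under the refcount "pin lock" scheme the surrounding comments describe:

#include <stdbool.h>
#include <rte_mbuf.h>

static inline bool
mbuf_is_consumed(struct rte_mbuf *m)
{
	/*
	 * Dequeue bumped each zero-copy mbuf's refcnt by one; once every
	 * segment in the chain is back to refcnt 1, vhost is the last
	 * user and the guest buffer behind it may be returned.
	 */
	while (m) {
		if (rte_mbuf_refcnt_read(m) > 1)
			return false;
		m = m->next;
	}

	return true;
}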
*/ if (unlikely(dev->dequeue_zero_copy && (hpa = gpa_to_hpa(dev, - desc_gaddr + desc_offset, cpy_len)))) { + buf_iova + buf_offset, cpy_len)))) { cur->data_len = cpy_len; cur->data_off = 0; - cur->buf_addr = (void *)(uintptr_t)(desc_addr - + desc_offset); + cur->buf_addr = + (void *)(uintptr_t)(buf_addr + buf_offset); cur->buf_iova = hpa; /* @@ -1206,81 +1188,53 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, mbuf_avail = cpy_len; } else { if (likely(cpy_len > MAX_BATCH_LEN || - copy_nb >= vq->size || - (hdr && cur == m) || - desc->len != desc_chunck_len)) { + vq->batch_copy_nb_elems >= vq->size || + (hdr && cur == m))) { rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, mbuf_offset), - (void *)((uintptr_t)(desc_addr + - desc_offset)), + (void *)((uintptr_t)(buf_addr + + buf_offset)), cpy_len); } else { - batch_copy[copy_nb].dst = + batch_copy[vq->batch_copy_nb_elems].dst = rte_pktmbuf_mtod_offset(cur, void *, mbuf_offset); - batch_copy[copy_nb].src = - (void *)((uintptr_t)(desc_addr + - desc_offset)); - batch_copy[copy_nb].len = cpy_len; - copy_nb++; + batch_copy[vq->batch_copy_nb_elems].src = + (void *)((uintptr_t)(buf_addr + + buf_offset)); + batch_copy[vq->batch_copy_nb_elems].len = + cpy_len; + vq->batch_copy_nb_elems++; } } mbuf_avail -= cpy_len; mbuf_offset += cpy_len; - desc_avail -= cpy_len; - desc_chunck_len -= cpy_len; - desc_offset += cpy_len; + buf_avail -= cpy_len; + buf_offset += cpy_len; - /* This desc reaches to its end, get the next one */ - if (desc_avail == 0) { - if ((desc->flags & VRING_DESC_F_NEXT) == 0) + /* This buf reaches its end, get the next one */ + if (buf_avail == 0) { + if (++vec_idx >= nr_vec) break; - if (unlikely(desc->next >= max_desc || - ++nr_desc > max_desc)) { - error = -1; - goto out; - } - desc = &descs[desc->next]; - if (unlikely(desc->flags & VRING_DESC_F_INDIRECT)) { - error = -1; - goto out; - } + buf_addr = buf_vec[vec_idx].buf_addr; + buf_iova = buf_vec[vec_idx].buf_iova; + buf_len = buf_vec[vec_idx].buf_len; - desc_chunck_len = desc->len; - desc_gaddr = desc->addr; - desc_addr = vhost_iova_to_vva(dev, - vq, desc_gaddr, - &desc_chunck_len, - VHOST_ACCESS_RO); - if (unlikely(!desc_addr)) { - error = -1; - goto out; - } + /* + * Prefetch desc n + 1 buffer while + * desc n buffer is processed. 
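do_data_copy_dequeue(), the dequeue-side counterpart of the batch flush, is likewise defined outside this section. A sketch under the same assumptions; since these copies go from guest memory into mbufs, no dirty-page logging is needed:

static __rte_always_inline void
do_data_copy_dequeue(struct vhost_virtqueue *vq)
{
	struct batch_copy_elem *elem = vq->batch_copy_elems;
	uint16_t count = vq->batch_copy_nb_elems;
	uint16_t i;

	/* Guest-to-mbuf copies: replay them and reset the batch. */
	for (i = 0; i < count; i++)
		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);

	vq->batch_copy_nb_elems = 0;
}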
+ */ + if (vec_idx + 1 < nr_vec) + rte_prefetch0((void *)(uintptr_t) + buf_vec[vec_idx + 1].buf_addr); - rte_prefetch0((void *)(uintptr_t)desc_addr); - - desc_offset = 0; - desc_avail = desc->len; - - PRINT_PACKET(dev, (uintptr_t)desc_addr, - (uint32_t)desc_chunck_len, 0); - } else if (unlikely(desc_chunck_len == 0)) { - desc_chunck_len = desc_avail; - desc_gaddr += desc_offset; - desc_addr = vhost_iova_to_vva(dev, vq, - desc_gaddr, - &desc_chunck_len, - VHOST_ACCESS_RO); - if (unlikely(!desc_addr)) { - error = -1; - goto out; - } - desc_offset = 0; + buf_offset = 0; + buf_avail = buf_len; - PRINT_PACKET(dev, (uintptr_t)desc_addr, - (uint32_t)desc_chunck_len, 0); + PRINT_PACKET(dev, (uintptr_t)buf_addr, + (uint32_t)buf_avail, 0); } /* @@ -1316,40 +1270,10 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, vhost_dequeue_offload(hdr, m); out: - vq->batch_copy_nb_elems = copy_nb; return error; } -static __rte_always_inline void -update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq, - uint32_t used_idx, uint32_t desc_idx) -{ - vq->used->ring[used_idx].id = desc_idx; - vq->used->ring[used_idx].len = 0; - vhost_log_cache_used_vring(dev, vq, - offsetof(struct vring_used, ring[used_idx]), - sizeof(vq->used->ring[used_idx])); -} - -static __rte_always_inline void -update_used_idx(struct virtio_net *dev, struct vhost_virtqueue *vq, - uint32_t count) -{ - if (unlikely(count == 0)) - return; - - rte_smp_wmb(); - rte_smp_rmb(); - - vhost_log_cache_sync(dev, vq); - - vq->used->idx += count; - vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx), - sizeof(vq->used->idx)); - vhost_vring_call(dev, vq); -} - static __rte_always_inline struct zcopy_mbuf * get_zmbuf(struct vhost_virtqueue *vq) { @@ -1409,66 +1333,137 @@ restore_mbuf(struct rte_mbuf *m) } } -uint16_t -rte_vhost_dequeue_burst(int vid, uint16_t queue_id, +static __rte_always_inline uint16_t +virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) { - struct virtio_net *dev; - struct rte_mbuf *rarp_mbuf = NULL; - struct vhost_virtqueue *vq; - uint32_t desc_indexes[MAX_PKT_BURST]; - uint32_t used_idx; - uint32_t i = 0; + uint16_t i; uint16_t free_entries; - uint16_t avail_idx; - dev = get_device(vid); - if (!dev) - return 0; + if (unlikely(dev->dequeue_zero_copy)) { + struct zcopy_mbuf *zmbuf, *next; - if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) { - RTE_LOG(ERR, VHOST_DATA, - "(%d) %s: built-in vhost net backend is disabled.\n", - dev->vid, __func__); - return 0; - } + for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); + zmbuf != NULL; zmbuf = next) { + next = TAILQ_NEXT(zmbuf, next); - if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) { - RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n", - dev->vid, __func__, queue_id); - return 0; + if (mbuf_is_consumed(zmbuf->mbuf)) { + update_shadow_used_ring_split(vq, + zmbuf->desc_idx, 0); + TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); + restore_mbuf(zmbuf->mbuf); + rte_pktmbuf_free(zmbuf->mbuf); + put_zmbuf(zmbuf); + vq->nr_zmbuf -= 1; + } + } + + flush_shadow_used_ring_split(dev, vq); + vhost_vring_call_split(dev, vq); } - vq = dev->virtqueue[queue_id]; + rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]); - if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0)) + free_entries = *((volatile uint16_t *)&vq->avail->idx) - + vq->last_avail_idx; + if (free_entries == 0) return 0; - if (unlikely(vq->enabled == 0)) - 
goto out_access_unlock; + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); - vq->batch_copy_nb_elems = 0; + count = RTE_MIN(count, MAX_PKT_BURST); + count = RTE_MIN(count, free_entries); + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n", + dev->vid, count); - if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) - vhost_user_iotlb_rd_lock(vq); + for (i = 0; i < count; i++) { + struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint16_t head_idx, dummy_len; + uint16_t nr_vec = 0; + int err; - if (unlikely(vq->access_ok == 0)) - if (unlikely(vring_translate(dev, vq) < 0)) - goto out; + if (unlikely(fill_vec_buf_split(dev, vq, + vq->last_avail_idx + i, + &nr_vec, buf_vec, + &head_idx, &dummy_len, + VHOST_ACCESS_RO) < 0)) + break; + + if (likely(dev->dequeue_zero_copy == 0)) + update_shadow_used_ring_split(vq, head_idx, 0); + + rte_prefetch0((void *)(uintptr_t)buf_vec[0].buf_addr); + + pkts[i] = rte_pktmbuf_alloc(mbuf_pool); + if (unlikely(pkts[i] == NULL)) { + RTE_LOG(ERR, VHOST_DATA, + "Failed to allocate memory for mbuf.\n"); + break; + } + + err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i], + mbuf_pool); + if (unlikely(err)) { + rte_pktmbuf_free(pkts[i]); + break; + } + + if (unlikely(dev->dequeue_zero_copy)) { + struct zcopy_mbuf *zmbuf; + + zmbuf = get_zmbuf(vq); + if (!zmbuf) { + rte_pktmbuf_free(pkts[i]); + break; + } + zmbuf->mbuf = pkts[i]; + zmbuf->desc_idx = head_idx; + + /* + * Pin lock the mbuf; we will check later to see + * whether the mbuf is freed (when we are the last + * user) or not. If that's the case, we then could + * update the used ring safely. + */ + rte_mbuf_refcnt_update(pkts[i], 1); + + vq->nr_zmbuf += 1; + TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next); + } + } + vq->last_avail_idx += i; + + if (likely(dev->dequeue_zero_copy == 0)) { + do_data_copy_dequeue(vq); + if (unlikely(i < count)) + vq->shadow_used_idx = i; + flush_shadow_used_ring_split(dev, vq); + vhost_vring_call_split(dev, vq); + } + + return i; +} + +static __rte_always_inline uint16_t +virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) +{ + uint16_t i; + + rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]); if (unlikely(dev->dequeue_zero_copy)) { struct zcopy_mbuf *zmbuf, *next; - int nr_updated = 0; for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); zmbuf != NULL; zmbuf = next) { next = TAILQ_NEXT(zmbuf, next); if (mbuf_is_consumed(zmbuf->mbuf)) { - used_idx = vq->last_used_idx++ & (vq->size - 1); - update_used_ring(dev, vq, used_idx, - zmbuf->desc_idx); - nr_updated += 1; + update_shadow_used_ring_packed(vq, + zmbuf->desc_idx, + 0, + zmbuf->desc_count); TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); restore_mbuf(zmbuf->mbuf); @@ -1478,122 +1473,46 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id, } } - update_used_idx(dev, vq, nr_updated); + flush_shadow_used_ring_packed(dev, vq); + vhost_vring_call_packed(dev, vq); } - /* - * Construct a RARP broadcast packet, and inject it to the "pkts" - * array, to looks like that guest actually send such packet. - * - * Check user_send_rarp() for more information. - * - * broadcast_rarp shares a cacheline in the virtio_net structure - * with some fields that are accessed during enqueue and - * rte_atomic16_cmpset() causes a write if using cmpxchg. This could - * result in false sharing between enqueue and dequeue. 
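Both ring layouts funnel completions through a shadow used ring: entries are staged locally by the update_shadow_used_ring_*() helpers seen throughout, then written back to the guest-visible ring in one batch by the flush helpers. Those helpers are defined outside this section; a hypothetical sketch of the split-ring variant (the shadow_used_split field name is an assumption):

static __rte_always_inline void
update_shadow_used_ring_split(struct vhost_virtqueue *vq,
		uint16_t desc_idx, uint32_t len)
{
	uint16_t i = vq->shadow_used_idx++;

	/*
	 * Stage the completion; flush_shadow_used_ring_split() later
	 * copies the whole batch into vq->used->ring and bumps
	 * used->idx once.
	 */
	vq->shadow_used_split[i].id  = desc_idx;
	vq->shadow_used_split[i].len = len;
}

Staging also explains the vq->shadow_used_idx -= num_buffers rollback on the error paths above: entries staged for a packet that could not be copied are simply discarded before the flush.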
- * - * Prevent unnecessary false sharing by reading broadcast_rarp first - * and only performing cmpset if the read indicates it is likely to - * be set. - */ - - if (unlikely(rte_atomic16_read(&dev->broadcast_rarp) && - rte_atomic16_cmpset((volatile uint16_t *) - &dev->broadcast_rarp.cnt, 1, 0))) { - - rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac); - if (rarp_mbuf == NULL) { - RTE_LOG(ERR, VHOST_DATA, - "Failed to make RARP packet.\n"); - return 0; - } - count -= 1; - } - - free_entries = *((volatile uint16_t *)&vq->avail->idx) - - vq->last_avail_idx; - if (free_entries == 0) - goto out; - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); - /* Prefetch available and used ring */ - avail_idx = vq->last_avail_idx & (vq->size - 1); - used_idx = vq->last_used_idx & (vq->size - 1); - rte_prefetch0(&vq->avail->ring[avail_idx]); - rte_prefetch0(&vq->used->ring[used_idx]); - count = RTE_MIN(count, MAX_PKT_BURST); - count = RTE_MIN(count, free_entries); VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n", dev->vid, count); - /* Retrieve all of the head indexes first to avoid caching issues. */ for (i = 0; i < count; i++) { - avail_idx = (vq->last_avail_idx + i) & (vq->size - 1); - used_idx = (vq->last_used_idx + i) & (vq->size - 1); - desc_indexes[i] = vq->avail->ring[avail_idx]; - - if (likely(dev->dequeue_zero_copy == 0)) - update_used_ring(dev, vq, used_idx, desc_indexes[i]); - } - - /* Prefetch descriptor index. */ - rte_prefetch0(&vq->desc[desc_indexes[0]]); - for (i = 0; i < count; i++) { - struct vring_desc *desc, *idesc = NULL; - uint16_t sz, idx; - uint64_t dlen; + struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint16_t buf_id, dummy_len; + uint16_t desc_count, nr_vec = 0; int err; - if (likely(i + 1 < count)) - rte_prefetch0(&vq->desc[desc_indexes[i + 1]]); - - if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) { - dlen = vq->desc[desc_indexes[i]].len; - desc = (struct vring_desc *)(uintptr_t) - vhost_iova_to_vva(dev, vq, - vq->desc[desc_indexes[i]].addr, - &dlen, - VHOST_ACCESS_RO); - if (unlikely(!desc)) - break; + if (unlikely(fill_vec_buf_packed(dev, vq, + vq->last_avail_idx, &desc_count, + buf_vec, &nr_vec, + &buf_id, &dummy_len, + VHOST_ACCESS_RW) < 0)) + break; - if (unlikely(dlen < vq->desc[desc_indexes[i]].len)) { - /* - * The indirect desc table is not contiguous - * in process VA space, we have to copy it. 
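alloc_copy_ind_table() and free_ind_table(), used when an indirect descriptor table is not contiguous in the host's virtual address space, are shared helpers not shown in this section. A sketch consistent with how they are called here, assuming rte_malloc()/rte_free() and the vhost_iova_to_vva() translation used elsewhere in this file:

static inline void *
alloc_copy_ind_table(struct virtio_net *dev, struct vhost_virtqueue *vq,
		uint64_t desc_addr, uint64_t desc_len)
{
	void *idesc;
	uint64_t src, dst;
	uint64_t len, remain = desc_len;

	idesc = rte_malloc(__func__, desc_len, 0);
	if (unlikely(!idesc))
		return NULL;

	dst = (uint64_t)(uintptr_t)idesc;

	/*
	 * Translate and copy the table chunk by chunk until the whole
	 * range is contiguous in the local allocation.
	 */
	while (remain) {
		len = remain;
		src = vhost_iova_to_vva(dev, vq, desc_addr, &len,
				VHOST_ACCESS_RO);
		if (unlikely(!src || !len)) {
			rte_free(idesc);
			return NULL;
		}

		rte_memcpy((void *)(uintptr_t)dst, (void *)(uintptr_t)src,
				len);

		remain -= len;
		dst += len;
		desc_addr += len;
	}

	return idesc;
}

static inline void
free_ind_table(void *idesc)
{
	rte_free(idesc);
}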
- */ - idesc = alloc_copy_ind_table(dev, vq, - &vq->desc[desc_indexes[i]]); - if (unlikely(!idesc)) - break; - - desc = idesc; - } + if (likely(dev->dequeue_zero_copy == 0)) + update_shadow_used_ring_packed(vq, buf_id, 0, + desc_count); - rte_prefetch0(desc); - sz = vq->desc[desc_indexes[i]].len / sizeof(*desc); - idx = 0; - } else { - desc = vq->desc; - sz = vq->size; - idx = desc_indexes[i]; - } + rte_prefetch0((void *)(uintptr_t)buf_vec[0].buf_addr); pkts[i] = rte_pktmbuf_alloc(mbuf_pool); if (unlikely(pkts[i] == NULL)) { RTE_LOG(ERR, VHOST_DATA, "Failed to allocate memory for mbuf.\n"); - free_ind_table(idesc); break; } - err = copy_desc_to_mbuf(dev, vq, desc, sz, pkts[i], idx, - mbuf_pool); + err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i], + mbuf_pool); if (unlikely(err)) { rte_pktmbuf_free(pkts[i]); - free_ind_table(idesc); break; } @@ -1603,11 +1522,11 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id, zmbuf = get_zmbuf(vq); if (!zmbuf) { rte_pktmbuf_free(pkts[i]); - free_ind_table(idesc); break; } zmbuf->mbuf = pkts[i]; - zmbuf->desc_idx = desc_indexes[i]; + zmbuf->desc_idx = buf_id; + zmbuf->desc_count = desc_count; /* * Pin lock the mbuf; we will check later to see @@ -1621,17 +1540,102 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id, TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next); } - if (unlikely(!!idesc)) - free_ind_table(idesc); + vq->last_avail_idx += desc_count; + if (vq->last_avail_idx >= vq->size) { + vq->last_avail_idx -= vq->size; + vq->avail_wrap_counter ^= 1; + } } - vq->last_avail_idx += i; if (likely(dev->dequeue_zero_copy == 0)) { do_data_copy_dequeue(vq); - vq->last_used_idx += i; - update_used_idx(dev, vq, i); + if (unlikely(i < count)) + vq->shadow_used_idx = i; + flush_shadow_used_ring_packed(dev, vq); + vhost_vring_call_packed(dev, vq); + } + + return i; +} + +uint16_t +rte_vhost_dequeue_burst(int vid, uint16_t queue_id, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) +{ + struct virtio_net *dev; + struct rte_mbuf *rarp_mbuf = NULL; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (!dev) + return 0; + + if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) { + RTE_LOG(ERR, VHOST_DATA, + "(%d) %s: built-in vhost net backend is disabled.\n", + dev->vid, __func__); + return 0; + } + + if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) { + RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n", + dev->vid, __func__, queue_id); + return 0; + } + + vq = dev->virtqueue[queue_id]; + + if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0)) + return 0; + + if (unlikely(vq->enabled == 0)) { + count = 0; + goto out_access_unlock; } + if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) + vhost_user_iotlb_rd_lock(vq); + + if (unlikely(vq->access_ok == 0)) + if (unlikely(vring_translate(dev, vq) < 0)) { + count = 0; + goto out; + } + + /* + * Construct a RARP broadcast packet, and inject it to the "pkts" + * array, to looks like that guest actually send such packet. + * + * Check user_send_rarp() for more information. + * + * broadcast_rarp shares a cacheline in the virtio_net structure + * with some fields that are accessed during enqueue and + * rte_atomic16_cmpset() causes a write if using cmpxchg. This could + * result in false sharing between enqueue and dequeue. + * + * Prevent unnecessary false sharing by reading broadcast_rarp first + * and only performing cmpset if the read indicates it is likely to + * be set. 
+ */ + if (unlikely(rte_atomic16_read(&dev->broadcast_rarp) && + rte_atomic16_cmpset((volatile uint16_t *) + &dev->broadcast_rarp.cnt, 1, 0))) { + + rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac); + if (rarp_mbuf == NULL) { + RTE_LOG(ERR, VHOST_DATA, + "Failed to make RARP packet.\n"); + count = 0; + goto out; + } + count -= 1; + } + + if (vq_is_packed(dev)) + count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count); + else + count = virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count); + out: if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) vhost_user_iotlb_rd_unlock(vq); @@ -1644,10 +1648,10 @@ out_access_unlock: * Inject it to the head of "pkts" array, so that switch's mac * learning table will get updated first. */ - memmove(&pkts[1], pkts, i * sizeof(struct rte_mbuf *)); + memmove(&pkts[1], pkts, count * sizeof(struct rte_mbuf *)); pkts[0] = rarp_mbuf; - i += 1; + count += 1; } - return i; + return count; } diff --git a/lib/meson.build b/lib/meson.build index 9d11571f..eb91f100 100644 --- a/lib/meson.build +++ b/lib/meson.build @@ -9,7 +9,8 @@ # given as a dep, no need to mention ring. This is especially true for the # core libs which are widely reused, so their deps are kept to a minimum. libraries = [ 'compat', # just a header, used for versioning - 'eal', 'ring', 'mempool', 'mbuf', 'net', 'kvargs', 'ethdev', 'pci', # core + 'kvargs', + 'eal', 'ring', 'mempool', 'mbuf', 'net', 'ethdev', 'pci', # core 'metrics', # bitrate/latency stats depends on this 'hash', # efd depends on this 'timer', # eventdev depends on this @@ -25,6 +26,10 @@ libraries = [ 'compat', # just a header, used for versioning # flow_classify lib depends on pkt framework table lib 'flow_classify', 'bpf'] +default_cflags = machine_args +if cc.has_argument('-Wno-format-truncation') + default_cflags += '-Wno-format-truncation' +endif foreach l:libraries build = true name = l @@ -33,7 +38,7 @@ foreach l:libraries sources = [] headers = [] includes = [] - cflags = machine_args + cflags = default_cflags objs = [] # other object files to link against, used e.g. for # instruction-set optimized versions of code @@ -41,9 +46,12 @@ foreach l:libraries # external package/library requirements ext_deps = [] deps = ['eal'] # eal is standard dependency except for itself - if l == 'eal' + if l == 'kvargs' deps = [] endif + if l == 'eal' + deps = ['kvargs'] + endif dir_name = 'librte_' + l subdir(dir_name) @@ -63,6 +71,10 @@ foreach l:libraries shared_deps = ext_deps static_deps = ext_deps foreach d:deps + if not is_variable('shared_rte_' + d) + error('Missing dependency ' + d + + ' for library ' + lib_name) + endif shared_deps += [get_variable('shared_rte_' + d)] static_deps += [get_variable('static_rte_' + d)] endforeach |
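Stepping back from the individual hunks: after this refactoring the public burst API is unchanged, with rte_vhost_enqueue_burst() and rte_vhost_dequeue_burst() dispatching internally on vq_is_packed(), so existing applications work with both ring layouts. A minimal loopback sketch of a caller (queue numbering follows the usual convention that virtqueue 0 is the guest's receive queue and virtqueue 1 its transmit queue; the function name and burst size are illustrative):

#include <rte_mbuf.h>
#include <rte_vhost.h>

#define BURST_SZ 32

static void
loopback_burst(int vid, struct rte_mempool *mbuf_pool)
{
	struct rte_mbuf *pkts[BURST_SZ];
	uint16_t nb_rx, nb_tx, i;

	/* Drain the guest's transmit virtqueue (1). */
	nb_rx = rte_vhost_dequeue_burst(vid, 1, mbuf_pool, pkts, BURST_SZ);
	if (nb_rx == 0)
		return;

	/* Feed the packets back into the guest's receive virtqueue (0). */
	nb_tx = rte_vhost_enqueue_burst(vid, 0, pkts, nb_rx);

	/* Packets the ring could not absorb belong to the caller. */
	for (i = nb_tx; i < nb_rx; i++)
		rte_pktmbuf_free(pkts[i]);
}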