From b63264c8342e6a1b6971c79550d2af2024b6a4de Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Tue, 14 Aug 2018 18:52:30 +0100 Subject: New upstream version 18.08 Change-Id: I32fdf5e5016556d9c0a6d88ddaf1fc468961790a Signed-off-by: Luca Boccassi --- lib/librte_bpf/Makefile | 41 + lib/librte_bpf/bpf.c | 61 + lib/librte_bpf/bpf_def.h | 143 +++ lib/librte_bpf/bpf_exec.c | 453 ++++++++ lib/librte_bpf/bpf_impl.h | 55 + lib/librte_bpf/bpf_jit_x86.c | 1356 ++++++++++++++++++++++ lib/librte_bpf/bpf_load.c | 148 +++ lib/librte_bpf/bpf_load_elf.c | 322 ++++++ lib/librte_bpf/bpf_pkt.c | 605 ++++++++++ lib/librte_bpf/bpf_validate.c | 2248 ++++++++++++++++++++++++++++++++++++ lib/librte_bpf/meson.build | 25 + lib/librte_bpf/rte_bpf.h | 203 ++++ lib/librte_bpf/rte_bpf_ethdev.h | 117 ++ lib/librte_bpf/rte_bpf_version.map | 16 + 14 files changed, 5793 insertions(+) create mode 100644 lib/librte_bpf/Makefile create mode 100644 lib/librte_bpf/bpf.c create mode 100644 lib/librte_bpf/bpf_def.h create mode 100644 lib/librte_bpf/bpf_exec.c create mode 100644 lib/librte_bpf/bpf_impl.h create mode 100644 lib/librte_bpf/bpf_jit_x86.c create mode 100644 lib/librte_bpf/bpf_load.c create mode 100644 lib/librte_bpf/bpf_load_elf.c create mode 100644 lib/librte_bpf/bpf_pkt.c create mode 100644 lib/librte_bpf/bpf_validate.c create mode 100644 lib/librte_bpf/meson.build create mode 100644 lib/librte_bpf/rte_bpf.h create mode 100644 lib/librte_bpf/rte_bpf_ethdev.h create mode 100644 lib/librte_bpf/rte_bpf_version.map (limited to 'lib/librte_bpf') diff --git a/lib/librte_bpf/Makefile b/lib/librte_bpf/Makefile new file mode 100644 index 00000000..c0e8aaa6 --- /dev/null +++ b/lib/librte_bpf/Makefile @@ -0,0 +1,41 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2018 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_bpf.a + +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) +CFLAGS += -DALLOW_EXPERIMENTAL_API +LDLIBS += -lrte_net -lrte_eal +LDLIBS += 
-lrte_mempool -lrte_ring +LDLIBS += -lrte_mbuf -lrte_ethdev +ifeq ($(CONFIG_RTE_LIBRTE_BPF_ELF),y) +LDLIBS += -lelf +endif + +EXPORT_MAP := rte_bpf_version.map + +LIBABIVER := 1 + +# all source are stored in SRCS-y +SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf.c +SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf_exec.c +SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf_load.c +SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf_pkt.c +SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf_validate.c +ifeq ($(CONFIG_RTE_LIBRTE_BPF_ELF),y) +SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf_load_elf.c +endif +ifeq ($(CONFIG_RTE_ARCH_X86_64),y) +SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf_jit_x86.c +endif + +# install header files +SYMLINK-$(CONFIG_RTE_LIBRTE_BPF)-include += bpf_def.h +SYMLINK-$(CONFIG_RTE_LIBRTE_BPF)-include += rte_bpf.h +SYMLINK-$(CONFIG_RTE_LIBRTE_BPF)-include += rte_bpf_ethdev.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/lib/librte_bpf/bpf.c b/lib/librte_bpf/bpf.c new file mode 100644 index 00000000..f590c8c3 --- /dev/null +++ b/lib/librte_bpf/bpf.c @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "bpf_impl.h" + +int rte_bpf_logtype; + +__rte_experimental void +rte_bpf_destroy(struct rte_bpf *bpf) +{ + if (bpf != NULL) { + if (bpf->jit.func != NULL) + munmap(bpf->jit.func, bpf->jit.sz); + munmap(bpf, bpf->sz); + } +} + +__rte_experimental int +rte_bpf_get_jit(const struct rte_bpf *bpf, struct rte_bpf_jit *jit) +{ + if (bpf == NULL || jit == NULL) + return -EINVAL; + + jit[0] = bpf->jit; + return 0; +} + +int +bpf_jit(struct rte_bpf *bpf) +{ + int32_t rc; + +#ifdef RTE_ARCH_X86_64 + rc = bpf_jit_x86(bpf); +#else + rc = -ENOTSUP; +#endif + + if (rc != 0) + RTE_BPF_LOG(WARNING, "%s(%p) failed, error code: %d;\n", + __func__, bpf, rc); + return rc; +} + +RTE_INIT(rte_bpf_init_log) +{ + rte_bpf_logtype = rte_log_register("lib.bpf"); + if (rte_bpf_logtype >= 0) + 
rte_log_set_level(rte_bpf_logtype, RTE_LOG_INFO); +} diff --git a/lib/librte_bpf/bpf_def.h b/lib/librte_bpf/bpf_def.h new file mode 100644 index 00000000..c10f3aec --- /dev/null +++ b/lib/librte_bpf/bpf_def.h @@ -0,0 +1,143 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 1982, 1986, 1990, 1993 + * The Regents of the University of California. + * Copyright(c) 2018 Intel Corporation. + */ + +#ifndef _RTE_BPF_DEF_H_ +#define _RTE_BPF_DEF_H_ + +/** + * @file + * + * classic BPF (cBPF) and extended BPF (eBPF) related defines. + * For more information regarding cBPF and eBPF ISA and their differences, + * please refer to: + * https://www.kernel.org/doc/Documentation/networking/filter.txt. + * As a rule of thumb for that file: + * all definitions used by both cBPF and eBPF start with bpf(BPF)_ prefix, + * while eBPF only ones start with ebpf(EBPF)_ prefix. + */ + +#include + + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The instruction encodings. + */ + +/* Instruction classes */ +#define BPF_CLASS(code) ((code) & 0x07) +#define BPF_LD 0x00 +#define BPF_LDX 0x01 +#define BPF_ST 0x02 +#define BPF_STX 0x03 +#define BPF_ALU 0x04 +#define BPF_JMP 0x05 +#define BPF_RET 0x06 +#define BPF_MISC 0x07 + +#define EBPF_ALU64 0x07 + +/* ld/ldx fields */ +#define BPF_SIZE(code) ((code) & 0x18) +#define BPF_W 0x00 +#define BPF_H 0x08 +#define BPF_B 0x10 +#define EBPF_DW 0x18 + +#define BPF_MODE(code) ((code) & 0xe0) +#define BPF_IMM 0x00 +#define BPF_ABS 0x20 +#define BPF_IND 0x40 +#define BPF_MEM 0x60 +#define BPF_LEN 0x80 +#define BPF_MSH 0xa0 + +#define EBPF_XADD 0xc0 + +/* alu/jmp fields */ +#define BPF_OP(code) ((code) & 0xf0) +#define BPF_ADD 0x00 +#define BPF_SUB 0x10 +#define BPF_MUL 0x20 +#define BPF_DIV 0x30 +#define BPF_OR 0x40 +#define BPF_AND 0x50 +#define BPF_LSH 0x60 +#define BPF_RSH 0x70 +#define BPF_NEG 0x80 +#define BPF_MOD 0x90 +#define BPF_XOR 0xa0 + +#define EBPF_MOV 0xb0 +#define EBPF_ARSH 0xc0 +#define EBPF_END 0xd0 + +#define BPF_JA 0x00 
+#define BPF_JEQ 0x10 +#define BPF_JGT 0x20 +#define BPF_JGE 0x30 +#define BPF_JSET 0x40 + +#define EBPF_JNE 0x50 +#define EBPF_JSGT 0x60 +#define EBPF_JSGE 0x70 +#define EBPF_CALL 0x80 +#define EBPF_EXIT 0x90 +#define EBPF_JLT 0xa0 +#define EBPF_JLE 0xb0 +#define EBPF_JSLT 0xc0 +#define EBPF_JSLE 0xd0 + +#define BPF_SRC(code) ((code) & 0x08) +#define BPF_K 0x00 +#define BPF_X 0x08 + +/* if BPF_OP(code) == EBPF_END */ +#define EBPF_TO_LE 0x00 /* convert to little-endian */ +#define EBPF_TO_BE 0x08 /* convert to big-endian */ + +/* + * eBPF registers + */ +enum { + EBPF_REG_0, /* return value from internal function/for eBPF program */ + EBPF_REG_1, /* 0-th argument to internal function */ + EBPF_REG_2, /* 1-th argument to internal function */ + EBPF_REG_3, /* 2-th argument to internal function */ + EBPF_REG_4, /* 3-th argument to internal function */ + EBPF_REG_5, /* 4-th argument to internal function */ + EBPF_REG_6, /* callee saved register */ + EBPF_REG_7, /* callee saved register */ + EBPF_REG_8, /* callee saved register */ + EBPF_REG_9, /* callee saved register */ + EBPF_REG_10, /* stack pointer (read-only) */ + EBPF_REG_NUM, +}; + +/* + * eBPF instruction format + */ +struct ebpf_insn { + uint8_t code; + uint8_t dst_reg:4; + uint8_t src_reg:4; + int16_t off; + int32_t imm; +}; + +/* + * eBPF allows functions with R1-R5 as arguments. 
+ */ +#define EBPF_FUNC_MAX_ARGS (EBPF_REG_6 - EBPF_REG_1) + +#ifdef __cplusplus +} +#endif + +#endif /* RTE_BPF_DEF_H_ */ diff --git a/lib/librte_bpf/bpf_exec.c b/lib/librte_bpf/bpf_exec.c new file mode 100644 index 00000000..6a79139c --- /dev/null +++ b/lib/librte_bpf/bpf_exec.c @@ -0,0 +1,453 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "bpf_impl.h" + +#define BPF_JMP_UNC(ins) ((ins) += (ins)->off) + +#define BPF_JMP_CND_REG(reg, ins, op, type) \ + ((ins) += \ + ((type)(reg)[(ins)->dst_reg] op (type)(reg)[(ins)->src_reg]) ? \ + (ins)->off : 0) + +#define BPF_JMP_CND_IMM(reg, ins, op, type) \ + ((ins) += \ + ((type)(reg)[(ins)->dst_reg] op (type)(ins)->imm) ? \ + (ins)->off : 0) + +#define BPF_NEG_ALU(reg, ins, type) \ + ((reg)[(ins)->dst_reg] = (type)(-(reg)[(ins)->dst_reg])) + +#define EBPF_MOV_ALU_REG(reg, ins, type) \ + ((reg)[(ins)->dst_reg] = (type)(reg)[(ins)->src_reg]) + +#define BPF_OP_ALU_REG(reg, ins, op, type) \ + ((reg)[(ins)->dst_reg] = \ + (type)(reg)[(ins)->dst_reg] op (type)(reg)[(ins)->src_reg]) + +#define EBPF_MOV_ALU_IMM(reg, ins, type) \ + ((reg)[(ins)->dst_reg] = (type)(ins)->imm) + +#define BPF_OP_ALU_IMM(reg, ins, op, type) \ + ((reg)[(ins)->dst_reg] = \ + (type)(reg)[(ins)->dst_reg] op (type)(ins)->imm) + +#define BPF_DIV_ZERO_CHECK(bpf, reg, ins, type) do { \ + if ((type)(reg)[(ins)->src_reg] == 0) { \ + RTE_BPF_LOG(ERR, \ + "%s(%p): division by 0 at pc: %#zx;\n", \ + __func__, bpf, \ + (uintptr_t)(ins) - (uintptr_t)(bpf)->prm.ins); \ + return 0; \ + } \ +} while (0) + +#define BPF_LD_REG(reg, ins, type) \ + ((reg)[(ins)->dst_reg] = \ + *(type *)(uintptr_t)((reg)[(ins)->src_reg] + (ins)->off)) + +#define BPF_ST_IMM(reg, ins, type) \ + (*(type *)(uintptr_t)((reg)[(ins)->dst_reg] + (ins)->off) = \ + (type)(ins)->imm) + +#define BPF_ST_REG(reg, ins, type) \ + 
(*(type *)(uintptr_t)((reg)[(ins)->dst_reg] + (ins)->off) = \ + (type)(reg)[(ins)->src_reg]) + +#define BPF_ST_XADD_REG(reg, ins, tp) \ + (rte_atomic##tp##_add((rte_atomic##tp##_t *) \ + (uintptr_t)((reg)[(ins)->dst_reg] + (ins)->off), \ + reg[ins->src_reg])) + +static inline void +bpf_alu_be(uint64_t reg[EBPF_REG_NUM], const struct ebpf_insn *ins) +{ + uint64_t *v; + + v = reg + ins->dst_reg; + switch (ins->imm) { + case 16: + *v = rte_cpu_to_be_16(*v); + break; + case 32: + *v = rte_cpu_to_be_32(*v); + break; + case 64: + *v = rte_cpu_to_be_64(*v); + break; + } +} + +static inline void +bpf_alu_le(uint64_t reg[EBPF_REG_NUM], const struct ebpf_insn *ins) +{ + uint64_t *v; + + v = reg + ins->dst_reg; + switch (ins->imm) { + case 16: + *v = rte_cpu_to_le_16(*v); + break; + case 32: + *v = rte_cpu_to_le_32(*v); + break; + case 64: + *v = rte_cpu_to_le_64(*v); + break; + } +} + +static inline uint64_t +bpf_exec(const struct rte_bpf *bpf, uint64_t reg[EBPF_REG_NUM]) +{ + const struct ebpf_insn *ins; + + for (ins = bpf->prm.ins; ; ins++) { + switch (ins->code) { + /* 32 bit ALU IMM operations */ + case (BPF_ALU | BPF_ADD | BPF_K): + BPF_OP_ALU_IMM(reg, ins, +, uint32_t); + break; + case (BPF_ALU | BPF_SUB | BPF_K): + BPF_OP_ALU_IMM(reg, ins, -, uint32_t); + break; + case (BPF_ALU | BPF_AND | BPF_K): + BPF_OP_ALU_IMM(reg, ins, &, uint32_t); + break; + case (BPF_ALU | BPF_OR | BPF_K): + BPF_OP_ALU_IMM(reg, ins, |, uint32_t); + break; + case (BPF_ALU | BPF_LSH | BPF_K): + BPF_OP_ALU_IMM(reg, ins, <<, uint32_t); + break; + case (BPF_ALU | BPF_RSH | BPF_K): + BPF_OP_ALU_IMM(reg, ins, >>, uint32_t); + break; + case (BPF_ALU | BPF_XOR | BPF_K): + BPF_OP_ALU_IMM(reg, ins, ^, uint32_t); + break; + case (BPF_ALU | BPF_MUL | BPF_K): + BPF_OP_ALU_IMM(reg, ins, *, uint32_t); + break; + case (BPF_ALU | BPF_DIV | BPF_K): + BPF_OP_ALU_IMM(reg, ins, /, uint32_t); + break; + case (BPF_ALU | BPF_MOD | BPF_K): + BPF_OP_ALU_IMM(reg, ins, %, uint32_t); + break; + case (BPF_ALU | EBPF_MOV | 
BPF_K): + EBPF_MOV_ALU_IMM(reg, ins, uint32_t); + break; + /* 32 bit ALU REG operations */ + case (BPF_ALU | BPF_ADD | BPF_X): + BPF_OP_ALU_REG(reg, ins, +, uint32_t); + break; + case (BPF_ALU | BPF_SUB | BPF_X): + BPF_OP_ALU_REG(reg, ins, -, uint32_t); + break; + case (BPF_ALU | BPF_AND | BPF_X): + BPF_OP_ALU_REG(reg, ins, &, uint32_t); + break; + case (BPF_ALU | BPF_OR | BPF_X): + BPF_OP_ALU_REG(reg, ins, |, uint32_t); + break; + case (BPF_ALU | BPF_LSH | BPF_X): + BPF_OP_ALU_REG(reg, ins, <<, uint32_t); + break; + case (BPF_ALU | BPF_RSH | BPF_X): + BPF_OP_ALU_REG(reg, ins, >>, uint32_t); + break; + case (BPF_ALU | BPF_XOR | BPF_X): + BPF_OP_ALU_REG(reg, ins, ^, uint32_t); + break; + case (BPF_ALU | BPF_MUL | BPF_X): + BPF_OP_ALU_REG(reg, ins, *, uint32_t); + break; + case (BPF_ALU | BPF_DIV | BPF_X): + BPF_DIV_ZERO_CHECK(bpf, reg, ins, uint32_t); + BPF_OP_ALU_REG(reg, ins, /, uint32_t); + break; + case (BPF_ALU | BPF_MOD | BPF_X): + BPF_DIV_ZERO_CHECK(bpf, reg, ins, uint32_t); + BPF_OP_ALU_REG(reg, ins, %, uint32_t); + break; + case (BPF_ALU | EBPF_MOV | BPF_X): + EBPF_MOV_ALU_REG(reg, ins, uint32_t); + break; + case (BPF_ALU | BPF_NEG): + BPF_NEG_ALU(reg, ins, uint32_t); + break; + case (BPF_ALU | EBPF_END | EBPF_TO_BE): + bpf_alu_be(reg, ins); + break; + case (BPF_ALU | EBPF_END | EBPF_TO_LE): + bpf_alu_le(reg, ins); + break; + /* 64 bit ALU IMM operations */ + case (EBPF_ALU64 | BPF_ADD | BPF_K): + BPF_OP_ALU_IMM(reg, ins, +, uint64_t); + break; + case (EBPF_ALU64 | BPF_SUB | BPF_K): + BPF_OP_ALU_IMM(reg, ins, -, uint64_t); + break; + case (EBPF_ALU64 | BPF_AND | BPF_K): + BPF_OP_ALU_IMM(reg, ins, &, uint64_t); + break; + case (EBPF_ALU64 | BPF_OR | BPF_K): + BPF_OP_ALU_IMM(reg, ins, |, uint64_t); + break; + case (EBPF_ALU64 | BPF_LSH | BPF_K): + BPF_OP_ALU_IMM(reg, ins, <<, uint64_t); + break; + case (EBPF_ALU64 | BPF_RSH | BPF_K): + BPF_OP_ALU_IMM(reg, ins, >>, uint64_t); + break; + case (EBPF_ALU64 | EBPF_ARSH | BPF_K): + BPF_OP_ALU_IMM(reg, ins, >>, 
int64_t); + break; + case (EBPF_ALU64 | BPF_XOR | BPF_K): + BPF_OP_ALU_IMM(reg, ins, ^, uint64_t); + break; + case (EBPF_ALU64 | BPF_MUL | BPF_K): + BPF_OP_ALU_IMM(reg, ins, *, uint64_t); + break; + case (EBPF_ALU64 | BPF_DIV | BPF_K): + BPF_OP_ALU_IMM(reg, ins, /, uint64_t); + break; + case (EBPF_ALU64 | BPF_MOD | BPF_K): + BPF_OP_ALU_IMM(reg, ins, %, uint64_t); + break; + case (EBPF_ALU64 | EBPF_MOV | BPF_K): + EBPF_MOV_ALU_IMM(reg, ins, uint64_t); + break; + /* 64 bit ALU REG operations */ + case (EBPF_ALU64 | BPF_ADD | BPF_X): + BPF_OP_ALU_REG(reg, ins, +, uint64_t); + break; + case (EBPF_ALU64 | BPF_SUB | BPF_X): + BPF_OP_ALU_REG(reg, ins, -, uint64_t); + break; + case (EBPF_ALU64 | BPF_AND | BPF_X): + BPF_OP_ALU_REG(reg, ins, &, uint64_t); + break; + case (EBPF_ALU64 | BPF_OR | BPF_X): + BPF_OP_ALU_REG(reg, ins, |, uint64_t); + break; + case (EBPF_ALU64 | BPF_LSH | BPF_X): + BPF_OP_ALU_REG(reg, ins, <<, uint64_t); + break; + case (EBPF_ALU64 | BPF_RSH | BPF_X): + BPF_OP_ALU_REG(reg, ins, >>, uint64_t); + break; + case (EBPF_ALU64 | EBPF_ARSH | BPF_X): + BPF_OP_ALU_REG(reg, ins, >>, int64_t); + break; + case (EBPF_ALU64 | BPF_XOR | BPF_X): + BPF_OP_ALU_REG(reg, ins, ^, uint64_t); + break; + case (EBPF_ALU64 | BPF_MUL | BPF_X): + BPF_OP_ALU_REG(reg, ins, *, uint64_t); + break; + case (EBPF_ALU64 | BPF_DIV | BPF_X): + BPF_DIV_ZERO_CHECK(bpf, reg, ins, uint64_t); + BPF_OP_ALU_REG(reg, ins, /, uint64_t); + break; + case (EBPF_ALU64 | BPF_MOD | BPF_X): + BPF_DIV_ZERO_CHECK(bpf, reg, ins, uint64_t); + BPF_OP_ALU_REG(reg, ins, %, uint64_t); + break; + case (EBPF_ALU64 | EBPF_MOV | BPF_X): + EBPF_MOV_ALU_REG(reg, ins, uint64_t); + break; + case (EBPF_ALU64 | BPF_NEG): + BPF_NEG_ALU(reg, ins, uint64_t); + break; + /* load instructions */ + case (BPF_LDX | BPF_MEM | BPF_B): + BPF_LD_REG(reg, ins, uint8_t); + break; + case (BPF_LDX | BPF_MEM | BPF_H): + BPF_LD_REG(reg, ins, uint16_t); + break; + case (BPF_LDX | BPF_MEM | BPF_W): + BPF_LD_REG(reg, ins, uint32_t); + break; 
+ case (BPF_LDX | BPF_MEM | EBPF_DW): + BPF_LD_REG(reg, ins, uint64_t); + break; + /* load 64 bit immediate value */ + case (BPF_LD | BPF_IMM | EBPF_DW): + reg[ins->dst_reg] = (uint32_t)ins[0].imm | + (uint64_t)(uint32_t)ins[1].imm << 32; + ins++; + break; + /* store instructions */ + case (BPF_STX | BPF_MEM | BPF_B): + BPF_ST_REG(reg, ins, uint8_t); + break; + case (BPF_STX | BPF_MEM | BPF_H): + BPF_ST_REG(reg, ins, uint16_t); + break; + case (BPF_STX | BPF_MEM | BPF_W): + BPF_ST_REG(reg, ins, uint32_t); + break; + case (BPF_STX | BPF_MEM | EBPF_DW): + BPF_ST_REG(reg, ins, uint64_t); + break; + case (BPF_ST | BPF_MEM | BPF_B): + BPF_ST_IMM(reg, ins, uint8_t); + break; + case (BPF_ST | BPF_MEM | BPF_H): + BPF_ST_IMM(reg, ins, uint16_t); + break; + case (BPF_ST | BPF_MEM | BPF_W): + BPF_ST_IMM(reg, ins, uint32_t); + break; + case (BPF_ST | BPF_MEM | EBPF_DW): + BPF_ST_IMM(reg, ins, uint64_t); + break; + /* atomic add instructions */ + case (BPF_STX | EBPF_XADD | BPF_W): + BPF_ST_XADD_REG(reg, ins, 32); + break; + case (BPF_STX | EBPF_XADD | EBPF_DW): + BPF_ST_XADD_REG(reg, ins, 64); + break; + /* jump instructions */ + case (BPF_JMP | BPF_JA): + BPF_JMP_UNC(ins); + break; + /* jump IMM instructions */ + case (BPF_JMP | BPF_JEQ | BPF_K): + BPF_JMP_CND_IMM(reg, ins, ==, uint64_t); + break; + case (BPF_JMP | EBPF_JNE | BPF_K): + BPF_JMP_CND_IMM(reg, ins, !=, uint64_t); + break; + case (BPF_JMP | BPF_JGT | BPF_K): + BPF_JMP_CND_IMM(reg, ins, >, uint64_t); + break; + case (BPF_JMP | EBPF_JLT | BPF_K): + BPF_JMP_CND_IMM(reg, ins, <, uint64_t); + break; + case (BPF_JMP | BPF_JGE | BPF_K): + BPF_JMP_CND_IMM(reg, ins, >=, uint64_t); + break; + case (BPF_JMP | EBPF_JLE | BPF_K): + BPF_JMP_CND_IMM(reg, ins, <=, uint64_t); + break; + case (BPF_JMP | EBPF_JSGT | BPF_K): + BPF_JMP_CND_IMM(reg, ins, >, int64_t); + break; + case (BPF_JMP | EBPF_JSLT | BPF_K): + BPF_JMP_CND_IMM(reg, ins, <, int64_t); + break; + case (BPF_JMP | EBPF_JSGE | BPF_K): + BPF_JMP_CND_IMM(reg, ins, >=, 
int64_t); + break; + case (BPF_JMP | EBPF_JSLE | BPF_K): + BPF_JMP_CND_IMM(reg, ins, <=, int64_t); + break; + case (BPF_JMP | BPF_JSET | BPF_K): + BPF_JMP_CND_IMM(reg, ins, &, uint64_t); + break; + /* jump REG instructions */ + case (BPF_JMP | BPF_JEQ | BPF_X): + BPF_JMP_CND_REG(reg, ins, ==, uint64_t); + break; + case (BPF_JMP | EBPF_JNE | BPF_X): + BPF_JMP_CND_REG(reg, ins, !=, uint64_t); + break; + case (BPF_JMP | BPF_JGT | BPF_X): + BPF_JMP_CND_REG(reg, ins, >, uint64_t); + break; + case (BPF_JMP | EBPF_JLT | BPF_X): + BPF_JMP_CND_REG(reg, ins, <, uint64_t); + break; + case (BPF_JMP | BPF_JGE | BPF_X): + BPF_JMP_CND_REG(reg, ins, >=, uint64_t); + break; + case (BPF_JMP | EBPF_JLE | BPF_X): + BPF_JMP_CND_REG(reg, ins, <=, uint64_t); + break; + case (BPF_JMP | EBPF_JSGT | BPF_X): + BPF_JMP_CND_REG(reg, ins, >, int64_t); + break; + case (BPF_JMP | EBPF_JSLT | BPF_X): + BPF_JMP_CND_REG(reg, ins, <, int64_t); + break; + case (BPF_JMP | EBPF_JSGE | BPF_X): + BPF_JMP_CND_REG(reg, ins, >=, int64_t); + break; + case (BPF_JMP | EBPF_JSLE | BPF_X): + BPF_JMP_CND_REG(reg, ins, <=, int64_t); + break; + case (BPF_JMP | BPF_JSET | BPF_X): + BPF_JMP_CND_REG(reg, ins, &, uint64_t); + break; + /* call instructions */ + case (BPF_JMP | EBPF_CALL): + reg[EBPF_REG_0] = bpf->prm.xsym[ins->imm].func.val( + reg[EBPF_REG_1], reg[EBPF_REG_2], + reg[EBPF_REG_3], reg[EBPF_REG_4], + reg[EBPF_REG_5]); + break; + /* return instruction */ + case (BPF_JMP | EBPF_EXIT): + return reg[EBPF_REG_0]; + default: + RTE_BPF_LOG(ERR, + "%s(%p): invalid opcode %#x at pc: %#zx;\n", + __func__, bpf, ins->code, + (uintptr_t)ins - (uintptr_t)bpf->prm.ins); + return 0; + } + } + + /* should never be reached */ + RTE_VERIFY(0); + return 0; +} + +__rte_experimental uint32_t +rte_bpf_exec_burst(const struct rte_bpf *bpf, void *ctx[], uint64_t rc[], + uint32_t num) +{ + uint32_t i; + uint64_t reg[EBPF_REG_NUM]; + uint64_t stack[MAX_BPF_STACK_SIZE / sizeof(uint64_t)]; + + for (i = 0; i != num; i++) { + + 
reg[EBPF_REG_1] = (uintptr_t)ctx[i]; + reg[EBPF_REG_10] = (uintptr_t)(stack + RTE_DIM(stack)); + + rc[i] = bpf_exec(bpf, reg); + } + + return i; +} + +__rte_experimental uint64_t +rte_bpf_exec(const struct rte_bpf *bpf, void *ctx) +{ + uint64_t rc; + + rte_bpf_exec_burst(bpf, &ctx, &rc, 1); + return rc; +} diff --git a/lib/librte_bpf/bpf_impl.h b/lib/librte_bpf/bpf_impl.h new file mode 100644 index 00000000..b577e2cb --- /dev/null +++ b/lib/librte_bpf/bpf_impl.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#ifndef _BPF_H_ +#define _BPF_H_ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_BPF_STACK_SIZE 0x200 + +struct rte_bpf { + struct rte_bpf_prm prm; + struct rte_bpf_jit jit; + size_t sz; + uint32_t stack_sz; +}; + +extern int bpf_validate(struct rte_bpf *bpf); + +extern int bpf_jit(struct rte_bpf *bpf); + +#ifdef RTE_ARCH_X86_64 +extern int bpf_jit_x86(struct rte_bpf *); +#endif + +extern int rte_bpf_logtype; + +#define RTE_BPF_LOG(lvl, fmt, args...) 
\ + rte_log(RTE_LOG_## lvl, rte_bpf_logtype, fmt, ##args) + +static inline size_t +bpf_size(uint32_t bpf_op_sz) +{ + if (bpf_op_sz == BPF_B) + return sizeof(uint8_t); + else if (bpf_op_sz == BPF_H) + return sizeof(uint16_t); + else if (bpf_op_sz == BPF_W) + return sizeof(uint32_t); + else if (bpf_op_sz == EBPF_DW) + return sizeof(uint64_t); + return 0; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _BPF_H_ */ diff --git a/lib/librte_bpf/bpf_jit_x86.c b/lib/librte_bpf/bpf_jit_x86.c new file mode 100644 index 00000000..68ea389f --- /dev/null +++ b/lib/librte_bpf/bpf_jit_x86.c @@ -0,0 +1,1356 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "bpf_impl.h" + +#define GET_BPF_OP(op) (BPF_OP(op) >> 4) + +enum { + RAX = 0, /* scratch, return value */ + RCX = 1, /* scratch, 4th arg */ + RDX = 2, /* scratch, 3rd arg */ + RBX = 3, /* callee saved */ + RSP = 4, /* stack pointer */ + RBP = 5, /* frame pointer, callee saved */ + RSI = 6, /* scratch, 2nd arg */ + RDI = 7, /* scratch, 1st arg */ + R8 = 8, /* scratch, 5th arg */ + R9 = 9, /* scratch, 6th arg */ + R10 = 10, /* scratch */ + R11 = 11, /* scratch */ + R12 = 12, /* callee saved */ + R13 = 13, /* callee saved */ + R14 = 14, /* callee saved */ + R15 = 15, /* callee saved */ +}; + +#define IS_EXT_REG(r) ((r) >= R8) + +enum { + REX_PREFIX = 0x40, /* fixed value 0100 */ + REX_W = 0x8, /* 64bit operand size */ + REX_R = 0x4, /* extension of the ModRM.reg field */ + REX_X = 0x2, /* extension of the SIB.index field */ + REX_B = 0x1, /* extension of the ModRM.rm field */ +}; + +enum { + MOD_INDIRECT = 0, + MOD_IDISP8 = 1, + MOD_IDISP32 = 2, + MOD_DIRECT = 3, +}; + +enum { + SIB_SCALE_1 = 0, + SIB_SCALE_2 = 1, + SIB_SCALE_4 = 2, + SIB_SCALE_8 = 3, +}; + +/* + * eBPF to x86_64 register mappings. 
+ */ +static const uint32_t ebpf2x86[] = { + [EBPF_REG_0] = RAX, + [EBPF_REG_1] = RDI, + [EBPF_REG_2] = RSI, + [EBPF_REG_3] = RDX, + [EBPF_REG_4] = RCX, + [EBPF_REG_5] = R8, + [EBPF_REG_6] = RBX, + [EBPF_REG_7] = R13, + [EBPF_REG_8] = R14, + [EBPF_REG_9] = R15, + [EBPF_REG_10] = RBP, +}; + +/* + * r10 and r11 are used as a scratch temporary registers. + */ +enum { + REG_DIV_IMM = R9, + REG_TMP0 = R11, + REG_TMP1 = R10, +}; + +/* + * callee saved registers list. + * keep RBP as the last one. + */ +static const uint32_t save_regs[] = {RBX, R12, R13, R14, R15, RBP}; + +struct bpf_jit_state { + uint32_t idx; + size_t sz; + struct { + uint32_t num; + int32_t off; + } exit; + uint32_t reguse; + int32_t *off; + uint8_t *ins; +}; + +#define INUSE(v, r) (((v) >> (r)) & 1) +#define USED(v, r) ((v) |= 1 << (r)) + +union bpf_jit_imm { + uint32_t u32; + uint8_t u8[4]; +}; + +/* + * In many cases for imm8 we can produce shorter code. + */ +static size_t +imm_size(int32_t v) +{ + if (v == (int8_t)v) + return sizeof(int8_t); + return sizeof(int32_t); +} + +static void +emit_bytes(struct bpf_jit_state *st, const uint8_t ins[], uint32_t sz) +{ + uint32_t i; + + if (st->ins != NULL) { + for (i = 0; i != sz; i++) + st->ins[st->sz + i] = ins[i]; + } + st->sz += sz; +} + +static void +emit_imm(struct bpf_jit_state *st, const uint32_t imm, uint32_t sz) +{ + union bpf_jit_imm v; + + v.u32 = imm; + emit_bytes(st, v.u8, sz); +} + +/* + * emit REX byte + */ +static void +emit_rex(struct bpf_jit_state *st, uint32_t op, uint32_t reg, uint32_t rm) +{ + uint8_t rex; + + /* mark operand registers as used*/ + USED(st->reguse, reg); + USED(st->reguse, rm); + + rex = 0; + if (BPF_CLASS(op) == EBPF_ALU64 || + op == (BPF_ST | BPF_MEM | EBPF_DW) || + op == (BPF_STX | BPF_MEM | EBPF_DW) || + op == (BPF_STX | EBPF_XADD | EBPF_DW) || + op == (BPF_LD | BPF_IMM | EBPF_DW) || + (BPF_CLASS(op) == BPF_LDX && + BPF_MODE(op) == BPF_MEM && + BPF_SIZE(op) != BPF_W)) + rex |= REX_W; + + if (IS_EXT_REG(reg)) + rex 
|= REX_R; + + if (IS_EXT_REG(rm)) + rex |= REX_B; + + /* store using SIL, DIL */ + if (op == (BPF_STX | BPF_MEM | BPF_B) && (reg == RDI || reg == RSI)) + rex |= REX_PREFIX; + + if (rex != 0) { + rex |= REX_PREFIX; + emit_bytes(st, &rex, sizeof(rex)); + } +} + +/* + * emit MODRegRM byte + */ +static void +emit_modregrm(struct bpf_jit_state *st, uint32_t mod, uint32_t reg, uint32_t rm) +{ + uint8_t v; + + v = mod << 6 | (reg & 7) << 3 | (rm & 7); + emit_bytes(st, &v, sizeof(v)); +} + +/* + * emit SIB byte + */ +static void +emit_sib(struct bpf_jit_state *st, uint32_t scale, uint32_t idx, uint32_t base) +{ + uint8_t v; + + v = scale << 6 | (idx & 7) << 3 | (base & 7); + emit_bytes(st, &v, sizeof(v)); +} + +/* + * emit xchg %, % + */ +static void +emit_xchg_reg(struct bpf_jit_state *st, uint32_t sreg, uint32_t dreg) +{ + const uint8_t ops = 0x87; + + emit_rex(st, EBPF_ALU64, sreg, dreg); + emit_bytes(st, &ops, sizeof(ops)); + emit_modregrm(st, MOD_DIRECT, sreg, dreg); +} + +/* + * emit neg % + */ +static void +emit_neg(struct bpf_jit_state *st, uint32_t op, uint32_t dreg) +{ + const uint8_t ops = 0xF7; + const uint8_t mods = 3; + + emit_rex(st, op, 0, dreg); + emit_bytes(st, &ops, sizeof(ops)); + emit_modregrm(st, MOD_DIRECT, mods, dreg); +} + +/* + * emit mov %, % + */ +static void +emit_mov_reg(struct bpf_jit_state *st, uint32_t op, uint32_t sreg, + uint32_t dreg) +{ + const uint8_t ops = 0x89; + + /* if operands are 32-bit, then it can be used to clear upper 32-bit */ + if (sreg != dreg || BPF_CLASS(op) == BPF_ALU) { + emit_rex(st, op, sreg, dreg); + emit_bytes(st, &ops, sizeof(ops)); + emit_modregrm(st, MOD_DIRECT, sreg, dreg); + } +} + +/* + * emit movzwl %, % + */ +static void +emit_movzwl(struct bpf_jit_state *st, uint32_t sreg, uint32_t dreg) +{ + static const uint8_t ops[] = {0x0F, 0xB7}; + + emit_rex(st, BPF_ALU, sreg, dreg); + emit_bytes(st, ops, sizeof(ops)); + emit_modregrm(st, MOD_DIRECT, sreg, dreg); +} + +/* + * emit ror , % + */ +static void 
+emit_ror_imm(struct bpf_jit_state *st, uint32_t dreg, uint32_t imm) +{ + const uint8_t prfx = 0x66; + const uint8_t ops = 0xC1; + const uint8_t mods = 1; + + emit_bytes(st, &prfx, sizeof(prfx)); + emit_rex(st, BPF_ALU, 0, dreg); + emit_bytes(st, &ops, sizeof(ops)); + emit_modregrm(st, MOD_DIRECT, mods, dreg); + emit_imm(st, imm, imm_size(imm)); +} + +/* + * emit bswap % + */ +static void +emit_be2le_48(struct bpf_jit_state *st, uint32_t dreg, uint32_t imm) +{ + uint32_t rop; + + const uint8_t ops = 0x0F; + const uint8_t mods = 1; + + rop = (imm == 64) ? EBPF_ALU64 : BPF_ALU; + emit_rex(st, rop, 0, dreg); + emit_bytes(st, &ops, sizeof(ops)); + emit_modregrm(st, MOD_DIRECT, mods, dreg); +} + +static void +emit_be2le(struct bpf_jit_state *st, uint32_t dreg, uint32_t imm) +{ + if (imm == 16) { + emit_ror_imm(st, dreg, 8); + emit_movzwl(st, dreg, dreg); + } else + emit_be2le_48(st, dreg, imm); +} + +/* + * In general it is NOP for x86. + * Just clear the upper bits. + */ +static void +emit_le2be(struct bpf_jit_state *st, uint32_t dreg, uint32_t imm) +{ + if (imm == 16) + emit_movzwl(st, dreg, dreg); + else if (imm == 32) + emit_mov_reg(st, BPF_ALU | EBPF_MOV | BPF_X, dreg, dreg); +} + +/* + * emit one of: + * add , % + * and , % + * or , % + * sub , % + * xor , % + */ +static void +emit_alu_imm(struct bpf_jit_state *st, uint32_t op, uint32_t dreg, uint32_t imm) +{ + uint8_t mod, opcode; + uint32_t bop, imsz; + + const uint8_t op8 = 0x83; + const uint8_t op32 = 0x81; + static const uint8_t mods[] = { + [GET_BPF_OP(BPF_ADD)] = 0, + [GET_BPF_OP(BPF_AND)] = 4, + [GET_BPF_OP(BPF_OR)] = 1, + [GET_BPF_OP(BPF_SUB)] = 5, + [GET_BPF_OP(BPF_XOR)] = 6, + }; + + bop = GET_BPF_OP(op); + mod = mods[bop]; + + imsz = imm_size(imm); + opcode = (imsz == 1) ? 
op8 : op32; + + emit_rex(st, op, 0, dreg); + emit_bytes(st, &opcode, sizeof(opcode)); + emit_modregrm(st, MOD_DIRECT, mod, dreg); + emit_imm(st, imm, imsz); +} + +/* + * emit one of: + * add %, % + * and %, % + * or %, % + * sub %, % + * xor %, % + */ +static void +emit_alu_reg(struct bpf_jit_state *st, uint32_t op, uint32_t sreg, + uint32_t dreg) +{ + uint32_t bop; + + static const uint8_t ops[] = { + [GET_BPF_OP(BPF_ADD)] = 0x01, + [GET_BPF_OP(BPF_AND)] = 0x21, + [GET_BPF_OP(BPF_OR)] = 0x09, + [GET_BPF_OP(BPF_SUB)] = 0x29, + [GET_BPF_OP(BPF_XOR)] = 0x31, + }; + + bop = GET_BPF_OP(op); + + emit_rex(st, op, sreg, dreg); + emit_bytes(st, &ops[bop], sizeof(ops[bop])); + emit_modregrm(st, MOD_DIRECT, sreg, dreg); +} + +static void +emit_shift(struct bpf_jit_state *st, uint32_t op, uint32_t dreg) +{ + uint8_t mod; + uint32_t bop, opx; + + static const uint8_t ops[] = {0xC1, 0xD3}; + static const uint8_t mods[] = { + [GET_BPF_OP(BPF_LSH)] = 4, + [GET_BPF_OP(BPF_RSH)] = 5, + [GET_BPF_OP(EBPF_ARSH)] = 7, + }; + + bop = GET_BPF_OP(op); + mod = mods[bop]; + opx = (BPF_SRC(op) == BPF_X); + + emit_rex(st, op, 0, dreg); + emit_bytes(st, &ops[opx], sizeof(ops[opx])); + emit_modregrm(st, MOD_DIRECT, mod, dreg); +} + +/* + * emit one of: + * shl , % + * shr , % + * sar , % + */ +static void +emit_shift_imm(struct bpf_jit_state *st, uint32_t op, uint32_t dreg, + uint32_t imm) +{ + emit_shift(st, op, dreg); + emit_imm(st, imm, imm_size(imm)); +} + +/* + * emit one of: + * shl % + * shr % + * sar % + * note that rcx is implicitly used as a source register, so few extra + * instructions for register spillage might be necessary. + */ +static void +emit_shift_reg(struct bpf_jit_state *st, uint32_t op, uint32_t sreg, + uint32_t dreg) +{ + if (sreg != RCX) + emit_xchg_reg(st, RCX, sreg); + + emit_shift(st, op, (dreg == RCX) ? 
sreg : dreg); + + if (sreg != RCX) + emit_xchg_reg(st, RCX, sreg); +} + +/* + * emit mov , % + */ +static void +emit_mov_imm(struct bpf_jit_state *st, uint32_t op, uint32_t dreg, uint32_t imm) +{ + const uint8_t ops = 0xC7; + + if (imm == 0) { + /* replace 'mov 0, %' with 'xor %, %' */ + op = BPF_CLASS(op) | BPF_XOR | BPF_X; + emit_alu_reg(st, op, dreg, dreg); + return; + } + + emit_rex(st, op, 0, dreg); + emit_bytes(st, &ops, sizeof(ops)); + emit_modregrm(st, MOD_DIRECT, 0, dreg); + emit_imm(st, imm, sizeof(imm)); +} + +/* + * emit mov <imm64>, %<dreg> + * imm0 and imm1 are the low and high 32 bits of the immediate value. + * The instruction is encoded as [REX] (0xB8 + low 3 bits of dreg) followed + * by the immediate: the register index is part of the opcode byte, + * so no ModRM byte is emitted. + * When imm1 == 0 the 32-bit operand form is used, as a write to a 32-bit + * register clears the upper 32 bits, while 'mov imm32' with REX.W + * (see emit_mov_imm) would sign-extend imm0 into them. + */ +static void +emit_ld_imm64(struct bpf_jit_state *st, uint32_t dreg, uint32_t imm0, + uint32_t imm1) +{ + uint32_t op; + uint8_t ops; + + /* low 3 bits of dreg go into the opcode, the 4th one into REX.B */ + ops = 0xB8 | (dreg & 7); + + /* request REX.W (64-bit immediate) only if upper half is non-zero */ + op = (imm1 == 0) ? BPF_ALU : EBPF_ALU64; + + emit_rex(st, op, 0, dreg); + emit_bytes(st, &ops, sizeof(ops)); + + emit_imm(st, imm0, sizeof(imm0)); + if (imm1 != 0) + emit_imm(st, imm1, sizeof(imm1)); +} + +/* + * note that rax:rdx are implicitly used as source/destination registers, + * so some reg spillage is necessary. + * emit: + * mov %rax, %r11 + * mov %rdx, %r10 + * mov %, %rax + * either: + * mov %, %rdx + * OR + * mov , %rdx + * mul %rdx + * mov %r10, %rdx + * mov %rax, % + * mov %r11, %rax + */ +static void +emit_mul(struct bpf_jit_state *st, uint32_t op, uint32_t sreg, uint32_t dreg, + uint32_t imm) +{ + const uint8_t ops = 0xF7; + const uint8_t mods = 4; + + /* save rax & rdx */ + emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, RAX, REG_TMP0); + emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, RDX, REG_TMP1); + + /* rax = dreg */ + emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, dreg, RAX); + + if (BPF_SRC(op) == BPF_X) + /* rdx = sreg */ + emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, + sreg == RAX ? 
REG_TMP0 : sreg, RDX); + else + /* rdx = imm */ + emit_mov_imm(st, EBPF_ALU64 | EBPF_MOV | BPF_K, RDX, imm); + + emit_rex(st, op, RAX, RDX); + emit_bytes(st, &ops, sizeof(ops)); + emit_modregrm(st, MOD_DIRECT, mods, RDX); + + if (dreg != RDX) + /* restore rdx */ + emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, REG_TMP1, RDX); + + if (dreg != RAX) { + /* dreg = rax */ + emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, RAX, dreg); + /* restore rax */ + emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, REG_TMP0, RAX); + } +} + +/* + * emit mov (%), % + * note that for non 64-bit ops, higher bits have to be cleared. + */ +static void +emit_ld_reg(struct bpf_jit_state *st, uint32_t op, uint32_t sreg, uint32_t dreg, + int32_t ofs) +{ + uint32_t mods, opsz; + const uint8_t op32 = 0x8B; + const uint8_t op16[] = {0x0F, 0xB7}; + const uint8_t op8[] = {0x0F, 0xB6}; + + emit_rex(st, op, dreg, sreg); + + opsz = BPF_SIZE(op); + if (opsz == BPF_B) + emit_bytes(st, op8, sizeof(op8)); + else if (opsz == BPF_H) + emit_bytes(st, op16, sizeof(op16)); + else + emit_bytes(st, &op32, sizeof(op32)); + + mods = (imm_size(ofs) == 1) ? MOD_IDISP8 : MOD_IDISP32; + + emit_modregrm(st, mods, dreg, sreg); + if (sreg == RSP || sreg == R12) + emit_sib(st, SIB_SCALE_1, sreg, sreg); + emit_imm(st, ofs, imm_size(ofs)); +} + +/* + * emit one of: + * mov %, (%) + * mov , (%) + */ +static void +emit_st_common(struct bpf_jit_state *st, uint32_t op, uint32_t sreg, + uint32_t dreg, uint32_t imm, int32_t ofs) +{ + uint32_t mods, imsz, opsz, opx; + const uint8_t prfx16 = 0x66; + + /* 8 bit instruction opcodes */ + static const uint8_t op8[] = {0xC6, 0x88}; + + /* 16/32/64 bit instruction opcodes */ + static const uint8_t ops[] = {0xC7, 0x89}; + + /* is the instruction has immediate value or src reg? 
*/ + opx = (BPF_CLASS(op) == BPF_STX); + + opsz = BPF_SIZE(op); + if (opsz == BPF_H) + emit_bytes(st, &prfx16, sizeof(prfx16)); + + emit_rex(st, op, sreg, dreg); + + if (opsz == BPF_B) + emit_bytes(st, &op8[opx], sizeof(op8[opx])); + else + emit_bytes(st, &ops[opx], sizeof(ops[opx])); + + imsz = imm_size(ofs); + mods = (imsz == 1) ? MOD_IDISP8 : MOD_IDISP32; + + emit_modregrm(st, mods, sreg, dreg); + + if (dreg == RSP || dreg == R12) + emit_sib(st, SIB_SCALE_1, dreg, dreg); + + emit_imm(st, ofs, imsz); + + if (opx == 0) { + imsz = RTE_MIN(bpf_size(opsz), sizeof(imm)); + emit_imm(st, imm, imsz); + } +} + +static void +emit_st_imm(struct bpf_jit_state *st, uint32_t op, uint32_t dreg, uint32_t imm, + int32_t ofs) +{ + emit_st_common(st, op, 0, dreg, imm, ofs); +} + +static void +emit_st_reg(struct bpf_jit_state *st, uint32_t op, uint32_t sreg, uint32_t dreg, + int32_t ofs) +{ + emit_st_common(st, op, sreg, dreg, 0, ofs); +} + +/* + * emit lock add %, (%) + */ +static void +emit_st_xadd(struct bpf_jit_state *st, uint32_t op, uint32_t sreg, + uint32_t dreg, int32_t ofs) +{ + uint32_t imsz, mods; + + const uint8_t lck = 0xF0; /* lock prefix */ + const uint8_t ops = 0x01; /* add opcode */ + + imsz = imm_size(ofs); + mods = (imsz == 1) ? MOD_IDISP8 : MOD_IDISP32; + + emit_bytes(st, &lck, sizeof(lck)); + emit_rex(st, op, sreg, dreg); + emit_bytes(st, &ops, sizeof(ops)); + emit_modregrm(st, mods, sreg, dreg); + emit_imm(st, ofs, imsz); +} + +/* + * emit: + * mov , (%rax) + * call *%rax + */ +static void +emit_call(struct bpf_jit_state *st, uintptr_t trg) +{ + const uint8_t ops = 0xFF; + const uint8_t mods = 2; + + emit_ld_imm64(st, RAX, trg, trg >> 32); + emit_bytes(st, &ops, sizeof(ops)); + emit_modregrm(st, MOD_DIRECT, mods, RAX); +} + +/* + * emit jmp + * where 'ofs' is the target offset for the native code. 
+ */ +static void +emit_abs_jmp(struct bpf_jit_state *st, int32_t ofs) +{ + int32_t joff; + uint32_t imsz; + + const uint8_t op8 = 0xEB; + const uint8_t op32 = 0xE9; + + const int32_t sz8 = sizeof(op8) + sizeof(uint8_t); + const int32_t sz32 = sizeof(op32) + sizeof(uint32_t); + + /* max possible jmp instruction size */ + const int32_t iszm = RTE_MAX(sz8, sz32); + + joff = ofs - st->sz; + imsz = RTE_MAX(imm_size(joff), imm_size(joff + iszm)); + + if (imsz == 1) { + emit_bytes(st, &op8, sizeof(op8)); + joff -= sz8; + } else { + emit_bytes(st, &op32, sizeof(op32)); + joff -= sz32; + } + + emit_imm(st, joff, imsz); +} + +/* + * emit jmp + * where 'ofs' is the target offset for the BPF bytecode. + */ +static void +emit_jmp(struct bpf_jit_state *st, int32_t ofs) +{ + emit_abs_jmp(st, st->off[st->idx + ofs]); +} + +/* + * emit one of: + * cmovz %, <%dreg> + * cmovne %, <%dreg> + * cmova %, <%dreg> + * cmovb %, <%dreg> + * cmovae %, <%dreg> + * cmovbe %, <%dreg> + * cmovg %, <%dreg> + * cmovl %, <%dreg> + * cmovge %, <%dreg> + * cmovle %, <%dreg> + */ +static void +emit_movcc_reg(struct bpf_jit_state *st, uint32_t op, uint32_t sreg, + uint32_t dreg) +{ + uint32_t bop; + + static const uint8_t ops[][2] = { + [GET_BPF_OP(BPF_JEQ)] = {0x0F, 0x44}, /* CMOVZ */ + [GET_BPF_OP(EBPF_JNE)] = {0x0F, 0x45}, /* CMOVNE */ + [GET_BPF_OP(BPF_JGT)] = {0x0F, 0x47}, /* CMOVA */ + [GET_BPF_OP(EBPF_JLT)] = {0x0F, 0x42}, /* CMOVB */ + [GET_BPF_OP(BPF_JGE)] = {0x0F, 0x43}, /* CMOVAE */ + [GET_BPF_OP(EBPF_JLE)] = {0x0F, 0x46}, /* CMOVBE */ + [GET_BPF_OP(EBPF_JSGT)] = {0x0F, 0x4F}, /* CMOVG */ + [GET_BPF_OP(EBPF_JSLT)] = {0x0F, 0x4C}, /* CMOVL */ + [GET_BPF_OP(EBPF_JSGE)] = {0x0F, 0x4D}, /* CMOVGE */ + [GET_BPF_OP(EBPF_JSLE)] = {0x0F, 0x4E}, /* CMOVLE */ + [GET_BPF_OP(BPF_JSET)] = {0x0F, 0x45}, /* CMOVNE */ + }; + + bop = GET_BPF_OP(op); + + emit_rex(st, op, dreg, sreg); + emit_bytes(st, ops[bop], sizeof(ops[bop])); + emit_modregrm(st, MOD_DIRECT, dreg, sreg); +} + +/* + * emit one of: + * je + * 
jne + * ja + * jb + * jae + * jbe + * jg + * jl + * jge + * jle + * where 'ofs' is the target offset for the native code. + */ +static void +emit_abs_jcc(struct bpf_jit_state *st, uint32_t op, int32_t ofs) +{ + uint32_t bop, imsz; + int32_t joff; + + static const uint8_t op8[] = { + [GET_BPF_OP(BPF_JEQ)] = 0x74, /* JE */ + [GET_BPF_OP(EBPF_JNE)] = 0x75, /* JNE */ + [GET_BPF_OP(BPF_JGT)] = 0x77, /* JA */ + [GET_BPF_OP(EBPF_JLT)] = 0x72, /* JB */ + [GET_BPF_OP(BPF_JGE)] = 0x73, /* JAE */ + [GET_BPF_OP(EBPF_JLE)] = 0x76, /* JBE */ + [GET_BPF_OP(EBPF_JSGT)] = 0x7F, /* JG */ + [GET_BPF_OP(EBPF_JSLT)] = 0x7C, /* JL */ + [GET_BPF_OP(EBPF_JSGE)] = 0x7D, /*JGE */ + [GET_BPF_OP(EBPF_JSLE)] = 0x7E, /* JLE */ + [GET_BPF_OP(BPF_JSET)] = 0x75, /*JNE */ + }; + + static const uint8_t op32[][2] = { + [GET_BPF_OP(BPF_JEQ)] = {0x0F, 0x84}, /* JE */ + [GET_BPF_OP(EBPF_JNE)] = {0x0F, 0x85}, /* JNE */ + [GET_BPF_OP(BPF_JGT)] = {0x0F, 0x87}, /* JA */ + [GET_BPF_OP(EBPF_JLT)] = {0x0F, 0x82}, /* JB */ + [GET_BPF_OP(BPF_JGE)] = {0x0F, 0x83}, /* JAE */ + [GET_BPF_OP(EBPF_JLE)] = {0x0F, 0x86}, /* JBE */ + [GET_BPF_OP(EBPF_JSGT)] = {0x0F, 0x8F}, /* JG */ + [GET_BPF_OP(EBPF_JSLT)] = {0x0F, 0x8C}, /* JL */ + [GET_BPF_OP(EBPF_JSGE)] = {0x0F, 0x8D}, /*JGE */ + [GET_BPF_OP(EBPF_JSLE)] = {0x0F, 0x8E}, /* JLE */ + [GET_BPF_OP(BPF_JSET)] = {0x0F, 0x85}, /*JNE */ + }; + + const int32_t sz8 = sizeof(op8[0]) + sizeof(uint8_t); + const int32_t sz32 = sizeof(op32[0]) + sizeof(uint32_t); + + /* max possible jcc instruction size */ + const int32_t iszm = RTE_MAX(sz8, sz32); + + joff = ofs - st->sz; + imsz = RTE_MAX(imm_size(joff), imm_size(joff + iszm)); + + bop = GET_BPF_OP(op); + + if (imsz == 1) { + emit_bytes(st, &op8[bop], sizeof(op8[bop])); + joff -= sz8; + } else { + emit_bytes(st, op32[bop], sizeof(op32[bop])); + joff -= sz32; + } + + emit_imm(st, joff, imsz); +} + +/* + * emit one of: + * je + * jne + * ja + * jb + * jae + * jbe + * jg + * jl + * jge + * jle + * where 'ofs' is the target offset for 
the BPF bytecode. + */ +static void +emit_jcc(struct bpf_jit_state *st, uint32_t op, int32_t ofs) +{ + emit_abs_jcc(st, op, st->off[st->idx + ofs]); +} + + +/* + * emit cmp , % + */ +static void +emit_cmp_imm(struct bpf_jit_state *st, uint32_t op, uint32_t dreg, uint32_t imm) +{ + uint8_t ops; + uint32_t imsz; + + const uint8_t op8 = 0x83; + const uint8_t op32 = 0x81; + const uint8_t mods = 7; + + imsz = imm_size(imm); + ops = (imsz == 1) ? op8 : op32; + + emit_rex(st, op, 0, dreg); + emit_bytes(st, &ops, sizeof(ops)); + emit_modregrm(st, MOD_DIRECT, mods, dreg); + emit_imm(st, imm, imsz); +} + +/* + * emit test , % + */ +static void +emit_tst_imm(struct bpf_jit_state *st, uint32_t op, uint32_t dreg, uint32_t imm) +{ + const uint8_t ops = 0xF7; + const uint8_t mods = 0; + + emit_rex(st, op, 0, dreg); + emit_bytes(st, &ops, sizeof(ops)); + emit_modregrm(st, MOD_DIRECT, mods, dreg); + emit_imm(st, imm, imm_size(imm)); +} + +static void +emit_jcc_imm(struct bpf_jit_state *st, uint32_t op, uint32_t dreg, + uint32_t imm, int32_t ofs) +{ + if (BPF_OP(op) == BPF_JSET) + emit_tst_imm(st, EBPF_ALU64, dreg, imm); + else + emit_cmp_imm(st, EBPF_ALU64, dreg, imm); + + emit_jcc(st, op, ofs); +} + +/* + * emit test %, % + */ +static void +emit_tst_reg(struct bpf_jit_state *st, uint32_t op, uint32_t sreg, + uint32_t dreg) +{ + const uint8_t ops = 0x85; + + emit_rex(st, op, sreg, dreg); + emit_bytes(st, &ops, sizeof(ops)); + emit_modregrm(st, MOD_DIRECT, sreg, dreg); +} + +/* + * emit cmp %, % + */ +static void +emit_cmp_reg(struct bpf_jit_state *st, uint32_t op, uint32_t sreg, + uint32_t dreg) +{ + const uint8_t ops = 0x39; + + emit_rex(st, op, sreg, dreg); + emit_bytes(st, &ops, sizeof(ops)); + emit_modregrm(st, MOD_DIRECT, sreg, dreg); + +} + +static void +emit_jcc_reg(struct bpf_jit_state *st, uint32_t op, uint32_t sreg, + uint32_t dreg, int32_t ofs) +{ + if (BPF_OP(op) == BPF_JSET) + emit_tst_reg(st, EBPF_ALU64, sreg, dreg); + else + emit_cmp_reg(st, EBPF_ALU64, sreg, dreg); 
+ + emit_jcc(st, op, ofs); +} + +/* + * note that rax:rdx are implicitly used as source/destination registers, + * so some reg spillage is necessary. + * emit: + * mov %rax, %r11 + * mov %rdx, %r10 + * mov %, %rax + * xor %rdx, %rdx + * for divisor as immediate value: + * mov , %r9 + * div % + * mov %r10, %rdx + * mov %rax, % + * mov %r11, %rax + * either: + * mov %rax, % + * OR + * mov %rdx, % + * mov %r11, %rax + * mov %r10, %rdx + */ +static void +emit_div(struct bpf_jit_state *st, uint32_t op, uint32_t sreg, uint32_t dreg, + uint32_t imm) +{ + uint32_t sr; + + const uint8_t ops = 0xF7; + const uint8_t mods = 6; + + if (BPF_SRC(op) == BPF_X) { + + /* check that src divisor is not zero */ + emit_tst_reg(st, BPF_CLASS(op), sreg, sreg); + + /* exit with return value zero */ + emit_movcc_reg(st, BPF_CLASS(op) | BPF_JEQ | BPF_X, sreg, RAX); + emit_abs_jcc(st, BPF_JMP | BPF_JEQ | BPF_K, st->exit.off); + } + + /* save rax & rdx */ + if (dreg != RAX) + emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, RAX, REG_TMP0); + if (dreg != RDX) + emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, RDX, REG_TMP1); + + /* fill rax & rdx */ + emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, dreg, RAX); + emit_mov_imm(st, EBPF_ALU64 | EBPF_MOV | BPF_K, RDX, 0); + + if (BPF_SRC(op) == BPF_X) { + sr = sreg; + if (sr == RAX) + sr = REG_TMP0; + else if (sr == RDX) + sr = REG_TMP1; + } else { + sr = REG_DIV_IMM; + emit_mov_imm(st, EBPF_ALU64 | EBPF_MOV | BPF_K, sr, imm); + } + + emit_rex(st, op, 0, sr); + emit_bytes(st, &ops, sizeof(ops)); + emit_modregrm(st, MOD_DIRECT, mods, sr); + + if (BPF_OP(op) == BPF_DIV) + emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, RAX, dreg); + else + emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, RDX, dreg); + + if (dreg != RAX) + emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, REG_TMP0, RAX); + if (dreg != RDX) + emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, REG_TMP1, RDX); +} + +static void +emit_prolog(struct bpf_jit_state *st, int32_t stack_size) +{ + 
uint32_t i; + int32_t spil, ofs; + + spil = 0; + for (i = 0; i != RTE_DIM(save_regs); i++) + spil += INUSE(st->reguse, save_regs[i]); + + /* we can avoid touching the stack at all */ + if (spil == 0) + return; + + + emit_alu_imm(st, EBPF_ALU64 | BPF_SUB | BPF_K, RSP, + spil * sizeof(uint64_t)); + + ofs = 0; + for (i = 0; i != RTE_DIM(save_regs); i++) { + if (INUSE(st->reguse, save_regs[i]) != 0) { + emit_st_reg(st, BPF_STX | BPF_MEM | EBPF_DW, + save_regs[i], RSP, ofs); + ofs += sizeof(uint64_t); + } + } + + if (INUSE(st->reguse, RBP) != 0) { + emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, RSP, RBP); + emit_alu_imm(st, EBPF_ALU64 | BPF_SUB | BPF_K, RSP, stack_size); + } +} + +/* + * emit ret + */ +static void +emit_ret(struct bpf_jit_state *st) +{ + const uint8_t ops = 0xC3; + + emit_bytes(st, &ops, sizeof(ops)); +} + +static void +emit_epilog(struct bpf_jit_state *st) +{ + uint32_t i; + int32_t spil, ofs; + + /* if we allready have an epilog generate a jump to it */ + if (st->exit.num++ != 0) { + emit_abs_jmp(st, st->exit.off); + return; + } + + /* store offset of epilog block */ + st->exit.off = st->sz; + + spil = 0; + for (i = 0; i != RTE_DIM(save_regs); i++) + spil += INUSE(st->reguse, save_regs[i]); + + if (spil != 0) { + + if (INUSE(st->reguse, RBP) != 0) + emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, + RBP, RSP); + + ofs = 0; + for (i = 0; i != RTE_DIM(save_regs); i++) { + if (INUSE(st->reguse, save_regs[i]) != 0) { + emit_ld_reg(st, BPF_LDX | BPF_MEM | EBPF_DW, + RSP, save_regs[i], ofs); + ofs += sizeof(uint64_t); + } + } + + emit_alu_imm(st, EBPF_ALU64 | BPF_ADD | BPF_K, RSP, + spil * sizeof(uint64_t)); + } + + emit_ret(st); +} + +/* + * walk through bpf code and translate them x86_64 one. 
+ */ +static int +emit(struct bpf_jit_state *st, const struct rte_bpf *bpf) +{ + uint32_t i, dr, op, sr; + const struct ebpf_insn *ins; + + /* reset state fields */ + st->sz = 0; + st->exit.num = 0; + + emit_prolog(st, bpf->stack_sz); + + for (i = 0; i != bpf->prm.nb_ins; i++) { + + st->idx = i; + st->off[i] = st->sz; + + ins = bpf->prm.ins + i; + + dr = ebpf2x86[ins->dst_reg]; + sr = ebpf2x86[ins->src_reg]; + op = ins->code; + + switch (op) { + /* 32 bit ALU IMM operations */ + case (BPF_ALU | BPF_ADD | BPF_K): + case (BPF_ALU | BPF_SUB | BPF_K): + case (BPF_ALU | BPF_AND | BPF_K): + case (BPF_ALU | BPF_OR | BPF_K): + case (BPF_ALU | BPF_XOR | BPF_K): + emit_alu_imm(st, op, dr, ins->imm); + break; + case (BPF_ALU | BPF_LSH | BPF_K): + case (BPF_ALU | BPF_RSH | BPF_K): + emit_shift_imm(st, op, dr, ins->imm); + break; + case (BPF_ALU | EBPF_MOV | BPF_K): + emit_mov_imm(st, op, dr, ins->imm); + break; + /* 32 bit ALU REG operations */ + case (BPF_ALU | BPF_ADD | BPF_X): + case (BPF_ALU | BPF_SUB | BPF_X): + case (BPF_ALU | BPF_AND | BPF_X): + case (BPF_ALU | BPF_OR | BPF_X): + case (BPF_ALU | BPF_XOR | BPF_X): + emit_alu_reg(st, op, sr, dr); + break; + case (BPF_ALU | BPF_LSH | BPF_X): + case (BPF_ALU | BPF_RSH | BPF_X): + emit_shift_reg(st, op, sr, dr); + break; + case (BPF_ALU | EBPF_MOV | BPF_X): + emit_mov_reg(st, op, sr, dr); + break; + case (BPF_ALU | BPF_NEG): + emit_neg(st, op, dr); + break; + case (BPF_ALU | EBPF_END | EBPF_TO_BE): + emit_be2le(st, dr, ins->imm); + break; + case (BPF_ALU | EBPF_END | EBPF_TO_LE): + emit_le2be(st, dr, ins->imm); + break; + /* 64 bit ALU IMM operations */ + case (EBPF_ALU64 | BPF_ADD | BPF_K): + case (EBPF_ALU64 | BPF_SUB | BPF_K): + case (EBPF_ALU64 | BPF_AND | BPF_K): + case (EBPF_ALU64 | BPF_OR | BPF_K): + case (EBPF_ALU64 | BPF_XOR | BPF_K): + emit_alu_imm(st, op, dr, ins->imm); + break; + case (EBPF_ALU64 | BPF_LSH | BPF_K): + case (EBPF_ALU64 | BPF_RSH | BPF_K): + case (EBPF_ALU64 | EBPF_ARSH | BPF_K): + 
emit_shift_imm(st, op, dr, ins->imm); + break; + case (EBPF_ALU64 | EBPF_MOV | BPF_K): + emit_mov_imm(st, op, dr, ins->imm); + break; + /* 64 bit ALU REG operations */ + case (EBPF_ALU64 | BPF_ADD | BPF_X): + case (EBPF_ALU64 | BPF_SUB | BPF_X): + case (EBPF_ALU64 | BPF_AND | BPF_X): + case (EBPF_ALU64 | BPF_OR | BPF_X): + case (EBPF_ALU64 | BPF_XOR | BPF_X): + emit_alu_reg(st, op, sr, dr); + break; + case (EBPF_ALU64 | BPF_LSH | BPF_X): + case (EBPF_ALU64 | BPF_RSH | BPF_X): + case (EBPF_ALU64 | EBPF_ARSH | BPF_X): + emit_shift_reg(st, op, sr, dr); + break; + case (EBPF_ALU64 | EBPF_MOV | BPF_X): + emit_mov_reg(st, op, sr, dr); + break; + case (EBPF_ALU64 | BPF_NEG): + emit_neg(st, op, dr); + break; + /* multiply instructions */ + case (BPF_ALU | BPF_MUL | BPF_K): + case (BPF_ALU | BPF_MUL | BPF_X): + case (EBPF_ALU64 | BPF_MUL | BPF_K): + case (EBPF_ALU64 | BPF_MUL | BPF_X): + emit_mul(st, op, sr, dr, ins->imm); + break; + /* divide instructions */ + case (BPF_ALU | BPF_DIV | BPF_K): + case (BPF_ALU | BPF_MOD | BPF_K): + case (BPF_ALU | BPF_DIV | BPF_X): + case (BPF_ALU | BPF_MOD | BPF_X): + case (EBPF_ALU64 | BPF_DIV | BPF_K): + case (EBPF_ALU64 | BPF_MOD | BPF_K): + case (EBPF_ALU64 | BPF_DIV | BPF_X): + case (EBPF_ALU64 | BPF_MOD | BPF_X): + emit_div(st, op, sr, dr, ins->imm); + break; + /* load instructions */ + case (BPF_LDX | BPF_MEM | BPF_B): + case (BPF_LDX | BPF_MEM | BPF_H): + case (BPF_LDX | BPF_MEM | BPF_W): + case (BPF_LDX | BPF_MEM | EBPF_DW): + emit_ld_reg(st, op, sr, dr, ins->off); + break; + /* load 64 bit immediate value */ + case (BPF_LD | BPF_IMM | EBPF_DW): + emit_ld_imm64(st, dr, ins[0].imm, ins[1].imm); + i++; + break; + /* store instructions */ + case (BPF_STX | BPF_MEM | BPF_B): + case (BPF_STX | BPF_MEM | BPF_H): + case (BPF_STX | BPF_MEM | BPF_W): + case (BPF_STX | BPF_MEM | EBPF_DW): + emit_st_reg(st, op, sr, dr, ins->off); + break; + case (BPF_ST | BPF_MEM | BPF_B): + case (BPF_ST | BPF_MEM | BPF_H): + case (BPF_ST | BPF_MEM | BPF_W): 
+ case (BPF_ST | BPF_MEM | EBPF_DW): + emit_st_imm(st, op, dr, ins->imm, ins->off); + break; + /* atomic add instructions */ + case (BPF_STX | EBPF_XADD | BPF_W): + case (BPF_STX | EBPF_XADD | EBPF_DW): + emit_st_xadd(st, op, sr, dr, ins->off); + break; + /* jump instructions */ + case (BPF_JMP | BPF_JA): + emit_jmp(st, ins->off + 1); + break; + /* jump IMM instructions */ + case (BPF_JMP | BPF_JEQ | BPF_K): + case (BPF_JMP | EBPF_JNE | BPF_K): + case (BPF_JMP | BPF_JGT | BPF_K): + case (BPF_JMP | EBPF_JLT | BPF_K): + case (BPF_JMP | BPF_JGE | BPF_K): + case (BPF_JMP | EBPF_JLE | BPF_K): + case (BPF_JMP | EBPF_JSGT | BPF_K): + case (BPF_JMP | EBPF_JSLT | BPF_K): + case (BPF_JMP | EBPF_JSGE | BPF_K): + case (BPF_JMP | EBPF_JSLE | BPF_K): + case (BPF_JMP | BPF_JSET | BPF_K): + emit_jcc_imm(st, op, dr, ins->imm, ins->off + 1); + break; + /* jump REG instructions */ + case (BPF_JMP | BPF_JEQ | BPF_X): + case (BPF_JMP | EBPF_JNE | BPF_X): + case (BPF_JMP | BPF_JGT | BPF_X): + case (BPF_JMP | EBPF_JLT | BPF_X): + case (BPF_JMP | BPF_JGE | BPF_X): + case (BPF_JMP | EBPF_JLE | BPF_X): + case (BPF_JMP | EBPF_JSGT | BPF_X): + case (BPF_JMP | EBPF_JSLT | BPF_X): + case (BPF_JMP | EBPF_JSGE | BPF_X): + case (BPF_JMP | EBPF_JSLE | BPF_X): + case (BPF_JMP | BPF_JSET | BPF_X): + emit_jcc_reg(st, op, sr, dr, ins->off + 1); + break; + /* call instructions */ + case (BPF_JMP | EBPF_CALL): + emit_call(st, + (uintptr_t)bpf->prm.xsym[ins->imm].func.val); + break; + /* return instruction */ + case (BPF_JMP | EBPF_EXIT): + emit_epilog(st); + break; + default: + RTE_BPF_LOG(ERR, + "%s(%p): invalid opcode %#x at pc: %u;\n", + __func__, bpf, ins->code, i); + return -EINVAL; + } + } + + return 0; +} + +/* + * produce a native ISA version of the given BPF code. 
+ */ +int +bpf_jit_x86(struct rte_bpf *bpf) +{ + int32_t rc; + uint32_t i; + size_t sz; + struct bpf_jit_state st; + + /* init state */ + memset(&st, 0, sizeof(st)); + st.off = malloc(bpf->prm.nb_ins * sizeof(st.off[0])); + if (st.off == NULL) + return -ENOMEM; + + /* fill with fake offsets */ + st.exit.off = INT32_MAX; + for (i = 0; i != bpf->prm.nb_ins; i++) + st.off[i] = INT32_MAX; + + /* + * dry runs, used to calculate total code size and valid jump offsets. + * stop when we get minimal possible size + */ + do { + sz = st.sz; + rc = emit(&st, bpf); + } while (rc == 0 && sz != st.sz); + + if (rc == 0) { + + /* allocate memory needed */ + st.ins = mmap(NULL, st.sz, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (st.ins == MAP_FAILED) + rc = -ENOMEM; + else + /* generate code */ + rc = emit(&st, bpf); + } + + if (rc == 0 && mprotect(st.ins, st.sz, PROT_READ | PROT_EXEC) != 0) + rc = -ENOMEM; + + if (rc != 0) + munmap(st.ins, st.sz); + else { + bpf->jit.func = (void *)st.ins; + bpf->jit.sz = st.sz; + } + + free(st.off); + return rc; +} diff --git a/lib/librte_bpf/bpf_load.c b/lib/librte_bpf/bpf_load.c new file mode 100644 index 00000000..2b84fe72 --- /dev/null +++ b/lib/librte_bpf/bpf_load.c @@ -0,0 +1,148 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "bpf_impl.h" + +static struct rte_bpf * +bpf_load(const struct rte_bpf_prm *prm) +{ + uint8_t *buf; + struct rte_bpf *bpf; + size_t sz, bsz, insz, xsz; + + xsz = prm->nb_xsym * sizeof(prm->xsym[0]); + insz = prm->nb_ins * sizeof(prm->ins[0]); + bsz = sizeof(bpf[0]); + sz = insz + xsz + bsz; + + buf = mmap(NULL, sz, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (buf == MAP_FAILED) + return NULL; + + bpf = (void *)buf; + bpf->sz = sz; + + memcpy(&bpf->prm, prm, sizeof(bpf->prm)); + 
+ memcpy(buf + bsz, prm->xsym, xsz); + memcpy(buf + bsz + xsz, prm->ins, insz); + + bpf->prm.xsym = (void *)(buf + bsz); + bpf->prm.ins = (void *)(buf + bsz + xsz); + + return bpf; +} + +/* + * Check that user provided external symbol. + */ +static int +bpf_check_xsym(const struct rte_bpf_xsym *xsym) +{ + uint32_t i; + + if (xsym->name == NULL) + return -EINVAL; + + if (xsym->type == RTE_BPF_XTYPE_VAR) { + if (xsym->var.desc.type == RTE_BPF_ARG_UNDEF) + return -EINVAL; + } else if (xsym->type == RTE_BPF_XTYPE_FUNC) { + + if (xsym->func.nb_args > EBPF_FUNC_MAX_ARGS) + return -EINVAL; + + /* check function arguments */ + for (i = 0; i != xsym->func.nb_args; i++) { + if (xsym->func.args[i].type == RTE_BPF_ARG_UNDEF) + return -EINVAL; + } + + /* check return value info */ + if (xsym->func.ret.type != RTE_BPF_ARG_UNDEF && + xsym->func.ret.size == 0) + return -EINVAL; + } else + return -EINVAL; + + return 0; +} + +__rte_experimental struct rte_bpf * +rte_bpf_load(const struct rte_bpf_prm *prm) +{ + struct rte_bpf *bpf; + int32_t rc; + uint32_t i; + + if (prm == NULL || prm->ins == NULL || + (prm->nb_xsym != 0 && prm->xsym == NULL)) { + rte_errno = EINVAL; + return NULL; + } + + rc = 0; + for (i = 0; i != prm->nb_xsym && rc == 0; i++) + rc = bpf_check_xsym(prm->xsym + i); + + if (rc != 0) { + rte_errno = -rc; + RTE_BPF_LOG(ERR, "%s: %d-th xsym is invalid\n", __func__, i); + return NULL; + } + + bpf = bpf_load(prm); + if (bpf == NULL) { + rte_errno = ENOMEM; + return NULL; + } + + rc = bpf_validate(bpf); + if (rc == 0) { + bpf_jit(bpf); + if (mprotect(bpf, bpf->sz, PROT_READ) != 0) + rc = -ENOMEM; + } + + if (rc != 0) { + rte_bpf_destroy(bpf); + rte_errno = -rc; + return NULL; + } + + return bpf; +} + +__rte_experimental __attribute__ ((weak)) struct rte_bpf * +rte_bpf_elf_load(const struct rte_bpf_prm *prm, const char *fname, + const char *sname) +{ + if (prm == NULL || fname == NULL || sname == NULL) { + rte_errno = EINVAL; + return NULL; + } + + RTE_BPF_LOG(ERR, "%s() 
is not supported with current config\n" + "rebuild with libelf installed\n", + __func__); + rte_errno = ENOTSUP; + return NULL; +} diff --git a/lib/librte_bpf/bpf_load_elf.c b/lib/librte_bpf/bpf_load_elf.c new file mode 100644 index 00000000..96d3630f --- /dev/null +++ b/lib/librte_bpf/bpf_load_elf.c @@ -0,0 +1,322 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "bpf_impl.h" + +/* To overcome compatibility issue */ +#ifndef EM_BPF +#define EM_BPF 247 +#endif + +static uint32_t +bpf_find_xsym(const char *sn, enum rte_bpf_xtype type, + const struct rte_bpf_xsym fp[], uint32_t fn) +{ + uint32_t i; + + if (sn == NULL || fp == NULL) + return UINT32_MAX; + + for (i = 0; i != fn; i++) { + if (fp[i].type == type && strcmp(sn, fp[i].name) == 0) + break; + } + + return (i != fn) ? 
i : UINT32_MAX; +} + +/* + * update BPF code at offset *ofs* with a proper address(index) for external + * symbol *sn* + */ +static int +resolve_xsym(const char *sn, size_t ofs, struct ebpf_insn *ins, size_t ins_sz, + const struct rte_bpf_prm *prm) +{ + uint32_t idx, fidx; + enum rte_bpf_xtype type; + + if (ofs % sizeof(ins[0]) != 0 || ofs >= ins_sz) + return -EINVAL; + + idx = ofs / sizeof(ins[0]); + if (ins[idx].code == (BPF_JMP | EBPF_CALL)) + type = RTE_BPF_XTYPE_FUNC; + else if (ins[idx].code == (BPF_LD | BPF_IMM | EBPF_DW) && + ofs < ins_sz - sizeof(ins[idx])) + type = RTE_BPF_XTYPE_VAR; + else + return -EINVAL; + + fidx = bpf_find_xsym(sn, type, prm->xsym, prm->nb_xsym); + if (fidx == UINT32_MAX) + return -ENOENT; + + /* for function we just need an index in our xsym table */ + if (type == RTE_BPF_XTYPE_FUNC) + ins[idx].imm = fidx; + /* for variable we need to store its absolute address */ + else { + ins[idx].imm = (uintptr_t)prm->xsym[fidx].var.val; + ins[idx + 1].imm = + (uint64_t)(uintptr_t)prm->xsym[fidx].var.val >> 32; + } + + return 0; +} + +static int +check_elf_header(const Elf64_Ehdr *eh) +{ + const char *err; + + err = NULL; + +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN + if (eh->e_ident[EI_DATA] != ELFDATA2LSB) +#else + if (eh->e_ident[EI_DATA] != ELFDATA2MSB) +#endif + err = "not native byte order"; + else if (eh->e_ident[EI_OSABI] != ELFOSABI_NONE) + err = "unexpected OS ABI"; + else if (eh->e_type != ET_REL) + err = "unexpected ELF type"; + else if (eh->e_machine != EM_NONE && eh->e_machine != EM_BPF) + err = "unexpected machine type"; + + if (err != NULL) { + RTE_BPF_LOG(ERR, "%s(): %s\n", __func__, err); + return -EINVAL; + } + + return 0; +} + +/* + * helper function, find executable section by name. 
+ */ +static int +find_elf_code(Elf *elf, const char *section, Elf_Data **psd, size_t *pidx) +{ + Elf_Scn *sc; + const Elf64_Ehdr *eh; + const Elf64_Shdr *sh; + Elf_Data *sd; + const char *sn; + int32_t rc; + + eh = elf64_getehdr(elf); + if (eh == NULL) { + rc = elf_errno(); + RTE_BPF_LOG(ERR, "%s(%p, %s) error code: %d(%s)\n", + __func__, elf, section, rc, elf_errmsg(rc)); + return -EINVAL; + } + + if (check_elf_header(eh) != 0) + return -EINVAL; + + /* find given section by name */ + for (sc = elf_nextscn(elf, NULL); sc != NULL; + sc = elf_nextscn(elf, sc)) { + sh = elf64_getshdr(sc); + sn = elf_strptr(elf, eh->e_shstrndx, sh->sh_name); + if (sn != NULL && strcmp(section, sn) == 0 && + sh->sh_type == SHT_PROGBITS && + sh->sh_flags == (SHF_ALLOC | SHF_EXECINSTR)) + break; + } + + sd = elf_getdata(sc, NULL); + if (sd == NULL || sd->d_size == 0 || + sd->d_size % sizeof(struct ebpf_insn) != 0) { + rc = elf_errno(); + RTE_BPF_LOG(ERR, "%s(%p, %s) error code: %d(%s)\n", + __func__, elf, section, rc, elf_errmsg(rc)); + return -EINVAL; + } + + *psd = sd; + *pidx = elf_ndxscn(sc); + return 0; +} + +/* + * helper function to process data from relocation table. 
+ */ +static int +process_reloc(Elf *elf, size_t sym_idx, Elf64_Rel *re, size_t re_sz, + struct ebpf_insn *ins, size_t ins_sz, const struct rte_bpf_prm *prm) +{ + int32_t rc; + uint32_t i, n; + size_t ofs, sym; + const char *sn; + const Elf64_Ehdr *eh; + Elf_Scn *sc; + const Elf_Data *sd; + Elf64_Sym *sm; + + eh = elf64_getehdr(elf); + + /* get symtable by section index */ + sc = elf_getscn(elf, sym_idx); + sd = elf_getdata(sc, NULL); + if (sd == NULL) + return -EINVAL; + sm = sd->d_buf; + + n = re_sz / sizeof(re[0]); + for (i = 0; i != n; i++) { + + ofs = re[i].r_offset; + + /* retrieve index in the symtable */ + sym = ELF64_R_SYM(re[i].r_info); + if (sym * sizeof(sm[0]) >= sd->d_size) + return -EINVAL; + + sn = elf_strptr(elf, eh->e_shstrndx, sm[sym].st_name); + + rc = resolve_xsym(sn, ofs, ins, ins_sz, prm); + if (rc != 0) { + RTE_BPF_LOG(ERR, + "resolve_xsym(%s, %zu) error code: %d\n", + sn, ofs, rc); + return rc; + } + } + + return 0; +} + +/* + * helper function, find relocation information (if any) + * and update bpf code. 
+ */ +static int +elf_reloc_code(Elf *elf, Elf_Data *ed, size_t sidx, + const struct rte_bpf_prm *prm) +{ + Elf64_Rel *re; + Elf_Scn *sc; + const Elf64_Shdr *sh; + const Elf_Data *sd; + int32_t rc; + + rc = 0; + + /* walk through all sections */ + for (sc = elf_nextscn(elf, NULL); sc != NULL && rc == 0; + sc = elf_nextscn(elf, sc)) { + + sh = elf64_getshdr(sc); + + /* relocation data for our code section */ + if (sh->sh_type == SHT_REL && sh->sh_info == sidx) { + sd = elf_getdata(sc, NULL); + if (sd == NULL || sd->d_size == 0 || + sd->d_size % sizeof(re[0]) != 0) + return -EINVAL; + rc = process_reloc(elf, sh->sh_link, + sd->d_buf, sd->d_size, ed->d_buf, ed->d_size, + prm); + } + } + + return rc; +} + +static struct rte_bpf * +bpf_load_elf(const struct rte_bpf_prm *prm, int32_t fd, const char *section) +{ + Elf *elf; + Elf_Data *sd; + size_t sidx; + int32_t rc; + struct rte_bpf *bpf; + struct rte_bpf_prm np; + + elf_version(EV_CURRENT); + elf = elf_begin(fd, ELF_C_READ, NULL); + + rc = find_elf_code(elf, section, &sd, &sidx); + if (rc == 0) + rc = elf_reloc_code(elf, sd, sidx, prm); + + if (rc == 0) { + np = prm[0]; + np.ins = sd->d_buf; + np.nb_ins = sd->d_size / sizeof(struct ebpf_insn); + bpf = rte_bpf_load(&np); + } else { + bpf = NULL; + rte_errno = -rc; + } + + elf_end(elf); + return bpf; +} + +__rte_experimental struct rte_bpf * +rte_bpf_elf_load(const struct rte_bpf_prm *prm, const char *fname, + const char *sname) +{ + int32_t fd, rc; + struct rte_bpf *bpf; + + if (prm == NULL || fname == NULL || sname == NULL) { + rte_errno = EINVAL; + return NULL; + } + + fd = open(fname, O_RDONLY); + if (fd < 0) { + rc = errno; + RTE_BPF_LOG(ERR, "%s(%s) error code: %d(%s)\n", + __func__, fname, rc, strerror(rc)); + rte_errno = EINVAL; + return NULL; + } + + bpf = bpf_load_elf(prm, fd, sname); + close(fd); + + if (bpf == NULL) { + RTE_BPF_LOG(ERR, + "%s(fname=\"%s\", sname=\"%s\") failed, " + "error code: %d\n", + __func__, fname, sname, rte_errno); + return NULL; + } + 
+ RTE_BPF_LOG(INFO, "%s(fname=\"%s\", sname=\"%s\") " + "successfully creates %p(jit={.func=%p,.sz=%zu});\n", + __func__, fname, sname, bpf, bpf->jit.func, bpf->jit.sz); + return bpf; +} diff --git a/lib/librte_bpf/bpf_pkt.c b/lib/librte_bpf/bpf_pkt.c new file mode 100644 index 00000000..ab9daa52 --- /dev/null +++ b/lib/librte_bpf/bpf_pkt.c @@ -0,0 +1,605 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "bpf_impl.h" + +/* + * information about installed BPF rx/tx callback + */ + +struct bpf_eth_cbi { + /* used by both data & control path */ + uint32_t use; /*usage counter */ + const struct rte_eth_rxtx_callback *cb; /* callback handle */ + struct rte_bpf *bpf; + struct rte_bpf_jit jit; + /* used by control path only */ + LIST_ENTRY(bpf_eth_cbi) link; + uint16_t port; + uint16_t queue; +} __rte_cache_aligned; + +/* + * Odd number means that callback is used by datapath. + * Even number means that callback is not used by datapath. + */ +#define BPF_ETH_CBI_INUSE 1 + +/* + * List to manage RX/TX installed callbacks. + */ +LIST_HEAD(bpf_eth_cbi_list, bpf_eth_cbi); + +enum { + BPF_ETH_RX, + BPF_ETH_TX, + BPF_ETH_NUM, +}; + +/* + * information about all installed BPF rx/tx callbacks + */ +struct bpf_eth_cbh { + rte_spinlock_t lock; + struct bpf_eth_cbi_list list; + uint32_t type; +}; + +static struct bpf_eth_cbh rx_cbh = { + .lock = RTE_SPINLOCK_INITIALIZER, + .list = LIST_HEAD_INITIALIZER(list), + .type = BPF_ETH_RX, +}; + +static struct bpf_eth_cbh tx_cbh = { + .lock = RTE_SPINLOCK_INITIALIZER, + .list = LIST_HEAD_INITIALIZER(list), + .type = BPF_ETH_TX, +}; + +/* + * Marks given callback as used by datapath. 
+ */ +static __rte_always_inline void +bpf_eth_cbi_inuse(struct bpf_eth_cbi *cbi) +{ + cbi->use++; + /* make sure no store/load reordering could happen */ + rte_smp_mb(); +} + +/* + * Marks given callback list as not used by datapath. + */ +static __rte_always_inline void +bpf_eth_cbi_unuse(struct bpf_eth_cbi *cbi) +{ + /* make sure all previous loads are completed */ + rte_smp_rmb(); + cbi->use++; +} + +/* + * Waits till datapath finished using given callback. + */ +static void +bpf_eth_cbi_wait(const struct bpf_eth_cbi *cbi) +{ + uint32_t nuse, puse; + + /* make sure all previous loads and stores are completed */ + rte_smp_mb(); + + puse = cbi->use; + + /* in use, busy wait till current RX/TX iteration is finished */ + if ((puse & BPF_ETH_CBI_INUSE) != 0) { + do { + rte_pause(); + rte_compiler_barrier(); + nuse = cbi->use; + } while (nuse == puse); + } +} + +static void +bpf_eth_cbi_cleanup(struct bpf_eth_cbi *bc) +{ + bc->bpf = NULL; + memset(&bc->jit, 0, sizeof(bc->jit)); +} + +static struct bpf_eth_cbi * +bpf_eth_cbh_find(struct bpf_eth_cbh *cbh, uint16_t port, uint16_t queue) +{ + struct bpf_eth_cbi *cbi; + + LIST_FOREACH(cbi, &cbh->list, link) { + if (cbi->port == port && cbi->queue == queue) + break; + } + return cbi; +} + +static struct bpf_eth_cbi * +bpf_eth_cbh_add(struct bpf_eth_cbh *cbh, uint16_t port, uint16_t queue) +{ + struct bpf_eth_cbi *cbi; + + /* return an existing one */ + cbi = bpf_eth_cbh_find(cbh, port, queue); + if (cbi != NULL) + return cbi; + + cbi = rte_zmalloc(NULL, sizeof(*cbi), RTE_CACHE_LINE_SIZE); + if (cbi != NULL) { + cbi->port = port; + cbi->queue = queue; + LIST_INSERT_HEAD(&cbh->list, cbi, link); + } + return cbi; +} + +/* + * BPF packet processing routinies. 
+ */ + +static inline uint32_t +apply_filter(struct rte_mbuf *mb[], const uint64_t rc[], uint32_t num, + uint32_t drop) +{ + uint32_t i, j, k; + struct rte_mbuf *dr[num]; + + for (i = 0, j = 0, k = 0; i != num; i++) { + + /* filter matches */ + if (rc[i] != 0) + mb[j++] = mb[i]; + /* no match */ + else + dr[k++] = mb[i]; + } + + if (drop != 0) { + /* free filtered out mbufs */ + for (i = 0; i != k; i++) + rte_pktmbuf_free(dr[i]); + } else { + /* copy filtered out mbufs beyond good ones */ + for (i = 0; i != k; i++) + mb[j + i] = dr[i]; + } + + return j; +} + +static inline uint32_t +pkt_filter_vm(const struct rte_bpf *bpf, struct rte_mbuf *mb[], uint32_t num, + uint32_t drop) +{ + uint32_t i; + void *dp[num]; + uint64_t rc[num]; + + for (i = 0; i != num; i++) + dp[i] = rte_pktmbuf_mtod(mb[i], void *); + + rte_bpf_exec_burst(bpf, dp, rc, num); + return apply_filter(mb, rc, num, drop); +} + +static inline uint32_t +pkt_filter_jit(const struct rte_bpf_jit *jit, struct rte_mbuf *mb[], + uint32_t num, uint32_t drop) +{ + uint32_t i, n; + void *dp; + uint64_t rc[num]; + + n = 0; + for (i = 0; i != num; i++) { + dp = rte_pktmbuf_mtod(mb[i], void *); + rc[i] = jit->func(dp); + n += (rc[i] == 0); + } + + if (n != 0) + num = apply_filter(mb, rc, num, drop); + + return num; +} + +static inline uint32_t +pkt_filter_mb_vm(const struct rte_bpf *bpf, struct rte_mbuf *mb[], uint32_t num, + uint32_t drop) +{ + uint64_t rc[num]; + + rte_bpf_exec_burst(bpf, (void **)mb, rc, num); + return apply_filter(mb, rc, num, drop); +} + +static inline uint32_t +pkt_filter_mb_jit(const struct rte_bpf_jit *jit, struct rte_mbuf *mb[], + uint32_t num, uint32_t drop) +{ + uint32_t i, n; + uint64_t rc[num]; + + n = 0; + for (i = 0; i != num; i++) { + rc[i] = jit->func(mb[i]); + n += (rc[i] == 0); + } + + if (n != 0) + num = apply_filter(mb, rc, num, drop); + + return num; +} + +/* + * RX/TX callbacks for raw data bpf. 
+ */ + +static uint16_t +bpf_rx_callback_vm(__rte_unused uint16_t port, __rte_unused uint16_t queue, + struct rte_mbuf *pkt[], uint16_t nb_pkts, + __rte_unused uint16_t max_pkts, void *user_param) +{ + struct bpf_eth_cbi *cbi; + uint16_t rc; + + cbi = user_param; + + bpf_eth_cbi_inuse(cbi); + rc = (cbi->cb != NULL) ? + pkt_filter_vm(cbi->bpf, pkt, nb_pkts, 1) : + nb_pkts; + bpf_eth_cbi_unuse(cbi); + return rc; +} + +static uint16_t +bpf_rx_callback_jit(__rte_unused uint16_t port, __rte_unused uint16_t queue, + struct rte_mbuf *pkt[], uint16_t nb_pkts, + __rte_unused uint16_t max_pkts, void *user_param) +{ + struct bpf_eth_cbi *cbi; + uint16_t rc; + + cbi = user_param; + bpf_eth_cbi_inuse(cbi); + rc = (cbi->cb != NULL) ? + pkt_filter_jit(&cbi->jit, pkt, nb_pkts, 1) : + nb_pkts; + bpf_eth_cbi_unuse(cbi); + return rc; +} + +static uint16_t +bpf_tx_callback_vm(__rte_unused uint16_t port, __rte_unused uint16_t queue, + struct rte_mbuf *pkt[], uint16_t nb_pkts, void *user_param) +{ + struct bpf_eth_cbi *cbi; + uint16_t rc; + + cbi = user_param; + bpf_eth_cbi_inuse(cbi); + rc = (cbi->cb != NULL) ? + pkt_filter_vm(cbi->bpf, pkt, nb_pkts, 0) : + nb_pkts; + bpf_eth_cbi_unuse(cbi); + return rc; +} + +static uint16_t +bpf_tx_callback_jit(__rte_unused uint16_t port, __rte_unused uint16_t queue, + struct rte_mbuf *pkt[], uint16_t nb_pkts, void *user_param) +{ + struct bpf_eth_cbi *cbi; + uint16_t rc; + + cbi = user_param; + bpf_eth_cbi_inuse(cbi); + rc = (cbi->cb != NULL) ? + pkt_filter_jit(&cbi->jit, pkt, nb_pkts, 0) : + nb_pkts; + bpf_eth_cbi_unuse(cbi); + return rc; +} + +/* + * RX/TX callbacks for mbuf. + */ + +static uint16_t +bpf_rx_callback_mb_vm(__rte_unused uint16_t port, __rte_unused uint16_t queue, + struct rte_mbuf *pkt[], uint16_t nb_pkts, + __rte_unused uint16_t max_pkts, void *user_param) +{ + struct bpf_eth_cbi *cbi; + uint16_t rc; + + cbi = user_param; + bpf_eth_cbi_inuse(cbi); + rc = (cbi->cb != NULL) ? 
+ pkt_filter_mb_vm(cbi->bpf, pkt, nb_pkts, 1) : + nb_pkts; + bpf_eth_cbi_unuse(cbi); + return rc; +} + +static uint16_t +bpf_rx_callback_mb_jit(__rte_unused uint16_t port, __rte_unused uint16_t queue, + struct rte_mbuf *pkt[], uint16_t nb_pkts, + __rte_unused uint16_t max_pkts, void *user_param) +{ + struct bpf_eth_cbi *cbi; + uint16_t rc; + + cbi = user_param; + bpf_eth_cbi_inuse(cbi); + rc = (cbi->cb != NULL) ? + pkt_filter_mb_jit(&cbi->jit, pkt, nb_pkts, 1) : + nb_pkts; + bpf_eth_cbi_unuse(cbi); + return rc; +} + +static uint16_t +bpf_tx_callback_mb_vm(__rte_unused uint16_t port, __rte_unused uint16_t queue, + struct rte_mbuf *pkt[], uint16_t nb_pkts, void *user_param) +{ + struct bpf_eth_cbi *cbi; + uint16_t rc; + + cbi = user_param; + bpf_eth_cbi_inuse(cbi); + rc = (cbi->cb != NULL) ? + pkt_filter_mb_vm(cbi->bpf, pkt, nb_pkts, 0) : + nb_pkts; + bpf_eth_cbi_unuse(cbi); + return rc; +} + +static uint16_t +bpf_tx_callback_mb_jit(__rte_unused uint16_t port, __rte_unused uint16_t queue, + struct rte_mbuf *pkt[], uint16_t nb_pkts, void *user_param) +{ + struct bpf_eth_cbi *cbi; + uint16_t rc; + + cbi = user_param; + bpf_eth_cbi_inuse(cbi); + rc = (cbi->cb != NULL) ? 
+ pkt_filter_mb_jit(&cbi->jit, pkt, nb_pkts, 0) : + nb_pkts; + bpf_eth_cbi_unuse(cbi); + return rc; +} + +static rte_rx_callback_fn +select_rx_callback(enum rte_bpf_arg_type type, uint32_t flags) +{ + if (flags & RTE_BPF_ETH_F_JIT) { + if (type == RTE_BPF_ARG_PTR) + return bpf_rx_callback_jit; + else if (type == RTE_BPF_ARG_PTR_MBUF) + return bpf_rx_callback_mb_jit; + } else if (type == RTE_BPF_ARG_PTR) + return bpf_rx_callback_vm; + else if (type == RTE_BPF_ARG_PTR_MBUF) + return bpf_rx_callback_mb_vm; + + return NULL; +} + +static rte_tx_callback_fn +select_tx_callback(enum rte_bpf_arg_type type, uint32_t flags) +{ + if (flags & RTE_BPF_ETH_F_JIT) { + if (type == RTE_BPF_ARG_PTR) + return bpf_tx_callback_jit; + else if (type == RTE_BPF_ARG_PTR_MBUF) + return bpf_tx_callback_mb_jit; + } else if (type == RTE_BPF_ARG_PTR) + return bpf_tx_callback_vm; + else if (type == RTE_BPF_ARG_PTR_MBUF) + return bpf_tx_callback_mb_vm; + + return NULL; +} + +/* + * helper function to perform BPF unload for given port/queue. + * have to introduce extra complexity (and possible slowdown) here, + * as right now there is no safe generic way to remove RX/TX callback + * while IO is active. + * Still don't free memory allocated for callback handle itself, + * again right now there is no safe way to do that without stopping RX/TX + * on given port/queue first. 
+ */ +static void +bpf_eth_cbi_unload(struct bpf_eth_cbi *bc) +{ + /* mark this cbi as empty */ + bc->cb = NULL; + rte_smp_mb(); + + /* make sure datapath doesn't use bpf anymore, then destroy bpf */ + bpf_eth_cbi_wait(bc); + rte_bpf_destroy(bc->bpf); + bpf_eth_cbi_cleanup(bc); +} + +static void +bpf_eth_unload(struct bpf_eth_cbh *cbh, uint16_t port, uint16_t queue) +{ + struct bpf_eth_cbi *bc; + + bc = bpf_eth_cbh_find(cbh, port, queue); + if (bc == NULL || bc->cb == NULL) + return; + + if (cbh->type == BPF_ETH_RX) + rte_eth_remove_rx_callback(port, queue, bc->cb); + else + rte_eth_remove_tx_callback(port, queue, bc->cb); + + bpf_eth_cbi_unload(bc); +} + + +__rte_experimental void +rte_bpf_eth_rx_unload(uint16_t port, uint16_t queue) +{ + struct bpf_eth_cbh *cbh; + + cbh = &rx_cbh; + rte_spinlock_lock(&cbh->lock); + bpf_eth_unload(cbh, port, queue); + rte_spinlock_unlock(&cbh->lock); +} + +__rte_experimental void +rte_bpf_eth_tx_unload(uint16_t port, uint16_t queue) +{ + struct bpf_eth_cbh *cbh; + + cbh = &tx_cbh; + rte_spinlock_lock(&cbh->lock); + bpf_eth_unload(cbh, port, queue); + rte_spinlock_unlock(&cbh->lock); +} + +static int +bpf_eth_elf_load(struct bpf_eth_cbh *cbh, uint16_t port, uint16_t queue, + const struct rte_bpf_prm *prm, const char *fname, const char *sname, + uint32_t flags) +{ + int32_t rc; + struct bpf_eth_cbi *bc; + struct rte_bpf *bpf; + rte_rx_callback_fn frx; + rte_tx_callback_fn ftx; + struct rte_bpf_jit jit; + + frx = NULL; + ftx = NULL; + + if (prm == NULL || rte_eth_dev_is_valid_port(port) == 0 || + queue >= RTE_MAX_QUEUES_PER_PORT) + return -EINVAL; + + if (cbh->type == BPF_ETH_RX) + frx = select_rx_callback(prm->prog_arg.type, flags); + else + ftx = select_tx_callback(prm->prog_arg.type, flags); + + if (frx == NULL && ftx == NULL) { + RTE_BPF_LOG(ERR, "%s(%u, %u): no callback selected;\n", + __func__, port, queue); + return -EINVAL; + } + + bpf = rte_bpf_elf_load(prm, fname, sname); + if (bpf == NULL) + return -rte_errno; + + 
rte_bpf_get_jit(bpf, &jit); + + if ((flags & RTE_BPF_ETH_F_JIT) != 0 && jit.func == NULL) { + RTE_BPF_LOG(ERR, "%s(%u, %u): no JIT generated;\n", + __func__, port, queue); + rte_bpf_destroy(bpf); + return -ENOTSUP; + } + + /* setup/update global callback info */ + bc = bpf_eth_cbh_add(cbh, port, queue); + if (bc == NULL) + return -ENOMEM; + + /* remove old one, if any */ + if (bc->cb != NULL) + bpf_eth_unload(cbh, port, queue); + + bc->bpf = bpf; + bc->jit = jit; + + if (cbh->type == BPF_ETH_RX) + bc->cb = rte_eth_add_rx_callback(port, queue, frx, bc); + else + bc->cb = rte_eth_add_tx_callback(port, queue, ftx, bc); + + if (bc->cb == NULL) { + rc = -rte_errno; + rte_bpf_destroy(bpf); + bpf_eth_cbi_cleanup(bc); + } else + rc = 0; + + return rc; +} + +__rte_experimental int +rte_bpf_eth_rx_elf_load(uint16_t port, uint16_t queue, + const struct rte_bpf_prm *prm, const char *fname, const char *sname, + uint32_t flags) +{ + int32_t rc; + struct bpf_eth_cbh *cbh; + + cbh = &rx_cbh; + rte_spinlock_lock(&cbh->lock); + rc = bpf_eth_elf_load(cbh, port, queue, prm, fname, sname, flags); + rte_spinlock_unlock(&cbh->lock); + + return rc; +} + +__rte_experimental int +rte_bpf_eth_tx_elf_load(uint16_t port, uint16_t queue, + const struct rte_bpf_prm *prm, const char *fname, const char *sname, + uint32_t flags) +{ + int32_t rc; + struct bpf_eth_cbh *cbh; + + cbh = &tx_cbh; + rte_spinlock_lock(&cbh->lock); + rc = bpf_eth_elf_load(cbh, port, queue, prm, fname, sname, flags); + rte_spinlock_unlock(&cbh->lock); + + return rc; +} diff --git a/lib/librte_bpf/bpf_validate.c b/lib/librte_bpf/bpf_validate.c new file mode 100644 index 00000000..83983efc --- /dev/null +++ b/lib/librte_bpf/bpf_validate.c @@ -0,0 +1,2248 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "bpf_impl.h" + +struct bpf_reg_val { + struct rte_bpf_arg v; + uint64_t mask; + 
struct { + int64_t min; + int64_t max; + } s; + struct { + uint64_t min; + uint64_t max; + } u; +}; + +struct bpf_eval_state { + struct bpf_reg_val rv[EBPF_REG_NUM]; + struct bpf_reg_val sv[MAX_BPF_STACK_SIZE / sizeof(uint64_t)]; +}; + +/* possible instruction node colour */ +enum { + WHITE, + GREY, + BLACK, + MAX_NODE_COLOUR +}; + +/* possible edge types */ +enum { + UNKNOWN_EDGE, + TREE_EDGE, + BACK_EDGE, + CROSS_EDGE, + MAX_EDGE_TYPE +}; + +#define MAX_EDGES 2 + +struct inst_node { + uint8_t colour; + uint8_t nb_edge:4; + uint8_t cur_edge:4; + uint8_t edge_type[MAX_EDGES]; + uint32_t edge_dest[MAX_EDGES]; + uint32_t prev_node; + struct bpf_eval_state *evst; +}; + +struct bpf_verifier { + const struct rte_bpf_prm *prm; + struct inst_node *in; + uint64_t stack_sz; + uint32_t nb_nodes; + uint32_t nb_jcc_nodes; + uint32_t node_colour[MAX_NODE_COLOUR]; + uint32_t edge_type[MAX_EDGE_TYPE]; + struct bpf_eval_state *evst; + struct inst_node *evin; + struct { + uint32_t num; + uint32_t cur; + struct bpf_eval_state *ent; + } evst_pool; +}; + +struct bpf_ins_check { + struct { + uint16_t dreg; + uint16_t sreg; + } mask; + struct { + uint16_t min; + uint16_t max; + } off; + struct { + uint32_t min; + uint32_t max; + } imm; + const char * (*check)(const struct ebpf_insn *); + const char * (*eval)(struct bpf_verifier *, const struct ebpf_insn *); +}; + +#define ALL_REGS RTE_LEN2MASK(EBPF_REG_NUM, uint16_t) +#define WRT_REGS RTE_LEN2MASK(EBPF_REG_10, uint16_t) +#define ZERO_REG RTE_LEN2MASK(EBPF_REG_1, uint16_t) + +/* + * check and evaluate functions for particular instruction types. 
+ */ + +static const char * +check_alu_bele(const struct ebpf_insn *ins) +{ + if (ins->imm != 16 && ins->imm != 32 && ins->imm != 64) + return "invalid imm field"; + return NULL; +} + +static const char * +eval_exit(struct bpf_verifier *bvf, const struct ebpf_insn *ins) +{ + RTE_SET_USED(ins); + if (bvf->evst->rv[EBPF_REG_0].v.type == RTE_BPF_ARG_UNDEF) + return "undefined return value"; + return NULL; +} + +/* setup max possible with this mask bounds */ +static void +eval_umax_bound(struct bpf_reg_val *rv, uint64_t mask) +{ + rv->u.max = mask; + rv->u.min = 0; +} + +static void +eval_smax_bound(struct bpf_reg_val *rv, uint64_t mask) +{ + rv->s.max = mask >> 1; + rv->s.min = rv->s.max ^ UINT64_MAX; +} + +static void +eval_max_bound(struct bpf_reg_val *rv, uint64_t mask) +{ + eval_umax_bound(rv, mask); + eval_smax_bound(rv, mask); +} + +static void +eval_fill_max_bound(struct bpf_reg_val *rv, uint64_t mask) +{ + eval_max_bound(rv, mask); + rv->v.type = RTE_BPF_ARG_RAW; + rv->mask = mask; +} + +static void +eval_fill_imm64(struct bpf_reg_val *rv, uint64_t mask, uint64_t val) +{ + rv->mask = mask; + rv->s.min = val; + rv->s.max = val; + rv->u.min = val; + rv->u.max = val; +} + +static void +eval_fill_imm(struct bpf_reg_val *rv, uint64_t mask, int32_t imm) +{ + uint64_t v; + + v = (uint64_t)imm & mask; + + rv->v.type = RTE_BPF_ARG_RAW; + eval_fill_imm64(rv, mask, v); +} + +static const char * +eval_ld_imm64(struct bpf_verifier *bvf, const struct ebpf_insn *ins) +{ + uint32_t i; + uint64_t val; + struct bpf_reg_val *rd; + + val = (uint32_t)ins[0].imm | (uint64_t)(uint32_t)ins[1].imm << 32; + + rd = bvf->evst->rv + ins->dst_reg; + rd->v.type = RTE_BPF_ARG_RAW; + eval_fill_imm64(rd, UINT64_MAX, val); + + for (i = 0; i != bvf->prm->nb_xsym; i++) { + + /* load of external variable */ + if (bvf->prm->xsym[i].type == RTE_BPF_XTYPE_VAR && + (uintptr_t)bvf->prm->xsym[i].var.val == val) { + rd->v = bvf->prm->xsym[i].var.desc; + eval_fill_imm64(rd, UINT64_MAX, 0); + break; + } + 
} + + return NULL; +} + +static void +eval_apply_mask(struct bpf_reg_val *rv, uint64_t mask) +{ + struct bpf_reg_val rt; + + rt.u.min = rv->u.min & mask; + rt.u.max = rv->u.max & mask; + if (rt.u.min != rv->u.min || rt.u.max != rv->u.max) { + rv->u.max = RTE_MAX(rt.u.max, mask); + rv->u.min = 0; + } + + eval_smax_bound(&rt, mask); + rv->s.max = RTE_MIN(rt.s.max, rv->s.max); + rv->s.min = RTE_MAX(rt.s.min, rv->s.min); + + rv->mask = mask; +} + +static void +eval_add(struct bpf_reg_val *rd, const struct bpf_reg_val *rs, uint64_t msk) +{ + struct bpf_reg_val rv; + + rv.u.min = (rd->u.min + rs->u.min) & msk; + rv.u.max = (rd->u.min + rs->u.max) & msk; + rv.s.min = (rd->s.min + rs->s.min) & msk; + rv.s.max = (rd->s.max + rs->s.max) & msk; + + /* + * if at least one of the operands is not constant, + * then check for overflow + */ + if ((rd->u.min != rd->u.max || rs->u.min != rs->u.max) && + (rv.u.min < rd->u.min || rv.u.max < rd->u.max)) + eval_umax_bound(&rv, msk); + + if ((rd->s.min != rd->s.max || rs->s.min != rs->s.max) && + (((rs->s.min < 0 && rv.s.min > rd->s.min) || + rv.s.min < rd->s.min) || + ((rs->s.max < 0 && rv.s.max > rd->s.max) || + rv.s.max < rd->s.max))) + eval_smax_bound(&rv, msk); + + rd->s = rv.s; + rd->u = rv.u; +} + +static void +eval_sub(struct bpf_reg_val *rd, const struct bpf_reg_val *rs, uint64_t msk) +{ + struct bpf_reg_val rv; + + rv.u.min = (rd->u.min - rs->u.min) & msk; + rv.u.max = (rd->u.min - rs->u.max) & msk; + rv.s.min = (rd->s.min - rs->s.min) & msk; + rv.s.max = (rd->s.max - rs->s.max) & msk; + + /* + * if at least one of the operands is not constant, + * then check for overflow + */ + if ((rd->u.min != rd->u.max || rs->u.min != rs->u.max) && + (rv.u.min > rd->u.min || rv.u.max > rd->u.max)) + eval_umax_bound(&rv, msk); + + if ((rd->s.min != rd->s.max || rs->s.min != rs->s.max) && + (((rs->s.min < 0 && rv.s.min < rd->s.min) || + rv.s.min > rd->s.min) || + ((rs->s.max < 0 && rv.s.max < rd->s.max) || + rv.s.max > rd->s.max))) + 
eval_smax_bound(&rv, msk); + + rd->s = rv.s; + rd->u = rv.u; +} + +static void +eval_lsh(struct bpf_reg_val *rd, const struct bpf_reg_val *rs, size_t opsz, + uint64_t msk) +{ + /* check if shift value is less then max result bits */ + if (rs->u.max >= opsz) { + eval_max_bound(rd, msk); + return; + } + + /* check for overflow */ + if (rd->u.max > RTE_LEN2MASK(opsz - rs->u.max, uint64_t)) + eval_umax_bound(rd, msk); + else { + rd->u.max <<= rs->u.max; + rd->u.min <<= rs->u.min; + } + + /* check that dreg values are and would remain always positive */ + if ((uint64_t)rd->s.min >> (opsz - 1) != 0 || rd->s.max >= + RTE_LEN2MASK(opsz - rs->u.max - 1, int64_t)) + eval_smax_bound(rd, msk); + else { + rd->s.max <<= rs->u.max; + rd->s.min <<= rs->u.min; + } +} + +static void +eval_rsh(struct bpf_reg_val *rd, const struct bpf_reg_val *rs, size_t opsz, + uint64_t msk) +{ + /* check if shift value is less then max result bits */ + if (rs->u.max >= opsz) { + eval_max_bound(rd, msk); + return; + } + + rd->u.max >>= rs->u.min; + rd->u.min >>= rs->u.max; + + /* check that dreg values are always positive */ + if ((uint64_t)rd->s.min >> (opsz - 1) != 0) + eval_smax_bound(rd, msk); + else { + rd->s.max >>= rs->u.min; + rd->s.min >>= rs->u.max; + } +} + +static void +eval_arsh(struct bpf_reg_val *rd, const struct bpf_reg_val *rs, size_t opsz, + uint64_t msk) +{ + uint32_t shv; + + /* check if shift value is less then max result bits */ + if (rs->u.max >= opsz) { + eval_max_bound(rd, msk); + return; + } + + rd->u.max = (int64_t)rd->u.max >> rs->u.min; + rd->u.min = (int64_t)rd->u.min >> rs->u.max; + + /* if we have 32-bit values - extend them to 64-bit */ + if (opsz == sizeof(uint32_t) * CHAR_BIT) { + rd->s.min <<= opsz; + rd->s.max <<= opsz; + shv = opsz; + } else + shv = 0; + + if (rd->s.min < 0) + rd->s.min = (rd->s.min >> (rs->u.min + shv)) & msk; + else + rd->s.min = (rd->s.min >> (rs->u.max + shv)) & msk; + + if (rd->s.max < 0) + rd->s.max = (rd->s.max >> (rs->u.max + shv)) & msk; 
+ else + rd->s.max = (rd->s.max >> (rs->u.min + shv)) & msk; +} + +static uint64_t +eval_umax_bits(uint64_t v, size_t opsz) +{ + if (v == 0) + return 0; + + v = __builtin_clzll(v); + return RTE_LEN2MASK(opsz - v, uint64_t); +} + +/* estimate max possible value for (v1 & v2) */ +static uint64_t +eval_uand_max(uint64_t v1, uint64_t v2, size_t opsz) +{ + v1 = eval_umax_bits(v1, opsz); + v2 = eval_umax_bits(v2, opsz); + return (v1 & v2); +} + +/* estimate max possible value for (v1 | v2) */ +static uint64_t +eval_uor_max(uint64_t v1, uint64_t v2, size_t opsz) +{ + v1 = eval_umax_bits(v1, opsz); + v2 = eval_umax_bits(v2, opsz); + return (v1 | v2); +} + +static void +eval_and(struct bpf_reg_val *rd, const struct bpf_reg_val *rs, size_t opsz, + uint64_t msk) +{ + /* both operands are constants */ + if (rd->u.min == rd->u.max && rs->u.min == rs->u.max) { + rd->u.min &= rs->u.min; + rd->u.max &= rs->u.max; + } else { + rd->u.max = eval_uand_max(rd->u.max, rs->u.max, opsz); + rd->u.min &= rs->u.min; + } + + /* both operands are constants */ + if (rd->s.min == rd->s.max && rs->s.min == rs->s.max) { + rd->s.min &= rs->s.min; + rd->s.max &= rs->s.max; + /* at least one of operand is non-negative */ + } else if (rd->s.min >= 0 || rs->s.min >= 0) { + rd->s.max = eval_uand_max(rd->s.max & (msk >> 1), + rs->s.max & (msk >> 1), opsz); + rd->s.min &= rs->s.min; + } else + eval_smax_bound(rd, msk); +} + +static void +eval_or(struct bpf_reg_val *rd, const struct bpf_reg_val *rs, size_t opsz, + uint64_t msk) +{ + /* both operands are constants */ + if (rd->u.min == rd->u.max && rs->u.min == rs->u.max) { + rd->u.min |= rs->u.min; + rd->u.max |= rs->u.max; + } else { + rd->u.max = eval_uor_max(rd->u.max, rs->u.max, opsz); + rd->u.min |= rs->u.min; + } + + /* both operands are constants */ + if (rd->s.min == rd->s.max && rs->s.min == rs->s.max) { + rd->s.min |= rs->s.min; + rd->s.max |= rs->s.max; + + /* both operands are non-negative */ + } else if (rd->s.min >= 0 || rs->s.min >= 0) { + 
rd->s.max = eval_uor_max(rd->s.max, rs->s.max, opsz); + rd->s.min |= rs->s.min; + } else + eval_smax_bound(rd, msk); +} + +static void +eval_xor(struct bpf_reg_val *rd, const struct bpf_reg_val *rs, size_t opsz, + uint64_t msk) +{ + /* both operands are constants */ + if (rd->u.min == rd->u.max && rs->u.min == rs->u.max) { + rd->u.min ^= rs->u.min; + rd->u.max ^= rs->u.max; + } else { + rd->u.max = eval_uor_max(rd->u.max, rs->u.max, opsz); + rd->u.min = 0; + } + + /* both operands are constants */ + if (rd->s.min == rd->s.max && rs->s.min == rs->s.max) { + rd->s.min ^= rs->s.min; + rd->s.max ^= rs->s.max; + + /* both operands are non-negative */ + } else if (rd->s.min >= 0 || rs->s.min >= 0) { + rd->s.max = eval_uor_max(rd->s.max, rs->s.max, opsz); + rd->s.min = 0; + } else + eval_smax_bound(rd, msk); +} + +static void +eval_mul(struct bpf_reg_val *rd, const struct bpf_reg_val *rs, size_t opsz, + uint64_t msk) +{ + /* both operands are constants */ + if (rd->u.min == rd->u.max && rs->u.min == rs->u.max) { + rd->u.min = (rd->u.min * rs->u.min) & msk; + rd->u.max = (rd->u.max * rs->u.max) & msk; + /* check for overflow */ + } else if (rd->u.max <= msk >> opsz / 2 && rs->u.max <= msk >> opsz) { + rd->u.max *= rs->u.max; + rd->u.min *= rd->u.min; + } else + eval_umax_bound(rd, msk); + + /* both operands are constants */ + if (rd->s.min == rd->s.max && rs->s.min == rs->s.max) { + rd->s.min = (rd->s.min * rs->s.min) & msk; + rd->s.max = (rd->s.max * rs->s.max) & msk; + /* check that both operands are positive and no overflow */ + } else if (rd->s.min >= 0 && rs->s.min >= 0) { + rd->s.max *= rs->s.max; + rd->s.min *= rd->s.min; + } else + eval_smax_bound(rd, msk); +} + +static const char * +eval_divmod(uint32_t op, struct bpf_reg_val *rd, struct bpf_reg_val *rs, + size_t opsz, uint64_t msk) +{ + /* both operands are constants */ + if (rd->u.min == rd->u.max && rs->u.min == rs->u.max) { + if (rs->u.max == 0) + return "division by 0"; + if (op == BPF_DIV) { + rd->u.min /= 
rs->u.min; + rd->u.max /= rs->u.max; + } else { + rd->u.min %= rs->u.min; + rd->u.max %= rs->u.max; + } + } else { + if (op == BPF_MOD) + rd->u.max = RTE_MIN(rd->u.max, rs->u.max - 1); + else + rd->u.max = rd->u.max; + rd->u.min = 0; + } + + /* if we have 32-bit values - extend them to 64-bit */ + if (opsz == sizeof(uint32_t) * CHAR_BIT) { + rd->s.min = (int32_t)rd->s.min; + rd->s.max = (int32_t)rd->s.max; + rs->s.min = (int32_t)rs->s.min; + rs->s.max = (int32_t)rs->s.max; + } + + /* both operands are constants */ + if (rd->s.min == rd->s.max && rs->s.min == rs->s.max) { + if (rs->s.max == 0) + return "division by 0"; + if (op == BPF_DIV) { + rd->s.min /= rs->s.min; + rd->s.max /= rs->s.max; + } else { + rd->s.min %= rs->s.min; + rd->s.max %= rs->s.max; + } + } else if (op == BPF_MOD) { + rd->s.min = RTE_MAX(rd->s.max, 0); + rd->s.min = RTE_MIN(rd->s.min, 0); + } else + eval_smax_bound(rd, msk); + + rd->s.max &= msk; + rd->s.min &= msk; + + return NULL; +} + +static void +eval_neg(struct bpf_reg_val *rd, size_t opsz, uint64_t msk) +{ + uint64_t ux, uy; + int64_t sx, sy; + + /* if we have 32-bit values - extend them to 64-bit */ + if (opsz == sizeof(uint32_t) * CHAR_BIT) { + rd->u.min = (int32_t)rd->u.min; + rd->u.max = (int32_t)rd->u.max; + } + + ux = -(int64_t)rd->u.min & msk; + uy = -(int64_t)rd->u.max & msk; + + rd->u.max = RTE_MAX(ux, uy); + rd->u.min = RTE_MIN(ux, uy); + + /* if we have 32-bit values - extend them to 64-bit */ + if (opsz == sizeof(uint32_t) * CHAR_BIT) { + rd->s.min = (int32_t)rd->s.min; + rd->s.max = (int32_t)rd->s.max; + } + + sx = -rd->s.min & msk; + sy = -rd->s.max & msk; + + rd->s.max = RTE_MAX(sx, sy); + rd->s.min = RTE_MIN(sx, sy); +} + +/* + * check that destination and source operand are in defined state. 
+ */ +static const char * +eval_defined(const struct bpf_reg_val *dst, const struct bpf_reg_val *src) +{ + if (dst != NULL && dst->v.type == RTE_BPF_ARG_UNDEF) + return "dest reg value is undefined"; + if (src != NULL && src->v.type == RTE_BPF_ARG_UNDEF) + return "src reg value is undefined"; + return NULL; +} + +static const char * +eval_alu(struct bpf_verifier *bvf, const struct ebpf_insn *ins) +{ + uint64_t msk; + uint32_t op; + size_t opsz; + const char *err; + struct bpf_eval_state *st; + struct bpf_reg_val *rd, rs; + + opsz = (BPF_CLASS(ins->code) == BPF_ALU) ? + sizeof(uint32_t) : sizeof(uint64_t); + opsz = opsz * CHAR_BIT; + msk = RTE_LEN2MASK(opsz, uint64_t); + + st = bvf->evst; + rd = st->rv + ins->dst_reg; + + if (BPF_SRC(ins->code) == BPF_X) { + rs = st->rv[ins->src_reg]; + eval_apply_mask(&rs, msk); + } else + eval_fill_imm(&rs, msk, ins->imm); + + eval_apply_mask(rd, msk); + + op = BPF_OP(ins->code); + + err = eval_defined((op != EBPF_MOV) ? rd : NULL, + (op != BPF_NEG) ? &rs : NULL); + if (err != NULL) + return err; + + if (op == BPF_ADD) + eval_add(rd, &rs, msk); + else if (op == BPF_SUB) + eval_sub(rd, &rs, msk); + else if (op == BPF_LSH) + eval_lsh(rd, &rs, opsz, msk); + else if (op == BPF_RSH) + eval_rsh(rd, &rs, opsz, msk); + else if (op == EBPF_ARSH) + eval_arsh(rd, &rs, opsz, msk); + else if (op == BPF_AND) + eval_and(rd, &rs, opsz, msk); + else if (op == BPF_OR) + eval_or(rd, &rs, opsz, msk); + else if (op == BPF_XOR) + eval_xor(rd, &rs, opsz, msk); + else if (op == BPF_MUL) + eval_mul(rd, &rs, opsz, msk); + else if (op == BPF_DIV || op == BPF_MOD) + err = eval_divmod(op, rd, &rs, opsz, msk); + else if (op == BPF_NEG) + eval_neg(rd, opsz, msk); + else if (op == EBPF_MOV) + *rd = rs; + else + eval_max_bound(rd, msk); + + return err; +} + +static const char * +eval_bele(struct bpf_verifier *bvf, const struct ebpf_insn *ins) +{ + uint64_t msk; + struct bpf_eval_state *st; + struct bpf_reg_val *rd; + const char *err; + + msk = 
RTE_LEN2MASK(ins->imm, uint64_t); + + st = bvf->evst; + rd = st->rv + ins->dst_reg; + + err = eval_defined(rd, NULL); + if (err != NULL) + return err; + +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN + if (ins->code == (BPF_ALU | EBPF_END | EBPF_TO_BE)) + eval_max_bound(rd, msk); + else + eval_apply_mask(rd, msk); +#else + if (ins->code == (BPF_ALU | EBPF_END | EBPF_TO_LE)) + eval_max_bound(rd, msk); + else + eval_apply_mask(rd, msk); +#endif + + return NULL; +} + +static const char * +eval_ptr(struct bpf_verifier *bvf, struct bpf_reg_val *rm, uint32_t opsz, + uint32_t align, int16_t off) +{ + struct bpf_reg_val rv; + + /* calculate reg + offset */ + eval_fill_imm(&rv, rm->mask, off); + eval_add(rm, &rv, rm->mask); + + if (RTE_BPF_ARG_PTR_TYPE(rm->v.type) == 0) + return "destination is not a pointer"; + + if (rm->mask != UINT64_MAX) + return "pointer truncation"; + + if (rm->u.max + opsz > rm->v.size || + (uint64_t)rm->s.max + opsz > rm->v.size || + rm->s.min < 0) + return "memory boundary violation"; + + if (rm->u.max % align != 0) + return "unaligned memory access"; + + if (rm->v.type == RTE_BPF_ARG_PTR_STACK) { + + if (rm->u.max != rm->u.min || rm->s.max != rm->s.min || + rm->u.max != (uint64_t)rm->s.max) + return "stack access with variable offset"; + + bvf->stack_sz = RTE_MAX(bvf->stack_sz, rm->v.size - rm->u.max); + + /* pointer to mbuf */ + } else if (rm->v.type == RTE_BPF_ARG_PTR_MBUF) { + + if (rm->u.max != rm->u.min || rm->s.max != rm->s.min || + rm->u.max != (uint64_t)rm->s.max) + return "mbuf access with variable offset"; + } + + return NULL; +} + +static void +eval_max_load(struct bpf_reg_val *rv, uint64_t mask) +{ + eval_umax_bound(rv, mask); + + /* full 64-bit load */ + if (mask == UINT64_MAX) + eval_smax_bound(rv, mask); + + /* zero-extend load */ + rv->s.min = rv->u.min; + rv->s.max = rv->u.max; +} + + +static const char * +eval_load(struct bpf_verifier *bvf, const struct ebpf_insn *ins) +{ + uint32_t opsz; + uint64_t msk; + const char *err; + struct 
bpf_eval_state *st; + struct bpf_reg_val *rd, rs; + const struct bpf_reg_val *sv; + + st = bvf->evst; + rd = st->rv + ins->dst_reg; + rs = st->rv[ins->src_reg]; + opsz = bpf_size(BPF_SIZE(ins->code)); + msk = RTE_LEN2MASK(opsz * CHAR_BIT, uint64_t); + + err = eval_ptr(bvf, &rs, opsz, 1, ins->off); + if (err != NULL) + return err; + + if (rs.v.type == RTE_BPF_ARG_PTR_STACK) { + + sv = st->sv + rs.u.max / sizeof(uint64_t); + if (sv->v.type == RTE_BPF_ARG_UNDEF || sv->mask < msk) + return "undefined value on the stack"; + + *rd = *sv; + + /* pointer to mbuf */ + } else if (rs.v.type == RTE_BPF_ARG_PTR_MBUF) { + + if (rs.u.max == offsetof(struct rte_mbuf, next)) { + eval_fill_imm(rd, msk, 0); + rd->v = rs.v; + } else if (rs.u.max == offsetof(struct rte_mbuf, buf_addr)) { + eval_fill_imm(rd, msk, 0); + rd->v.type = RTE_BPF_ARG_PTR; + rd->v.size = rs.v.buf_size; + } else if (rs.u.max == offsetof(struct rte_mbuf, data_off)) { + eval_fill_imm(rd, msk, RTE_PKTMBUF_HEADROOM); + rd->v.type = RTE_BPF_ARG_RAW; + } else { + eval_max_load(rd, msk); + rd->v.type = RTE_BPF_ARG_RAW; + } + + /* pointer to raw data */ + } else { + eval_max_load(rd, msk); + rd->v.type = RTE_BPF_ARG_RAW; + } + + return NULL; +} + +static const char * +eval_mbuf_store(const struct bpf_reg_val *rv, uint32_t opsz) +{ + uint32_t i; + + static const struct { + size_t off; + size_t sz; + } mbuf_ro_fileds[] = { + { .off = offsetof(struct rte_mbuf, buf_addr), }, + { .off = offsetof(struct rte_mbuf, refcnt), }, + { .off = offsetof(struct rte_mbuf, nb_segs), }, + { .off = offsetof(struct rte_mbuf, buf_len), }, + { .off = offsetof(struct rte_mbuf, pool), }, + { .off = offsetof(struct rte_mbuf, next), }, + { .off = offsetof(struct rte_mbuf, priv_size), }, + }; + + for (i = 0; i != RTE_DIM(mbuf_ro_fileds) && + (mbuf_ro_fileds[i].off + mbuf_ro_fileds[i].sz <= + rv->u.max || rv->u.max + opsz <= mbuf_ro_fileds[i].off); + i++) + ; + + if (i != RTE_DIM(mbuf_ro_fileds)) + return "store to the read-only mbuf field"; + + 
return NULL; + +} + +static const char * +eval_store(struct bpf_verifier *bvf, const struct ebpf_insn *ins) +{ + uint32_t opsz; + uint64_t msk; + const char *err; + struct bpf_eval_state *st; + struct bpf_reg_val rd, rs, *sv; + + opsz = bpf_size(BPF_SIZE(ins->code)); + msk = RTE_LEN2MASK(opsz * CHAR_BIT, uint64_t); + + st = bvf->evst; + rd = st->rv[ins->dst_reg]; + + if (BPF_CLASS(ins->code) == BPF_STX) { + rs = st->rv[ins->src_reg]; + eval_apply_mask(&rs, msk); + } else + eval_fill_imm(&rs, msk, ins->imm); + + err = eval_defined(NULL, &rs); + if (err != NULL) + return err; + + err = eval_ptr(bvf, &rd, opsz, 1, ins->off); + if (err != NULL) + return err; + + if (rd.v.type == RTE_BPF_ARG_PTR_STACK) { + + sv = st->sv + rd.u.max / sizeof(uint64_t); + if (BPF_CLASS(ins->code) == BPF_STX && + BPF_MODE(ins->code) == EBPF_XADD) + eval_max_bound(sv, msk); + else + *sv = rs; + + /* pointer to mbuf */ + } else if (rd.v.type == RTE_BPF_ARG_PTR_MBUF) { + err = eval_mbuf_store(&rd, opsz); + if (err != NULL) + return err; + } + + return NULL; +} + +static const char * +eval_func_arg(struct bpf_verifier *bvf, const struct rte_bpf_arg *arg, + struct bpf_reg_val *rv) +{ + uint32_t i, n; + struct bpf_eval_state *st; + const char *err; + + st = bvf->evst; + + if (rv->v.type == RTE_BPF_ARG_UNDEF) + return "Undefined argument type"; + + if (arg->type != rv->v.type && + arg->type != RTE_BPF_ARG_RAW && + (arg->type != RTE_BPF_ARG_PTR || + RTE_BPF_ARG_PTR_TYPE(rv->v.type) == 0)) + return "Invalid argument type"; + + err = NULL; + + /* argument is a pointer */ + if (RTE_BPF_ARG_PTR_TYPE(arg->type) != 0) { + + err = eval_ptr(bvf, rv, arg->size, 1, 0); + + /* + * pointer to the variable on the stack is passed + * as an argument, mark stack space it occupies as initialized. 
+ */ + if (err == NULL && rv->v.type == RTE_BPF_ARG_PTR_STACK) { + + i = rv->u.max / sizeof(uint64_t); + n = i + arg->size / sizeof(uint64_t); + while (i != n) { + eval_fill_max_bound(st->sv + i, UINT64_MAX); + i++; + }; + } + } + + return err; +} + +static const char * +eval_call(struct bpf_verifier *bvf, const struct ebpf_insn *ins) +{ + uint64_t msk; + uint32_t i, idx; + struct bpf_reg_val *rv; + const struct rte_bpf_xsym *xsym; + const char *err; + + idx = ins->imm; + + if (idx >= bvf->prm->nb_xsym || + bvf->prm->xsym[idx].type != RTE_BPF_XTYPE_FUNC) + return "invalid external function index"; + + /* for now don't support function calls on 32 bit platform */ + if (sizeof(uint64_t) != sizeof(uintptr_t)) + return "function calls are supported only for 64 bit apps"; + + xsym = bvf->prm->xsym + idx; + + /* evaluate function arguments */ + err = NULL; + for (i = 0; i != xsym->func.nb_args && err == NULL; i++) { + err = eval_func_arg(bvf, xsym->func.args + i, + bvf->evst->rv + EBPF_REG_1 + i); + } + + /* R1-R5 argument/scratch registers */ + for (i = EBPF_REG_1; i != EBPF_REG_6; i++) + bvf->evst->rv[i].v.type = RTE_BPF_ARG_UNDEF; + + /* update return value */ + + rv = bvf->evst->rv + EBPF_REG_0; + rv->v = xsym->func.ret; + msk = (rv->v.type == RTE_BPF_ARG_RAW) ? 
+		RTE_LEN2MASK(rv->v.size * CHAR_BIT, uint64_t) : UINTPTR_MAX;
+	eval_max_bound(rv, msk);
+	rv->mask = msk;
+
+	return err;
+}
+
+/*
+ * Narrow value ranges for the path on which dreg == sreg holds.
+ * If one operand is a constant (min == max) the other one takes its range,
+ * otherwise both operands end up with the intersection of their ranges.
+ * Unsigned (u) and signed (s) ranges are processed independently.
+ */
+static void
+eval_jeq_jne(struct bpf_reg_val *trd, struct bpf_reg_val *trs)
+{
+	/* sreg is constant */
+	if (trs->u.min == trs->u.max) {
+		trd->u = trs->u;
+	/* dreg is constant */
+	} else if (trd->u.min == trd->u.max) {
+		trs->u = trd->u;
+	} else {
+		trd->u.max = RTE_MIN(trd->u.max, trs->u.max);
+		trd->u.min = RTE_MAX(trd->u.min, trs->u.min);
+		trs->u = trd->u;
+	}
+
+	/* sreg is constant */
+	if (trs->s.min == trs->s.max) {
+		trd->s = trs->s;
+	/* dreg is constant */
+	} else if (trd->s.min == trd->s.max) {
+		trs->s = trd->s;
+	} else {
+		trd->s.max = RTE_MIN(trd->s.max, trs->s.max);
+		trd->s.min = RTE_MAX(trd->s.min, trs->s.min);
+		trs->s = trd->s;
+	}
+}
+
+/*
+ * Unsigned compare: trd/trs describe dreg/sreg on the path where
+ * dreg > sreg holds, frd/frs on the path where dreg <= sreg holds.
+ * Only the dreg ranges are updated.
+ * NOTE(review): the dreg <= sreg bound is taken from frs->u.min; for a
+ * non-constant sreg, frs->u.max looks like the sound limit - confirm.
+ */
+static void
+eval_jgt_jle(struct bpf_reg_val *trd, struct bpf_reg_val *trs,
+	struct bpf_reg_val *frd, struct bpf_reg_val *frs)
+{
+	frd->u.max = RTE_MIN(frd->u.max, frs->u.min);
+	trd->u.min = RTE_MAX(trd->u.min, trs->u.min + 1);
+}
+
+/*
+ * Unsigned compare: trd/trs describe dreg/sreg on the path where
+ * dreg < sreg holds, frd/frs on the path where dreg >= sreg holds.
+ * Only the dreg ranges are updated.
+ */
+static void
+eval_jlt_jge(struct bpf_reg_val *trd, struct bpf_reg_val *trs,
+	struct bpf_reg_val *frd, struct bpf_reg_val *frs)
+{
+	frd->u.min = RTE_MAX(frd->u.min, frs->u.min);
+	trd->u.max = RTE_MIN(trd->u.max, trs->u.max - 1);
+}
+
+/*
+ * Signed counterpart of eval_jgt_jle(), works on the signed ranges.
+ * NOTE(review): same question about frs->s.min vs frs->s.max applies.
+ */
+static void
+eval_jsgt_jsle(struct bpf_reg_val *trd, struct bpf_reg_val *trs,
+	struct bpf_reg_val *frd, struct bpf_reg_val *frs)
+{
+	frd->s.max = RTE_MIN(frd->s.max, frs->s.min);
+	trd->s.min = RTE_MAX(trd->s.min, trs->s.min + 1);
+}
+
+/*
+ * Signed counterpart of eval_jlt_jge(), works on the signed ranges.
+ */
+static void
+eval_jslt_jsge(struct bpf_reg_val *trd, struct bpf_reg_val *trs,
+	struct bpf_reg_val *frd, struct bpf_reg_val *frs)
+{
+	frd->s.min = RTE_MAX(frd->s.min, frs->s.min);
+	trd->s.max = RTE_MIN(trd->s.max, trs->s.max - 1);
+}
+
+/*
+ * Evaluate a conditional jump: refine register value ranges in both
+ * evaluation states attached to the instruction.
+ * 'tst' is the current state (bvf->evst), 'fst' is the state stored in the
+ * node being evaluated (bvf->evin->evst); the t-prefixed/f-prefixed
+ * register pointers refer to the condition-true/condition-false paths.
+ * For BPF_K forms the immediate is turned into a temporary constant value.
+ * Returns NULL on success or an error message from eval_defined().
+ * BPF_JSET is accepted but performs no range refinement.
+ */
+static const char *
+eval_jcc(struct bpf_verifier *bvf, const struct ebpf_insn *ins)
+{
+	uint32_t op;
+	const char *err;
+	struct bpf_eval_state *fst, *tst;
+	struct bpf_reg_val *frd, *frs, *trd, *trs;
+	struct bpf_reg_val rvf, rvt;
+
+	tst = bvf->evst;
+	fst = bvf->evin->evst;
+
+	frd = fst->rv + ins->dst_reg;
+	trd = tst->rv + ins->dst_reg;
+
+	if (BPF_SRC(ins->code) == BPF_X) {
+		frs = fst->rv + ins->src_reg;
+		trs = tst->rv + ins->src_reg;
+	} else {
+		frs = &rvf;
+		trs = &rvt;
+		eval_fill_imm(frs, UINT64_MAX, ins->imm);
+		eval_fill_imm(trs, UINT64_MAX, ins->imm);
+	}
+
+	err = eval_defined(trd, trs);
+	if (err != NULL)
+		return err;
+
+	op = BPF_OP(ins->code);
+
+	/*
+	 * each helper is shared by an opcode and its negation:
+	 * for the negated opcode true/false operands are swapped.
+	 */
+	if (op == BPF_JEQ)
+		eval_jeq_jne(trd, trs);
+	else if (op == EBPF_JNE)
+		eval_jeq_jne(frd, frs);
+	else if (op == BPF_JGT)
+		eval_jgt_jle(trd, trs, frd, frs);
+	else if (op == EBPF_JLE)
+		eval_jgt_jle(frd, frs, trd, trs);
+	else if (op == EBPF_JLT)
+		eval_jlt_jge(trd, trs, frd, frs);
+	else if (op == BPF_JGE)
+		eval_jlt_jge(frd, frs, trd, trs);
+	else if (op == EBPF_JSGT)
+		eval_jsgt_jsle(trd, trs, frd, frs);
+	else if (op == EBPF_JSLE)
+		eval_jsgt_jsle(frd, frs, trd, trs);
+	/* signed less-than: was a duplicated EBPF_JLT test, never reached */
+	else if (op == EBPF_JSLT)
+		eval_jslt_jsge(trd, trs, frd, frs);
+	else if (op == EBPF_JSGE)
+		eval_jslt_jsge(frd, frs, trd, trs);
+
+	return NULL;
+}
+
+/*
+ * validate parameters for each instruction type.
+ */
+/*
+ * The table is indexed by the 8-bit opcode, so it needs one entry for
+ * every possible uint8_t value (UINT8_MAX + 1), otherwise opcode 0xff
+ * would be looked up past the end of the array.
+ * Entries that are not listed stay zeroed; check_syntax() treats an
+ * empty dreg mask as "invalid opcode".
+ */
+static const struct bpf_ins_check ins_chk[UINT8_MAX + 1] = {
+	/* ALU IMM 32-bit instructions */
+	[(BPF_ALU | BPF_ADD | BPF_K)] = {
+		.mask = {.dreg = WRT_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = UINT32_MAX,},
+		.eval = eval_alu,
+	},
+	[(BPF_ALU | BPF_SUB | BPF_K)] = {
+		.mask = {.dreg = WRT_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = UINT32_MAX,},
+		.eval = eval_alu,
+	},
+	[(BPF_ALU | BPF_AND | BPF_K)] = {
+		.mask = {.dreg = WRT_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = UINT32_MAX,},
+		.eval = eval_alu,
+	},
+	[(BPF_ALU | BPF_OR | BPF_K)] = {
+		.mask = {.dreg = WRT_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = UINT32_MAX,},
+		.eval = eval_alu,
+	},
+	[(BPF_ALU | BPF_LSH | BPF_K)] = {
+		.mask = {.dreg = WRT_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = UINT32_MAX,},
+		.eval = eval_alu,
+	},
+	[(BPF_ALU | BPF_RSH | BPF_K)] = {
+		.mask = {.dreg = WRT_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = UINT32_MAX,},
+		.eval = eval_alu,
+	},
+	[(BPF_ALU | BPF_XOR | BPF_K)] = {
+		.mask = {.dreg = WRT_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = UINT32_MAX,},
+		.eval = eval_alu,
+	},
+	[(BPF_ALU | BPF_MUL | BPF_K)] = {
+		.mask = {.dreg = WRT_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = UINT32_MAX,},
+		.eval = eval_alu,
+	},
+	[(BPF_ALU | EBPF_MOV | BPF_K)] = {
+		.mask = {.dreg = WRT_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = UINT32_MAX,},
+		.eval = eval_alu,
+	},
+	/* division/modulo by a zero immediate is rejected: imm.min = 1 */
+	[(BPF_ALU | BPF_DIV | BPF_K)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 1, .max = UINT32_MAX},
+		.eval = eval_alu,
+	},
+	[(BPF_ALU | BPF_MOD | BPF_K)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 1, .max = UINT32_MAX},
+		.eval = eval_alu,
+	},
+	/* ALU IMM 64-bit instructions */
+	[(EBPF_ALU64 | BPF_ADD | BPF_K)] = {
+		.mask = {.dreg = WRT_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = UINT32_MAX,},
+		.eval = eval_alu,
+	},
+	[(EBPF_ALU64 | BPF_SUB | BPF_K)] = {
+		.mask = {.dreg = WRT_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = UINT32_MAX,},
+		.eval = eval_alu,
+	},
+	[(EBPF_ALU64 | BPF_AND | BPF_K)] = {
+		.mask = {.dreg = WRT_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = UINT32_MAX,},
+		.eval = eval_alu,
+	},
+	[(EBPF_ALU64 | BPF_OR | BPF_K)] = {
+		.mask = {.dreg = WRT_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = UINT32_MAX,},
+		.eval = eval_alu,
+	},
+	[(EBPF_ALU64 | BPF_LSH | BPF_K)] = {
+		.mask = {.dreg = WRT_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = UINT32_MAX,},
+		.eval = eval_alu,
+	},
+	[(EBPF_ALU64 | BPF_RSH | BPF_K)] = {
+		.mask = {.dreg = WRT_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = UINT32_MAX,},
+		.eval = eval_alu,
+	},
+	[(EBPF_ALU64 | EBPF_ARSH | BPF_K)] = {
+		.mask = {.dreg = WRT_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = UINT32_MAX,},
+		.eval = eval_alu,
+	},
+	[(EBPF_ALU64 | BPF_XOR | BPF_K)] = {
+		.mask = {.dreg = WRT_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = UINT32_MAX,},
+		.eval = eval_alu,
+	},
+	[(EBPF_ALU64 | BPF_MUL | BPF_K)] = {
+		.mask = {.dreg = WRT_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = UINT32_MAX,},
+		.eval = eval_alu,
+	},
+	[(EBPF_ALU64 | EBPF_MOV | BPF_K)] = {
+		.mask = {.dreg = WRT_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = UINT32_MAX,},
+		.eval = eval_alu,
+	},
+	[(EBPF_ALU64 | BPF_DIV | BPF_K)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 1, .max = UINT32_MAX},
+		.eval = eval_alu,
+	},
+	[(EBPF_ALU64 | BPF_MOD | BPF_K)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 1, .max = UINT32_MAX},
+		.eval = eval_alu,
+	},
+	/* ALU REG 32-bit instructions */
+	[(BPF_ALU | BPF_ADD | BPF_X)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_alu,
+	},
+	[(BPF_ALU | BPF_SUB | BPF_X)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_alu,
+	},
+	[(BPF_ALU | BPF_AND | BPF_X)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_alu,
+	},
+	[(BPF_ALU | BPF_OR | BPF_X)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_alu,
+	},
+	[(BPF_ALU | BPF_LSH | BPF_X)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_alu,
+	},
+	[(BPF_ALU | BPF_RSH | BPF_X)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_alu,
+	},
+	[(BPF_ALU | BPF_XOR | BPF_X)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_alu,
+	},
+	[(BPF_ALU | BPF_MUL | BPF_X)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_alu,
+	},
+	[(BPF_ALU | BPF_DIV | BPF_X)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_alu,
+	},
+	[(BPF_ALU | BPF_MOD | BPF_X)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_alu,
+	},
+	[(BPF_ALU | EBPF_MOV | BPF_X)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_alu,
+	},
+	[(BPF_ALU | BPF_NEG)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_alu,
+	},
+	/* byte-swap: imm is range-checked here, then by check_alu_bele */
+	[(BPF_ALU | EBPF_END | EBPF_TO_BE)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 16, .max = 64},
+		.check = check_alu_bele,
+		.eval = eval_bele,
+	},
+	[(BPF_ALU | EBPF_END | EBPF_TO_LE)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 16, .max = 64},
+		.check = check_alu_bele,
+		.eval = eval_bele,
+	},
+	/* ALU REG 64-bit instructions */
+	[(EBPF_ALU64 | BPF_ADD | BPF_X)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_alu,
+	},
+	[(EBPF_ALU64 | BPF_SUB | BPF_X)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_alu,
+	},
+	[(EBPF_ALU64 | BPF_AND | BPF_X)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_alu,
+	},
+	[(EBPF_ALU64 | BPF_OR | BPF_X)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_alu,
+	},
+	[(EBPF_ALU64 | BPF_LSH | BPF_X)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_alu,
+	},
+	[(EBPF_ALU64 | BPF_RSH | BPF_X)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_alu,
+	},
+	[(EBPF_ALU64 | EBPF_ARSH | BPF_X)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_alu,
+	},
+	[(EBPF_ALU64 | BPF_XOR | BPF_X)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_alu,
+	},
+	[(EBPF_ALU64 | BPF_MUL | BPF_X)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_alu,
+	},
+	[(EBPF_ALU64 | BPF_DIV | BPF_X)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_alu,
+	},
+	[(EBPF_ALU64 | BPF_MOD | BPF_X)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_alu,
+	},
+	[(EBPF_ALU64 | EBPF_MOV | BPF_X)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_alu,
+	},
+	[(EBPF_ALU64 | BPF_NEG)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_alu,
+	},
+	/* load instructions */
+	[(BPF_LDX | BPF_MEM | BPF_B)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_load,
+	},
+	[(BPF_LDX | BPF_MEM | BPF_H)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_load,
+	},
+	[(BPF_LDX | BPF_MEM | BPF_W)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_load,
+	},
+	[(BPF_LDX | BPF_MEM | EBPF_DW)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_load,
+	},
+	/* load 64 bit immediate value */
+	[(BPF_LD | BPF_IMM | EBPF_DW)] = {
+		.mask = { .dreg = WRT_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = UINT32_MAX},
+		.eval = eval_ld_imm64,
+	},
+	/* store REG instructions */
+	[(BPF_STX | BPF_MEM | BPF_B)] = {
+		.mask = { .dreg = ALL_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_store,
+	},
+	[(BPF_STX | BPF_MEM | BPF_H)] = {
+		.mask = { .dreg = ALL_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_store,
+	},
+	[(BPF_STX | BPF_MEM | BPF_W)] = {
+		.mask = { .dreg = ALL_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_store,
+	},
+	[(BPF_STX | BPF_MEM | EBPF_DW)] = {
+		.mask = { .dreg = ALL_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_store,
+	},
+	/* atomic add instructions */
+	[(BPF_STX | EBPF_XADD | BPF_W)] = {
+		.mask = { .dreg = ALL_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_store,
+	},
+	[(BPF_STX | EBPF_XADD | EBPF_DW)] = {
+		.mask = { .dreg = ALL_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_store,
+	},
+	/* store IMM instructions */
+	[(BPF_ST | BPF_MEM | BPF_B)] = {
+		.mask = { .dreg = ALL_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = UINT32_MAX},
+		.eval = eval_store,
+	},
+	[(BPF_ST | BPF_MEM | BPF_H)] = {
+		.mask = { .dreg = ALL_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = UINT32_MAX},
+		.eval = eval_store,
+	},
+	[(BPF_ST | BPF_MEM | BPF_W)] = {
+		.mask = { .dreg = ALL_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = UINT32_MAX},
+		.eval = eval_store,
+	},
+	[(BPF_ST | BPF_MEM | EBPF_DW)] = {
+		.mask = { .dreg = ALL_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = UINT32_MAX},
+		.eval = eval_store,
+	},
+	/* jump instruction */
+	[(BPF_JMP | BPF_JA)] = {
+		.mask = { .dreg = ZERO_REG, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = 0},
+	},
+	/* jcc IMM instructions */
+	[(BPF_JMP | BPF_JEQ | BPF_K)] = {
+		.mask = { .dreg = ALL_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = UINT32_MAX},
+		.eval = eval_jcc,
+	},
+	[(BPF_JMP | EBPF_JNE | BPF_K)] = {
+		.mask = { .dreg = ALL_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = UINT32_MAX},
+		.eval = eval_jcc,
+	},
+	[(BPF_JMP | BPF_JGT | BPF_K)] = {
+		.mask = { .dreg = ALL_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = UINT32_MAX},
+		.eval = eval_jcc,
+	},
+	[(BPF_JMP | EBPF_JLT | BPF_K)] = {
+		.mask = { .dreg = ALL_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = UINT32_MAX},
+		.eval = eval_jcc,
+	},
+	[(BPF_JMP | BPF_JGE | BPF_K)] = {
+		.mask = { .dreg = ALL_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = UINT32_MAX},
+		.eval = eval_jcc,
+	},
+	[(BPF_JMP | EBPF_JLE | BPF_K)] = {
+		.mask = { .dreg = ALL_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = UINT32_MAX},
+		.eval = eval_jcc,
+	},
+	[(BPF_JMP | EBPF_JSGT | BPF_K)] = {
+		.mask = { .dreg = ALL_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = UINT32_MAX},
+		.eval = eval_jcc,
+	},
+	[(BPF_JMP | EBPF_JSLT | BPF_K)] = {
+		.mask = { .dreg = ALL_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = UINT32_MAX},
+		.eval = eval_jcc,
+	},
+	[(BPF_JMP | EBPF_JSGE | BPF_K)] = {
+		.mask = { .dreg = ALL_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = UINT32_MAX},
+		.eval = eval_jcc,
+	},
+	[(BPF_JMP | EBPF_JSLE | BPF_K)] = {
+		.mask = { .dreg = ALL_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = UINT32_MAX},
+		.eval = eval_jcc,
+	},
+	[(BPF_JMP | BPF_JSET | BPF_K)] = {
+		.mask = { .dreg = ALL_REGS, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = UINT32_MAX},
+		.eval = eval_jcc,
+	},
+	/* jcc REG instructions */
+	[(BPF_JMP | BPF_JEQ | BPF_X)] = {
+		.mask = { .dreg = ALL_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_jcc,
+	},
+	[(BPF_JMP | EBPF_JNE | BPF_X)] = {
+		.mask = { .dreg = ALL_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_jcc,
+	},
+	[(BPF_JMP | BPF_JGT | BPF_X)] = {
+		.mask = { .dreg = ALL_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_jcc,
+	},
+	[(BPF_JMP | EBPF_JLT | BPF_X)] = {
+		.mask = { .dreg = ALL_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_jcc,
+	},
+	[(BPF_JMP | BPF_JGE | BPF_X)] = {
+		.mask = { .dreg = ALL_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_jcc,
+	},
+	[(BPF_JMP | EBPF_JLE | BPF_X)] = {
+		.mask = { .dreg = ALL_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_jcc,
+	},
+	[(BPF_JMP | EBPF_JSGT | BPF_X)] = {
+		.mask = { .dreg = ALL_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_jcc,
+	},
+	/* eval handler was missing here, unlike all other jcc entries */
+	[(BPF_JMP | EBPF_JSLT | BPF_X)] = {
+		.mask = { .dreg = ALL_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_jcc,
+	},
+	[(BPF_JMP | EBPF_JSGE | BPF_X)] = {
+		.mask = { .dreg = ALL_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_jcc,
+	},
+	[(BPF_JMP | EBPF_JSLE | BPF_X)] = {
+		.mask = { .dreg = ALL_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_jcc,
+	},
+	[(BPF_JMP | BPF_JSET | BPF_X)] = {
+		.mask = { .dreg = ALL_REGS, .sreg = ALL_REGS},
+		.off = { .min = 0, .max = UINT16_MAX},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_jcc,
+	},
+	/* call instruction */
+	[(BPF_JMP | EBPF_CALL)] = {
+		.mask = { .dreg = ZERO_REG, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = UINT32_MAX},
+		.eval = eval_call,
+	},
+	/* ret instruction */
+	[(BPF_JMP | EBPF_EXIT)] = {
+		.mask = { .dreg = ZERO_REG, .sreg = ZERO_REG},
+		.off = { .min = 0, .max = 0},
+		.imm = { .min = 0, .max = 0},
+		.eval = eval_exit,
+	},
+};
+
+/*
+ * make sure that instruction syntax is valid,
+ * and its fields don't violate particular instruction type restrictions.
+ */
+static const char *
+check_syntax(const struct ebpf_insn *ins)
+{
+	const struct bpf_ins_check *chk;
+	uint8_t code;
+	uint16_t ofs;
+	uint32_t val;
+
+	code = ins->code;
+	chk = ins_chk + code;
+
+	/* table entries that were never filled in have an empty dreg mask */
+	if (chk->mask.dreg == 0)
+		return "invalid opcode";
+
+	if ((chk->mask.dreg & 1 << ins->dst_reg) == 0)
+		return "invalid dst-reg field";
+
+	if ((chk->mask.sreg & 1 << ins->src_reg) == 0)
+		return "invalid src-reg field";
+
+	/* off and imm are range-checked through unsigned temporaries */
+	ofs = ins->off;
+	if (ofs < chk->off.min || ofs > chk->off.max)
+		return "invalid off field";
+
+	val = ins->imm;
+	if (val < chk->imm.min || val > chk->imm.max)
+		return "invalid imm field";
+
+	/* optional per-opcode extra check */
+	return (chk->check != NULL) ? chk->check(ins) : NULL;
+}
+
+/*
+ * helper function, translate node pointer into its position
+ * inside the verifier's node array (bvf->in).
+ */
+static uint32_t
+get_node_idx(const struct bpf_verifier *bvf, const struct inst_node *node)
+{
+	return node - bvf->in;
+}
+
+/*
+ * helper function, used to walk through constructed CFG:
+ * returns destination of the next not yet visited outgoing edge and
+ * advances node's edge cursor, or NULL when all edges were visited.
+ */
+static struct inst_node *
+get_next_node(struct bpf_verifier *bvf, struct inst_node *node)
+{
+	uint32_t cur;
+
+	cur = node->cur_edge;
+	if (cur == node->nb_edge)
+		return NULL;
+
+	node->cur_edge = cur + 1;
+	return &bvf->in[node->edge_dest[cur]];
+}
+
+/*
+ * helper function, change colour of the node and keep
+ * per-colour node counters up to date.
+ */
+static void
+set_node_colour(struct bpf_verifier *bvf, struct inst_node *node,
+	uint32_t new)
+{
+	bvf->node_colour[node->colour]--;
+	node->colour = new;
+	bvf->node_colour[new]++;
+}
+
+/*
+ * helper function, add new edge between two nodes.
+ * 'nidx' is the index of the destination instruction. It has to refer
+ * to an existing instruction: the node array holds nb_ins entries, so
+ * nidx == nb_ins (falling off the end of the program) is rejected too.
+ * Returns 0 on success, -EINVAL if the destination is out of the program
+ * or the node has no free edge slot left.
+ */
+static int
+add_edge(struct bpf_verifier *bvf, struct inst_node *node, uint32_t nidx)
+{
+	uint32_t ne;
+
+	if (nidx >= bvf->prm->nb_ins) {
+		RTE_BPF_LOG(ERR, "%s: program boundary violation at pc: %u, "
+			"next pc: %u\n",
+			__func__, get_node_idx(bvf, node), nidx);
+		return -EINVAL;
+	}
+
+	ne = node->nb_edge;
+	if (ne >= RTE_DIM(node->edge_dest)) {
+		RTE_BPF_LOG(ERR, "%s: internal error at pc: %u\n",
+			__func__, get_node_idx(bvf, node));
+		return -EINVAL;
+	}
+
+	node->edge_dest[ne] = nidx;
+	node->nb_edge = ne + 1;
+	return 0;
+}
+
+/*
+ * helper function, determine type of edge between two nodes.
+ * Has to be called right after get_next_node(): the edge being
+ * classified is the one just before node's edge cursor.
+ * The type is derived from the colour of the destination node and
+ * per-type edge counters are updated.
+ */
+static void
+set_edge_type(struct bpf_verifier *bvf, struct inst_node *node,
+	const struct inst_node *next)
+{
+	uint32_t ce, clr, type;
+
+	ce = node->cur_edge - 1;
+	clr = next->colour;
+
+	type = UNKNOWN_EDGE;
+
+	if (clr == WHITE)
+		type = TREE_EDGE;
+	else if (clr == GREY)
+		type = BACK_EDGE;
+	else if (clr == BLACK)
+		/*
+		 * in fact it could be either direct or cross edge,
+		 * but for now, we don't need to distinguish between them.
+		 */
+		type = CROSS_EDGE;
+
+	node->edge_type[ce] = type;
+	bvf->edge_type[type]++;
+}
+
+/*
+ * helper function, return the node recorded as predecessor
+ * (prev_node index) of the given one.
+ */
+static struct inst_node *
+get_prev_node(struct bpf_verifier *bvf, struct inst_node *node)
+{
+	return bvf->in + node->prev_node;
+}
+
+/*
+ * Depth-First Search (DFS) through previously constructed
+ * Control Flow Graph (CFG).
+ * Information collected at this path would be used later
+ * to determine is there any loops, and/or unreachable instructions.
+ */
+static void
+dfs(struct bpf_verifier *bvf)
+{
+	struct inst_node *cur, *child;
+
+	cur = bvf->in;
+	while (cur != NULL) {
+
+		/* first visit of the node */
+		if (cur->colour == WHITE)
+			set_node_colour(bvf, cur, GREY);
+
+		/* got back to an already finished node: walk is over */
+		if (cur->colour != GREY)
+			break;
+
+		/* classify edges till a not yet visited child shows up */
+		for (child = get_next_node(bvf, cur); child != NULL;
+				child = get_next_node(bvf, cur)) {
+			set_edge_type(bvf, cur, child);
+			if (child->colour == WHITE)
+				break;
+		}
+
+		if (child == NULL) {
+			/*
+			 * node and all its kids are done,
+			 * rewind edge cursor and step back to the parent
+			 */
+			set_node_colour(bvf, cur, BLACK);
+			cur->cur_edge = 0;
+			cur = get_prev_node(bvf, cur);
+			continue;
+		}
+
+		/* descend into the child, remember the way back */
+		child->prev_node = get_node_idx(bvf, cur);
+		cur = child;
+	}
+}
+
+/*
+ * report unreachable instructions:
+ * nodes left WHITE by dfs(), except for 64-bit immediate load opcodes.
+ */
+static void
+log_unreachable(const struct bpf_verifier *bvf)
+{
+	uint32_t pc;
+
+	for (pc = 0; pc != bvf->prm->nb_ins; pc++) {
+
+		if (bvf->in[pc].colour != WHITE)
+			continue;
+
+		if (bvf->prm->ins[pc].code == (BPF_LD | BPF_IMM | EBPF_DW))
+			continue;
+
+		RTE_BPF_LOG(ERR, "unreachable code at pc: %u;\n", pc);
+	}
+}
+
+/*
+ * report loops detected:
+ * every back edge that starts at a finished (BLACK) node.
+ */
+static void
+log_loop(const struct bpf_verifier *bvf)
+{
+	uint32_t pc, e;
+	const struct inst_node *nd;
+
+	for (pc = 0; pc != bvf->prm->nb_ins; pc++) {
+
+		nd = bvf->in + pc;
+		if (nd->colour != BLACK)
+			continue;
+
+		for (e = 0; e != nd->nb_edge; e++) {
+			if (nd->edge_type[e] != BACK_EDGE)
+				continue;
+			RTE_BPF_LOG(ERR,
+				"loop at pc:%u --> pc:%u;\n",
+				pc, nd->edge_dest[e]);
+		}
+	}
+}
+
+/*
+ * First pass goes through all instructions in the set, checks that each
+ * instruction is a valid one (correct syntax, valid field values, etc.)
+ * and constructs control flow graph (CFG).
+ * Then depth-first search is performed over the constructed graph.
+ * Programs with unreachable instructions and/or loops will be rejected.
+ */
+static int
+validate(struct bpf_verifier *bvf)
+{
+	int32_t ret;
+	uint32_t pc;
+	struct inst_node *nd;
+	const struct ebpf_insn *insn;
+	const char *msg;
+
+	ret = 0;
+	for (pc = 0; pc < bvf->prm->nb_ins; pc++) {
+
+		insn = bvf->prm->ins + pc;
+		nd = bvf->in + pc;
+
+		/* keep going after an error to report all bad instructions */
+		msg = check_syntax(insn);
+		if (msg != NULL) {
+			RTE_BPF_LOG(ERR, "%s: %s at pc: %u\n",
+				__func__, msg, pc);
+			ret |= -EINVAL;
+		}
+
+		/*
+		 * construct CFG, jcc nodes have two outgoing edges,
+		 * 'exit' nodes - none, all other nodes have exactly one
+		 * outgoing edge.
+		 */
+		switch (insn->code) {
+		case (BPF_JMP | EBPF_EXIT):
+			break;
+		case (BPF_JMP | BPF_JEQ | BPF_K):
+		case (BPF_JMP | EBPF_JNE | BPF_K):
+		case (BPF_JMP | BPF_JGT | BPF_K):
+		case (BPF_JMP | EBPF_JLT | BPF_K):
+		case (BPF_JMP | BPF_JGE | BPF_K):
+		case (BPF_JMP | EBPF_JLE | BPF_K):
+		case (BPF_JMP | EBPF_JSGT | BPF_K):
+		case (BPF_JMP | EBPF_JSLT | BPF_K):
+		case (BPF_JMP | EBPF_JSGE | BPF_K):
+		case (BPF_JMP | EBPF_JSLE | BPF_K):
+		case (BPF_JMP | BPF_JSET | BPF_K):
+		case (BPF_JMP | BPF_JEQ | BPF_X):
+		case (BPF_JMP | EBPF_JNE | BPF_X):
+		case (BPF_JMP | BPF_JGT | BPF_X):
+		case (BPF_JMP | EBPF_JLT | BPF_X):
+		case (BPF_JMP | BPF_JGE | BPF_X):
+		case (BPF_JMP | EBPF_JLE | BPF_X):
+		case (BPF_JMP | EBPF_JSGT | BPF_X):
+		case (BPF_JMP | EBPF_JSLT | BPF_X):
+		case (BPF_JMP | EBPF_JSGE | BPF_X):
+		case (BPF_JMP | EBPF_JSLE | BPF_X):
+		case (BPF_JMP | BPF_JSET | BPF_X):
+			/* edge 0: jump target, edge 1: fall through */
+			ret |= add_edge(bvf, nd, pc + insn->off + 1);
+			ret |= add_edge(bvf, nd, pc + 1);
+			bvf->nb_jcc_nodes++;
+			break;
+		case (BPF_JMP | BPF_JA):
+			ret |= add_edge(bvf, nd, pc + insn->off + 1);
+			break;
+		/* load 64 bit immediate value */
+		case (BPF_LD | BPF_IMM | EBPF_DW):
+			/* occupies two slots, the second one is skipped */
+			ret |= add_edge(bvf, nd, pc + 2);
+			pc++;
+			break;
+		default:
+			ret |= add_edge(bvf, nd, pc + 1);
+			break;
+		}
+
+		bvf->nb_nodes++;
+		bvf->node_colour[WHITE]++;
+	}
+
+	/* no point to walk through a malformed graph */
+	if (ret != 0)
+		return ret;
+
+	dfs(bvf);
+
+	RTE_BPF_LOG(DEBUG, "%s(%p) stats:\n"
+		"nb_nodes=%u;\n"
+		"nb_jcc_nodes=%u;\n"
+		"node_color={[WHITE]=%u, [GREY]=%u,, [BLACK]=%u};\n"
+		"edge_type={[UNKNOWN]=%u, [TREE]=%u, [BACK]=%u, [CROSS]=%u};\n",
+		__func__, bvf,
+		bvf->nb_nodes,
+		bvf->nb_jcc_nodes,
+		bvf->node_colour[WHITE], bvf->node_colour[GREY],
+		bvf->node_colour[BLACK],
+		bvf->edge_type[UNKNOWN_EDGE], bvf->edge_type[TREE_EDGE],
+		bvf->edge_type[BACK_EDGE], bvf->edge_type[CROSS_EDGE]);
+
+	/* every node has to be finished by the DFS */
+	if (bvf->node_colour[BLACK] != bvf->nb_nodes) {
+		RTE_BPF_LOG(ERR, "%s(%p) unreachable instructions;\n",
+			__func__, bvf);
+		log_unreachable(bvf);
+		return -EINVAL;
+	}
+
+	if (bvf->node_colour[GREY] != 0 || bvf->node_colour[WHITE] != 0 ||
+			bvf->edge_type[UNKNOWN_EDGE] != 0) {
+		RTE_BPF_LOG(ERR, "%s(%p) DFS internal error;\n",
+			__func__, bvf);
+		return -EINVAL;
+	}
+
+	/* any back edge means a loop */
+	if (bvf->edge_type[BACK_EDGE] != 0) {
+		RTE_BPF_LOG(ERR, "%s(%p) loops detected;\n",
+			__func__, bvf);
+		log_loop(bvf);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * helper functions get/free eval states.
+ * The pool is used as a stack: pull takes the next unused entry
+ * (NULL when the pool is exhausted), push gives the last one back.
+ */
+static struct bpf_eval_state *
+pull_eval_state(struct bpf_verifier *bvf)
+{
+	struct bpf_eval_state *st;
+
+	if (bvf->evst_pool.cur == bvf->evst_pool.num)
+		return NULL;
+
+	st = bvf->evst_pool.ent + bvf->evst_pool.cur;
+	bvf->evst_pool.cur++;
+	return st;
+}
+
+static void
+push_eval_state(struct bpf_verifier *bvf)
+{
+	bvf->evst_pool.cur--;
+}
+
+/*
+ * release the pool memory and reset the pool descriptor.
+ */
+static void
+evst_pool_fini(struct bpf_verifier *bvf)
+{
+	free(bvf->evst_pool.ent);
+	memset(&bvf->evst_pool, 0, sizeof(bvf->evst_pool));
+	bvf->evst = NULL;
+}
+
+/*
+ * allocate one eval state per jcc node plus one for the initial state,
+ * and make the first entry the current state.
+ * Returns 0 on success, -ENOMEM on allocation failure.
+ */
+static int
+evst_pool_init(struct bpf_verifier *bvf)
+{
+	uint32_t num;
+
+	num = bvf->nb_jcc_nodes + 1;
+
+	bvf->evst_pool.ent = calloc(num, sizeof(bvf->evst_pool.ent[0]));
+	if (bvf->evst_pool.ent == NULL)
+		return -ENOMEM;
+
+	bvf->evst_pool.cur = 0;
+	bvf->evst_pool.num = num;
+
+	bvf->evst = pull_eval_state(bvf);
+	return 0;
+}
+
+/*
+ * Save current eval state.
+ * A fresh state is pulled from the pool and filled with a copy of the
+ * current one; the original is parked in node->evst and the copy
+ * becomes the current state.
+ * Returns 0 on success, -ENOMEM if the pool is exhausted.
+ */
+static int
+save_eval_state(struct bpf_verifier *bvf, struct inst_node *node)
+{
+	struct bpf_eval_state *copy;
+
+	/* get new eval_state for this node */
+	copy = pull_eval_state(bvf);
+	if (copy == NULL) {
+		RTE_BPF_LOG(ERR,
+			"%s: internal error (out of space) at pc: %u\n",
+			__func__, get_node_idx(bvf, node));
+		return -ENOMEM;
+	}
+
+	/* duplicate current state into the new entry */
+	memcpy(copy, bvf->evst, sizeof(*copy));
+
+	/* park the original in the node, continue with the duplicate */
+	node->evst = bvf->evst;
+	bvf->evst = copy;
+
+	RTE_BPF_LOG(DEBUG, "%s(bvf=%p,node=%u) old/new states: %p/%p;\n",
+		__func__, bvf, get_node_idx(bvf, node), node->evst, bvf->evst);
+
+	return 0;
+}
+
+/*
+ * Restore previous eval state and mark current eval state as free:
+ * the state parked in the node becomes current again and the last
+ * pulled pool entry is given back.
+ */
+static void
+restore_eval_state(struct bpf_verifier *bvf, struct inst_node *node)
+{
+	RTE_BPF_LOG(DEBUG, "%s(bvf=%p,node=%u) old/new states: %p/%p;\n",
+		__func__, bvf, get_node_idx(bvf, node), bvf->evst, node->evst);
+
+	bvf->evst = node->evst;
+	push_eval_state(bvf);
+	node->evst = NULL;
+}
+
+/*
+ * dump value tracked for the destination register of the instruction
+ * (from the current eval state) at the given log level.
+ */
+static void
+log_eval_state(const struct bpf_verifier *bvf, const struct ebpf_insn *ins,
+	uint32_t pc, int32_t loglvl)
+{
+	const struct bpf_reg_val *rv;
+
+	rte_log(loglvl, rte_bpf_logtype, "%s(pc=%u):\n", __func__, pc);
+
+	rv = bvf->evst->rv + ins->dst_reg;
+
+	rte_log(loglvl, rte_bpf_logtype,
+		"r%u={\n"
+		"\tv={type=%u, size=%zu},\n"
+		"\tmask=0x%" PRIx64 ",\n"
+		"\tu={min=0x%" PRIx64 ", max=0x%" PRIx64 "},\n"
+		"\ts={min=%" PRId64 ", max=%" PRId64 "},\n"
+		"};\n",
+		ins->dst_reg,
+		rv->v.type, rv->v.size,
+		rv->mask,
+		rv->u.min, rv->u.max,
+		rv->s.min, rv->s.max);
+}
+
+/*
+ * Do second pass through CFG and try to evaluate instructions
+ * via each possible path.
+ * Right now evaluation functionality is quite limited.
+ * Still need to add extra checks for:
+ * - use/return uninitialized registers.
+ * - use uninitialized data from the stack.
+ * - memory boundaries violation.
+ */ +static int +evaluate(struct bpf_verifier *bvf) +{ + int32_t rc; + uint32_t idx, op; + const char *err; + const struct ebpf_insn *ins; + struct inst_node *next, *node; + + /* initial state of frame pointer */ + static const struct bpf_reg_val rvfp = { + .v = { + .type = RTE_BPF_ARG_PTR_STACK, + .size = MAX_BPF_STACK_SIZE, + }, + .mask = UINT64_MAX, + .u = {.min = MAX_BPF_STACK_SIZE, .max = MAX_BPF_STACK_SIZE}, + .s = {.min = MAX_BPF_STACK_SIZE, .max = MAX_BPF_STACK_SIZE}, + }; + + bvf->evst->rv[EBPF_REG_1].v = bvf->prm->prog_arg; + bvf->evst->rv[EBPF_REG_1].mask = UINT64_MAX; + if (bvf->prm->prog_arg.type == RTE_BPF_ARG_RAW) + eval_max_bound(bvf->evst->rv + EBPF_REG_1, UINT64_MAX); + + bvf->evst->rv[EBPF_REG_10] = rvfp; + + ins = bvf->prm->ins; + node = bvf->in; + next = node; + rc = 0; + + while (node != NULL && rc == 0) { + + /* + * current node evaluation, make sure we evaluate + * each node only once. + */ + if (next != NULL) { + + bvf->evin = node; + idx = get_node_idx(bvf, node); + op = ins[idx].code; + + /* for jcc node make a copy of evaluatoion state */ + if (node->nb_edge > 1) + rc |= save_eval_state(bvf, node); + + if (ins_chk[op].eval != NULL && rc == 0) { + err = ins_chk[op].eval(bvf, ins + idx); + if (err != NULL) { + RTE_BPF_LOG(ERR, "%s: %s at pc: %u\n", + __func__, err, idx); + rc = -EINVAL; + } + } + + log_eval_state(bvf, ins + idx, idx, RTE_LOG_DEBUG); + bvf->evin = NULL; + } + + /* proceed through CFG */ + next = get_next_node(bvf, node); + if (next != NULL) { + + /* proceed with next child */ + if (node->cur_edge == node->nb_edge && + node->evst != NULL) + restore_eval_state(bvf, node); + + next->prev_node = get_node_idx(bvf, node); + node = next; + } else { + /* + * finished with current node and all it's kids, + * proceed with parent + */ + node->cur_edge = 0; + node = get_prev_node(bvf, node); + + /* finished */ + if (node == bvf->in) + node = NULL; + } + } + + return rc; +} + +int +bpf_validate(struct rte_bpf *bpf) +{ + int32_t rc; + 
struct bpf_verifier bvf; + + /* + * check input argument type, don't allow mbuf ptr on 32-bit: + * raw and plain pointer arguments are always accepted, + * mbuf pointer only when uintptr_t is as wide as uint64_t. + */ + if (bpf->prm.prog_arg.type != RTE_BPF_ARG_RAW && + bpf->prm.prog_arg.type != RTE_BPF_ARG_PTR && + (sizeof(uint64_t) != sizeof(uintptr_t) || + bpf->prm.prog_arg.type != RTE_BPF_ARG_PTR_MBUF)) { + RTE_BPF_LOG(ERR, "%s: unsupported argument type\n", __func__); + return -ENOTSUP; + } + + memset(&bvf, 0, sizeof(bvf)); + bvf.prm = &bpf->prm; + /* one zero-initialised node per instruction */ + bvf.in = calloc(bpf->prm.nb_ins, sizeof(bvf.in[0])); + if (bvf.in == NULL) + return -ENOMEM; + + rc = validate(&bvf); + + if (rc == 0) { + rc = evst_pool_init(&bvf); + if (rc == 0) + rc = evaluate(&bvf); + /* the pool is finalised even if its initialisation failed */ + evst_pool_fini(&bvf); + } + + free(bvf.in); + + /* copy collected info */ + if (rc == 0) + bpf->stack_sz = bvf.stack_sz; + + return rc; +} diff --git a/lib/librte_bpf/meson.build b/lib/librte_bpf/meson.build new file mode 100644 index 00000000..bc0cd78f --- /dev/null +++ b/lib/librte_bpf/meson.build @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2018 Intel Corporation + +allow_experimental_apis = true +sources = files('bpf.c', + 'bpf_exec.c', + 'bpf_load.c', + 'bpf_pkt.c', + 'bpf_validate.c') + +if arch_subdir == 'x86' and cc.sizeof('void *') == 8 + sources += files('bpf_jit_x86.c') +endif + +install_headers = files('bpf_def.h', + 'rte_bpf.h', + 'rte_bpf_ethdev.h') + +deps += ['mbuf', 'net', 'ethdev'] + +dep = cc.find_library('elf', required: false) +if dep.found() == true and cc.has_header('libelf.h', dependencies: dep) + sources += files('bpf_load_elf.c') + ext_deps += dep +endif diff --git a/lib/librte_bpf/rte_bpf.h b/lib/librte_bpf/rte_bpf.h new file mode 100644 index 00000000..ad62ef2c --- /dev/null +++ b/lib/librte_bpf/rte_bpf.h @@ -0,0 +1,203 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#ifndef _RTE_BPF_H_ +#define _RTE_BPF_H_ + +/** + * @file rte_bpf.h + * @b EXPERIMENTAL: this API may change without prior notice + * + * RTE BPF support. 
+ * librte_bpf provides a framework to load and execute eBPF bytecode + * inside user-space dpdk based applications. + * It supports basic set of features from eBPF spec + * (https://www.kernel.org/doc/Documentation/networking/filter.txt). + */ + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Possible types for function/BPF program arguments. + */ +enum rte_bpf_arg_type { + RTE_BPF_ARG_UNDEF, /**< undefined */ + RTE_BPF_ARG_RAW, /**< scalar value */ + RTE_BPF_ARG_PTR = 0x10, /**< pointer to data buffer */ + RTE_BPF_ARG_PTR_MBUF, /**< pointer to rte_mbuf */ + RTE_BPF_ARG_PTR_STACK, + /**< pointer to stack (type given to the frame pointer by the verifier) */ +}; + +/** + * function argument information + */ +struct rte_bpf_arg { + enum rte_bpf_arg_type type; + /** + * for ptr type - max size of data buffer it points to + * for raw type - the size (in bytes) of the value + */ + size_t size; + size_t buf_size; + /**< for mbuf ptr type, max size of rte_mbuf data buffer */ +}; + +/** + * determine whether an argument type is a pointer type + * (yields non-zero for the RTE_BPF_ARG_PTR* types) + */ +#define RTE_BPF_ARG_PTR_TYPE(x) ((x) & RTE_BPF_ARG_PTR) + +/** + * Possible types for external symbols. + */ +enum rte_bpf_xtype { + RTE_BPF_XTYPE_FUNC, /**< function */ + RTE_BPF_XTYPE_VAR, /**< variable */ + RTE_BPF_XTYPE_NUM +}; + +/** + * Definition for external symbols available in the BPF program. + */ +struct rte_bpf_xsym { + const char *name; /**< name */ + enum rte_bpf_xtype type; /**< type */ + union { + struct { + uint64_t (*val)(uint64_t, uint64_t, uint64_t, + uint64_t, uint64_t); + uint32_t nb_args; + struct rte_bpf_arg args[EBPF_FUNC_MAX_ARGS]; + /**< Function arguments descriptions. */ + struct rte_bpf_arg ret; /**< function return value. */ + } func; /**< external function */ + struct { + void *val; /**< actual memory location */ + struct rte_bpf_arg desc; /**< type, size, etc. */ + } var; /**< external variable */ + }; +}; + +/** + * Input parameters for loading eBPF code. 
+ */ +struct rte_bpf_prm { + const struct ebpf_insn *ins; /**< array of eBPF instructions */ + uint32_t nb_ins; /**< number of instructions in ins */ + const struct rte_bpf_xsym *xsym; + /**< array of external symbols that eBPF code is allowed to reference */ + uint32_t nb_xsym; /**< number of elements in xsym */ + struct rte_bpf_arg prog_arg; /**< eBPF program input arg description */ +}; + +/** + * Information about eBPF code compiled into native ISA. + */ +struct rte_bpf_jit { + uint64_t (*func)(void *); /**< JIT-ed native code */ + size_t sz; /**< size of JIT-ed code */ +}; + +/** + * BPF execution context; this header only forward-declares it. + */ +struct rte_bpf; + +/** + * De-allocate all memory used by this eBPF execution context. + * + * @param bpf + * BPF handle to destroy. + */ +void __rte_experimental +rte_bpf_destroy(struct rte_bpf *bpf); + +/** + * Create a new eBPF execution context and load given BPF code into it. + * + * @param prm + * Parameters used to create and initialise the BPF execution context. + * @return + * BPF handle that is used in future BPF operations, + * or NULL on error, with error code set in rte_errno. + * Possible rte_errno errors include: + * - EINVAL - invalid parameter passed to function + * - ENOMEM - can't reserve enough memory + */ +struct rte_bpf * __rte_experimental +rte_bpf_load(const struct rte_bpf_prm *prm); + +/** + * Create a new eBPF execution context and load BPF code from given ELF + * file into it. + * + * @param prm + * Parameters used to create and initialise the BPF execution context. + * @param fname + * Pathname for an ELF file. + * @param sname + * Name of the executable section within the file to load. + * @return + * BPF handle that is used in future BPF operations, + * or NULL on error, with error code set in rte_errno. 
+ * Possible rte_errno errors include: + * - EINVAL - invalid parameter passed to function + * - ENOMEM - can't reserve enough memory + */ +struct rte_bpf * __rte_experimental +rte_bpf_elf_load(const struct rte_bpf_prm *prm, const char *fname, + const char *sname); +/** + * Execute given BPF bytecode. + * + * @param bpf + * handle for the BPF code to execute. + * @param ctx + * pointer to input context. + * @return + * BPF execution return value. + */ +uint64_t __rte_experimental +rte_bpf_exec(const struct rte_bpf *bpf, void *ctx); + +/** + * Execute given BPF bytecode over a set of input contexts. + * + * @param bpf + * handle for the BPF code to execute. + * @param ctx + * array of pointers to the input contexts. + * @param rc + * array of return values (one per input). + * @param num + * number of elements in ctx[] (and rc[]). + * @return + * number of successfully processed inputs. + */ +uint32_t __rte_experimental +rte_bpf_exec_burst(const struct rte_bpf *bpf, void *ctx[], uint64_t rc[], + uint32_t num); + +/** + * Provide information about natively compiled code for given BPF handle. + * + * @param bpf + * handle for the BPF code. + * @param jit + * pointer to the rte_bpf_jit structure to be filled with related data. + * @return + * - -EINVAL if the parameters are invalid. + * - Zero if operation completed successfully. 
+ */ +int __rte_experimental +rte_bpf_get_jit(const struct rte_bpf *bpf, struct rte_bpf_jit *jit); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_BPF_H_ */ diff --git a/lib/librte_bpf/rte_bpf_ethdev.h b/lib/librte_bpf/rte_bpf_ethdev.h new file mode 100644 index 00000000..31731e7a --- /dev/null +++ b/lib/librte_bpf/rte_bpf_ethdev.h @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#ifndef _RTE_BPF_ETHDEV_H_ +#define _RTE_BPF_ETHDEV_H_ + +/** + * @file rte_bpf_ethdev.h + * @b EXPERIMENTAL: this API may change without prior notice + * + * API to install BPF filter as RX/TX callbacks for eth devices. + * Note that right now: + * - it is not MT safe, i.e. it is not allowed to do load/unload for the + * same port/queue from different threads in parallel. + * - though it allows to do load/unload at runtime + * (while RX/TX is ongoing on given port/queue). + * - allows only one BPF program per port/queue, + * i.e. new load will replace previously loaded for that port/queue BPF program. + * Filter behaviour - if BPF program returns zero value for a given packet, + * then: + * on RX - it will be dropped inside callback and no further processing + * for that packet will happen. + * on TX - packet will remain unsent, and it is the responsibility of the user + * to handle such a situation (drop, try to send again, etc.). + */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +enum { + RTE_BPF_ETH_F_NONE = 0, + RTE_BPF_ETH_F_JIT = 0x1, /**< use code compiled into native ISA */ +}; + +/** + * Unload previously loaded BPF program (if any) from given RX port/queue + * and remove appropriate RX port/queue callback. 
+ * + * @param port + * The identifier of the ethernet port + * @param queue + * The identifier of the RX queue on the given port + */ +void __rte_experimental +rte_bpf_eth_rx_unload(uint16_t port, uint16_t queue); + +/** + * Unload previously loaded BPF program (if any) from given TX port/queue + * and remove appropriate TX port/queue callback. + * + * @param port + * The identifier of the ethernet port + * @param queue + * The identifier of the TX queue on the given port + */ +void __rte_experimental +rte_bpf_eth_tx_unload(uint16_t port, uint16_t queue); + +/** + * Load BPF program from the ELF file and install callback to execute it + * on given RX port/queue. + * + * @param port + * The identifier of the ethernet port + * @param queue + * The identifier of the RX queue on the given port + * @param fname + * Pathname for an ELF file. + * @param sname + * Name of the executable section within the file to load. + * @param prm + * Parameters used to create and initialise the BPF execution context. + * @param flags + * Flags that define expected behavior of the loaded filter + * (i.e. jited/non-jited version to use). + * @return + * Zero on successful completion or negative error code otherwise. + */ +int __rte_experimental +rte_bpf_eth_rx_elf_load(uint16_t port, uint16_t queue, + const struct rte_bpf_prm *prm, const char *fname, const char *sname, + uint32_t flags); + +/** + * Load BPF program from the ELF file and install callback to execute it + * on given TX port/queue. + * + * @param port + * The identifier of the ethernet port + * @param queue + * The identifier of the TX queue on the given port + * @param fname + * Pathname for an ELF file. + * @param sname + * Name of the executable section within the file to load. + * @param prm + * Parameters used to create and initialise the BPF execution context. + * @param flags + * Flags that define expected behavior of the loaded filter + * (i.e. jited/non-jited version to use). 
+ * @return + * Zero on successful completion or negative error code otherwise. + */ +int __rte_experimental +rte_bpf_eth_tx_elf_load(uint16_t port, uint16_t queue, + const struct rte_bpf_prm *prm, const char *fname, const char *sname, + uint32_t flags); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_BPF_ETHDEV_H_ */ diff --git a/lib/librte_bpf/rte_bpf_version.map b/lib/librte_bpf/rte_bpf_version.map new file mode 100644 index 00000000..a203e088 --- /dev/null +++ b/lib/librte_bpf/rte_bpf_version.map @@ -0,0 +1,16 @@ +EXPERIMENTAL { + global: + + rte_bpf_destroy; + rte_bpf_elf_load; + rte_bpf_eth_rx_elf_load; + rte_bpf_eth_rx_unload; + rte_bpf_eth_tx_elf_load; + rte_bpf_eth_tx_unload; + rte_bpf_exec; + rte_bpf_exec_burst; + rte_bpf_get_jit; + rte_bpf_load; + + local: *; +}; -- cgit 1.2.3-korg