Diffstat (limited to 'app/nginx/src/tldk')
 app/nginx/src/tldk/be.c         | 1240
 app/nginx/src/tldk/be.h         |   56
 app/nginx/src/tldk/debug.h      |   75
 app/nginx/src/tldk/module.c     |  497
 app/nginx/src/tldk/ngx_tldk.h   |  182
 app/nginx/src/tldk/parse.c      |  456
 app/nginx/src/tldk/tldk_event.c |  276
 app/nginx/src/tldk/tldk_sock.c  |  549
 app/nginx/src/tldk/tldk_sock.h  |  108
 9 files changed, 3439 insertions(+), 0 deletions(-)
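As a usage reference for the diffs below: module.c registers two top-level nginx.conf block directives, tldk_main and tldk_ctx, whose contents are parsed by parse.c. A minimal configuration sketch follows; the directive grammar is taken from parse.c, while all concrete values (core list, port id, addresses, mask, MAC, buffer counts) are illustrative assumptions only:

    tldk_main {
        # tokens are handed verbatim to rte_eal_init(); "eal_cmd" itself becomes argv[0]
        eal_cmd -l 0-1 -n 4;
        # one DPDK port: id 0, MTU, RX offload mask, local IPv4 address
        port 0 mtu 1514 rx_offload 0x0 ipv4 192.0.2.1;
    }

    tldk_ctx {
        worker 0;               # nginx worker this context serves
        lcore 1;                # BE lcore that polls this context
        mbufs 0x20000;
        streams 0x1000;
        rbufs 0x100;
        sbufs 0x100;
        tcp_timewait 2000;      # TCP TIME_WAIT, in milliseconds
        dev 0 port 0 queue 0;   # tle_dev bound to port 0 / queue 0
        dest dev 0 addr 192.0.2.254 masklen 24 mac 02:00:00:00:00:01;
    }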
diff --git a/app/nginx/src/tldk/be.c b/app/nginx/src/tldk/be.c new file mode 100644 index 0000000..ba4039a --- /dev/null +++ b/app/nginx/src/tldk/be.c @@ -0,0 +1,1240 @@ +/* + * Copyright (c) 2017 Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <assert.h> +#include <netinet/ip6.h> + +#include <rte_version.h> +#include <rte_cycles.h> +#include <rte_ethdev.h> +#include <rte_errno.h> +#include <rte_lpm6.h> +#include <rte_lpm.h> +#include <rte_ip.h> +#include <rte_tcp.h> + +#include <tle_tcp.h> + +#include <ngx_config.h> +#include <ngx_core.h> + +#include "be.h" + +#define RX_RING_SIZE 0x400 +#define TX_RING_SIZE 0x800 +#define MAX_RULES 0x100 +#define MAX_TBL8 0x800 + +#define MPOOL_CACHE_SIZE 0x100 +#define MPOOL_NB_BUF 0x20000 + +#define FRAG_MBUF_BUF_SIZE (RTE_PKTMBUF_HEADROOM + TLE_DST_MAX_HDR) + +#define RX_CSUM_OFFLOAD (DEV_RX_OFFLOAD_IPV4_CKSUM | DEV_RX_OFFLOAD_TCP_CKSUM) + +#define TCP_MAX_PROCESS 0x20 + +static const struct rte_eth_conf port_conf_default = { + .rxmode = { + .hw_vlan_strip = 1, + }, +}; + +struct ptype2cb { + uint32_t mask; + const char *name; + rte_rx_callback_fn fn; +}; + +enum { + ETHER_PTYPE = 0x1, + IPV4_PTYPE = 0x2, + IPV4_EXT_PTYPE = 0x4, + IPV6_PTYPE = 0x8, + IPV6_EXT_PTYPE = 0x10, + TCP_PTYPE = 0x20, + UDP_PTYPE = 0x40, +}; + +int +be_lcore_lpm_init(struct tldk_ctx *tcx, uint32_t sid, + const struct tldk_ctx_conf *cf) +{ + ngx_uint_t worker = cf->worker; + uint32_t lcore = cf->lcore; + char str[RTE_LPM_NAMESIZE]; + + const struct rte_lpm_config lpm4_cfg = { + .max_rules = MAX_RULES, + .number_tbl8s = MAX_TBL8, + }; + + const struct rte_lpm6_config lpm6_cfg = { + .max_rules = MAX_RULES, + .number_tbl8s = MAX_TBL8, + }; + + snprintf(str, sizeof(str), "LPM4%lu-%u\n", worker, lcore); + tcx->lpm4 = rte_lpm_create(str, sid, &lpm4_cfg); + RTE_LOG(NOTICE, USER1, "%s(worker=%lu, lcore=%u): lpm4=%p;\n", + __func__, worker, lcore, tcx->lpm4); + if (tcx->lpm4 == NULL) + return -ENOMEM; + + snprintf(str, sizeof(str), "LPM6%lu-%u\n", worker, lcore); + tcx->lpm6 = rte_lpm6_create(str, sid, &lpm6_cfg); + RTE_LOG(NOTICE, USER1, "%s(worker=%lu, lcore=%u): lpm6=%p;\n", + __func__, worker, lcore, tcx->lpm6); + if (tcx->lpm6 == NULL) { + rte_lpm_free(tcx->lpm4); + return -ENOMEM; + } + + return 
0; +} + +int +be_lpm4_dst_lookup(void *data, const struct in_addr *addr, + struct tle_dest *res) +{ + int32_t rc; + uint32_t idx; + struct tldk_ctx *tcx; + struct tle_dest *dst; + + tcx = data; + rc = rte_lpm_lookup(tcx->lpm4, rte_be_to_cpu_32(addr->s_addr), &idx); + if (rc == 0) { + dst = &tcx->dst4[idx]; + memcpy(res, dst, dst->l2_len + dst->l3_len + + offsetof(struct tle_dest, hdr)); + } + + return rc; +} + +int +be_lpm6_dst_lookup(void *data, const struct in6_addr *addr, + struct tle_dest *res) +{ + int32_t rc; + struct tldk_ctx *tcx; + struct tle_dest *dst; + uintptr_t p; +#if RTE_VERSION_NUM(17, 5, 0, 0) <= RTE_VERSION + uint32_t idx; +#else + uint8_t idx; +#endif + + tcx = data; + p = (uintptr_t)addr->s6_addr; + rc = rte_lpm6_lookup(tcx->lpm6, (uint8_t *)p, &idx); + if (rc == 0) { + dst = &tcx->dst6[idx]; + memcpy(res, dst, dst->l2_len + dst->l3_len + + offsetof(struct tle_dest, hdr)); + } + + return rc; +} + +/* + * Initialise DPDK port. + */ +static int +port_init(const struct tldk_port_conf *pcf) +{ + int32_t rc; + struct rte_eth_conf port_conf; + struct rte_eth_dev_info dev_info; + + rte_eth_dev_info_get(pcf->id, &dev_info); + + if ((dev_info.rx_offload_capa & pcf->rx_offload) != pcf->rx_offload) { + RTE_LOG(ERR, USER1, + "port#%u supported/requested RX offloads don't match, " + "supported: %#x, requested: %#x;\n", + pcf->id, dev_info.rx_offload_capa, pcf->rx_offload); + return NGX_ERROR; + } + if ((dev_info.tx_offload_capa & pcf->tx_offload) != pcf->tx_offload) { + RTE_LOG(ERR, USER1, + "port#%u supported/requested TX offloads don't match, " + "supported: %#x, requested: %#x;\n", + pcf->id, dev_info.tx_offload_capa, pcf->tx_offload); + return NGX_ERROR; + } + + port_conf = port_conf_default; + + if ((pcf->rx_offload & RX_CSUM_OFFLOAD) != 0) { + RTE_LOG(ERR, USER1, "%s(%u): enabling RX csum offload;\n", + __func__, pcf->id); + port_conf.rxmode.hw_ip_checksum = 1; + } + + port_conf.rxmode.max_rx_pkt_len = pcf->mtu + ETHER_CRC_LEN; + if (port_conf.rxmode.max_rx_pkt_len > ETHER_MAX_LEN) + port_conf.rxmode.jumbo_frame = 1; + port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS; + port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_IP | ETH_RSS_TCP; + + rc = rte_eth_dev_configure(pcf->id, pcf->nb_queues, pcf->nb_queues, + &port_conf); + RTE_LOG(NOTICE, USER1, + "%s: rte_eth_dev_configure(prt_id=%u, nb_rxq=%u, nb_txq=%u) " + "returns %d;\n", __func__, pcf->id, pcf->nb_queues, + pcf->nb_queues, rc); + + if (rc != 0) + return NGX_ERROR; + + return NGX_OK; +} + +/* + * Check that lcore is enabled, not master, and not in use already. + */ +int +be_check_lcore(uint32_t lid) +{ + if (rte_lcore_is_enabled(lid) == 0) { + RTE_LOG(ERR, USER1, "lcore %u is not enabled\n", lid); + return -EINVAL; + } + + if (rte_get_master_lcore() != lid && + rte_eal_get_lcore_state(lid) == RUNNING) { + RTE_LOG(ERR, USER1, "lcore %u already running %p\n", + lid, lcore_config[lid].f); + return -EINVAL; + } + + return 0; +} + +int +be_mpool_init(struct tldk_ctx *tcx) +{ + int32_t rc; + uint32_t nmb, sid; + struct rte_mempool *mp; + char name[RTE_MEMPOOL_NAMESIZE]; + + ngx_uint_t worker = tcx->cf->worker; + uint32_t lcore = tcx->cf->lcore; + + sid = rte_lcore_to_socket_id(tcx->cf->lcore); + nmb = (tcx->cf->nb_mbuf == 0) ? 
MPOOL_NB_BUF : tcx->cf->nb_mbuf; + + snprintf(name, sizeof(name), "MP%lu-%u", worker, lcore); + mp = rte_pktmbuf_pool_create(name, nmb, MPOOL_CACHE_SIZE, 0, + RTE_MBUF_DEFAULT_BUF_SIZE, sid); + if (mp == NULL) { + rc = -rte_errno; + RTE_LOG(ERR, USER1, "%s:Mempool creation failed for " + "ctx:wrk(%lu)-ctx:lcore(%u) with error code: %d\n", + __func__, worker, lcore, rc); + return rc; + } + + tcx->mpool = mp; + + snprintf(name, sizeof(name), "frag_MP%lu-%u", + worker, lcore); + mp = rte_pktmbuf_pool_create(name, nmb, + MPOOL_CACHE_SIZE, 0, FRAG_MBUF_BUF_SIZE, sid - 1); + if (mp == NULL) { + rc = -rte_errno; + RTE_LOG(ERR, USER1, "%s:Frag mempool creation failed for " + "ctx:wrk(%lu)-ctx:lcore(%u) with error code: %d\n", + __func__, worker, lcore, rc); + return rc; + } + + tcx->frag_mpool = mp; + + return 0; +} + +int +be_queue_init(struct tldk_ctx *tcx, const tldk_conf_t *cf) +{ + int32_t socket, rc; + uint16_t queue_id; + uint32_t port_id, i; + struct rte_eth_dev_info dev_info; + const struct tldk_ctx_conf *ctx; + const struct tldk_port_conf *pcf; + + ctx = tcx->cf; + for (i = 0; i < ctx->nb_dev; i++) { + port_id = ctx->dev[i].port; + queue_id = ctx->dev[i].queue; + pcf = &cf->port[port_id]; + + rte_eth_dev_info_get(port_id, &dev_info); + dev_info.default_rxconf.rx_drop_en = 1; + dev_info.default_txconf.tx_free_thresh = TX_RING_SIZE / 2; + + if (pcf->tx_offload != 0) { + RTE_LOG(ERR, USER1, + "%s(port=%u): enabling full featured TX;\n", + __func__, port_id); + dev_info.default_txconf.txq_flags = 0; + } + + socket = rte_eth_dev_socket_id(port_id); + + rc = rte_eth_rx_queue_setup(port_id, queue_id, RX_RING_SIZE, + socket, &dev_info.default_rxconf, tcx->mpool); + if (rc < 0) { + RTE_LOG(ERR, USER1, + "%s: rx queue=%u setup failed with error " + "code: %d\n", __func__, queue_id, rc); + return rc; + } + + rc = rte_eth_tx_queue_setup(port_id, queue_id, TX_RING_SIZE, + socket, &dev_info.default_txconf); + if (rc < 0) { + RTE_LOG(ERR, USER1, + "%s: tx queue=%u setup failed with error " + "code: %d\n", __func__, queue_id, rc); + return rc; + } + } + + return 0; +} + +/* + * Setup all enabled ports. 
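+ * For each configured port: apply offload/MTU/RSS settings via port_init(),
+ * read the port MAC address back into the config and enable promiscuous mode.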
+ */
+int
+be_port_init(tldk_conf_t *cf)
+{
+	int32_t rc;
+	uint32_t i;
+	struct tldk_port_conf *dpf;
+
+	for (i = 0; i != cf->nb_port; i++) {
+		dpf = &cf->port[i];
+		rc = port_init(dpf);
+		if (rc != 0) {
+			RTE_LOG(ERR, USER1,
+				"%s: port=%u init failed with error code: %d\n",
+				__func__, dpf->id, rc);
+			return NGX_ERROR;
+		}
+		rte_eth_macaddr_get(dpf->id, &dpf->mac);
+		rte_eth_promiscuous_enable(dpf->id);
+	}
+
+	return NGX_OK;
+}
+
+static int
+be_add_ipv4_route(struct tldk_ctx *tcx, const struct tldk_dest_conf *dcf,
+	uint8_t idx)
+{
+	int32_t rc;
+	uint32_t addr, depth;
+	char str[INET_ADDRSTRLEN];
+
+	depth = dcf->prfx;
+	addr = rte_be_to_cpu_32(dcf->ipv4.s_addr);
+
+	inet_ntop(AF_INET, &dcf->ipv4, str, sizeof(str));
+	rc = rte_lpm_add(tcx->lpm4, addr, depth, idx);
+	RTE_LOG(NOTICE, USER1, "%s(lcore=%u,dev_id=%u,dev=%p,"
+		"ipv4=%s/%u,mtu=%u,"
+		"mac=%02hhx:%02hhx:%02hhx:%02hhx:%02hhx:%02hhx) "
+		"returns %d;\n",
+		__func__, tcx->cf->lcore, dcf->dev, tcx->dst4[idx].dev,
+		str, depth, tcx->dst4[idx].mtu,
+		dcf->mac.addr_bytes[0], dcf->mac.addr_bytes[1],
+		dcf->mac.addr_bytes[2], dcf->mac.addr_bytes[3],
+		dcf->mac.addr_bytes[4], dcf->mac.addr_bytes[5],
+		rc);
+
+	return rc;
+}
+
+static int
+be_add_ipv6_route(struct tldk_ctx *tcx, const struct tldk_dest_conf *dcf,
+	uint8_t idx)
+{
+	int32_t rc;
+	uint32_t depth;
+	char str[INET6_ADDRSTRLEN];
+
+	depth = dcf->prfx;
+
+	rc = rte_lpm6_add(tcx->lpm6, (uint8_t *)(uintptr_t)dcf->ipv6.s6_addr,
+		depth, idx);
+
+	inet_ntop(AF_INET6, &dcf->ipv6, str, sizeof(str));
+	RTE_LOG(NOTICE, USER1, "%s(lcore=%u,dev_id=%u,dev=%p,"
+		"ipv6=%s/%u,mtu=%u,"
+		"mac=%02hhx:%02hhx:%02hhx:%02hhx:%02hhx:%02hhx) "
+		"returns %d;\n",
+		__func__, tcx->cf->lcore, dcf->dev, tcx->dst6[idx].dev,
+		str, depth, tcx->dst6[idx].mtu,
+		dcf->mac.addr_bytes[0], dcf->mac.addr_bytes[1],
+		dcf->mac.addr_bytes[2], dcf->mac.addr_bytes[3],
+		dcf->mac.addr_bytes[4], dcf->mac.addr_bytes[5],
+		rc);
+
+	return rc;
+}
+
+static void
+fill_dst(struct tle_dest *dst, const struct tldk_dev *td,
+	const struct tldk_port_conf *pcf, const struct tldk_dest_conf *dest,
+	uint16_t l3_type, struct rte_mempool *mp)
+{
+	struct ether_hdr *eth;
+	struct ipv4_hdr *ip4h;
+	struct ipv6_hdr *ip6h;
+
+	dst->dev = td->dev;
+	dst->head_mp = mp;
+	dst->mtu = RTE_MIN(dest->mtu, pcf->mtu);
+	dst->l2_len = sizeof(*eth);
+
+	eth = (struct ether_hdr *)dst->hdr;
+
+	ether_addr_copy(&pcf->mac, &eth->s_addr);
+	ether_addr_copy(&dest->mac, &eth->d_addr);
+	eth->ether_type = rte_cpu_to_be_16(l3_type);
+
+	if (l3_type == ETHER_TYPE_IPv4) {
+		dst->l3_len = sizeof(*ip4h);
+		ip4h = (struct ipv4_hdr *)(eth + 1);
+		ip4h->version_ihl = 4 << 4 |
+			sizeof(*ip4h) / IPV4_IHL_MULTIPLIER;
+		ip4h->time_to_live = 64;
+		ip4h->next_proto_id = IPPROTO_TCP;
+	} else if (l3_type == ETHER_TYPE_IPv6) {
+		dst->l3_len = sizeof(*ip6h);
+		ip6h = (struct ipv6_hdr *)(eth + 1);
+		ip6h->vtc_flow = 6 << 4;
+		ip6h->proto = IPPROTO_TCP;
+		ip6h->hop_limits = 64;
+	}
+}
+
+static int
+be_add_dest(const struct tldk_dest_conf *dcf, struct tldk_ctx *tcx,
+	uint32_t dev_idx, const struct tldk_port_conf *pcf, uint32_t family,
+	uint32_t dnum)
+{
+	struct tle_dest *dp;
+	uint32_t i, n, m;
+	uint16_t l3_type;
+	int32_t rc = 0;
+
+	if (family == AF_INET) {
+		n = tcx->dst4_num;
+		dp = tcx->dst4 + n;
+		m = RTE_DIM(tcx->dst4);
+		l3_type = ETHER_TYPE_IPv4;
+	} else {
+		n = tcx->dst6_num;
+		dp = tcx->dst6 + n;
+		m = RTE_DIM(tcx->dst6);
+		l3_type = ETHER_TYPE_IPv6;
+	}
+
+	if (n + dnum >= m) {
+		RTE_LOG(ERR, USER1, "%s(lcore=%u, family=%hu, dnum=%u) exceeds "
+			"maximum
allowed number of destinations(%u);\n", + __func__, tcx->cf->lcore, family, dnum, m); + return -ENOSPC; + } + + for (i = 0; i != dnum && rc == 0; i++) { + fill_dst(dp + i, &tcx->dev[dev_idx], pcf, dcf, + l3_type, tcx->frag_mpool); + if (family == AF_INET) + rc = be_add_ipv4_route(tcx, dcf, n + i); + else + rc = be_add_ipv6_route(tcx, dcf, n + i); + } + + if (family == AF_INET) + tcx->dst4_num = n + i; + else + tcx->dst6_num = n + i; + + return rc; +} + +int +be_dst_init(struct tldk_ctx *tcx, const tldk_conf_t *cf) +{ + uint32_t i, f, d, l, port_id; + const struct tldk_ctx_conf *ctx_cf = tcx->cf; + const struct tldk_dest_conf *dcf; + const struct tldk_port_conf *pcf; + int32_t rc = 0; + + for (i = 0; i < ctx_cf->nb_dest; i++) { + dcf = &ctx_cf->dest[i]; + f = dcf->family; + d = dcf->dev; + for (l = 0; l != tcx->nb_dev; l++) { + if (tcx->dev[l].cf.id == d) { + /* fetch the port conf for the port + * associated with device + */ + port_id = tcx->dev[l].cf.port; + pcf = &cf->port[port_id]; + rc = be_add_dest(dcf, tcx, l, pcf, f, 1); + if (rc != 0) { + RTE_LOG(ERR, USER1, + "%s(tcx=%u, family=%u) " + "could not add " + "destinations(%u)\n", + __func__, ctx_cf->lcore, f, i); + return -ENOSPC; + } + break; + } + } + } + + return rc; +} + +int +be_add_dev(struct tldk_ctx *tcx, const tldk_conf_t *cf) +{ + int32_t rc = 0; + uint32_t i, port_id; + struct tle_dev_param dprm; + const struct tldk_port_conf *pcf; + + memset(&dprm, 0, sizeof(dprm)); + + /* add the tle_dev on all applicable ports of the context */ + for (i = 0; i != tcx->cf->nb_dev; i++) { + + /* get the port id associated with the device */ + port_id = tcx->cf->dev[i].port; + + /* get the port config by port id */ + pcf = &cf->port[port_id]; + + /* populate the tle_dev_param struct */ + dprm.rx_offload = pcf->rx_offload; + dprm.tx_offload = pcf->tx_offload; + dprm.local_addr4.s_addr = pcf->ipv4; + + memcpy(&dprm.local_addr6, &pcf->ipv6, + sizeof(pcf->ipv6)); + + /* add the tle_dev */ + tcx->dev[i].dev = tle_add_dev(tcx->ctx, &dprm); + + RTE_LOG(NOTICE, USER1, "%s(port=%u), dev: %p\n", + __func__, port_id, + tcx->dev[i].dev); + + if (tcx->dev[i].dev == NULL) + rc = -rte_errno; + + if (rc != 0) + return rc; + + tcx->nb_dev++; + tcx->dev[i].cf = tcx->cf->dev[i]; + } + + return rc; +} + +static uint32_t +get_ptypes(const struct tldk_dev *td) +{ + uint32_t smask; + int32_t i, rc; + const uint32_t pmask = RTE_PTYPE_L2_MASK | RTE_PTYPE_L3_MASK | + RTE_PTYPE_L4_MASK; + + smask = 0; + rc = rte_eth_dev_get_supported_ptypes(td->cf.port, pmask, NULL, 0); + if (rc < 0) { + RTE_LOG(ERR, USER1, + "%s(port=%u) failed to get supported ptypes;\n", + __func__, td->cf.port); + return smask; + } + + uint32_t ptype[rc]; + rc = rte_eth_dev_get_supported_ptypes(td->cf.port, pmask, ptype, rc); + + for (i = 0; i != rc; i++) { + switch (ptype[i]) { + case RTE_PTYPE_L2_ETHER: + smask |= ETHER_PTYPE; + break; + case RTE_PTYPE_L3_IPV4: + case RTE_PTYPE_L3_IPV4_EXT_UNKNOWN: + smask |= IPV4_PTYPE; + break; + case RTE_PTYPE_L3_IPV4_EXT: + smask |= IPV4_EXT_PTYPE; + break; + case RTE_PTYPE_L3_IPV6: + case RTE_PTYPE_L3_IPV6_EXT_UNKNOWN: + smask |= IPV6_PTYPE; + break; + case RTE_PTYPE_L3_IPV6_EXT: + smask |= IPV6_EXT_PTYPE; + break; + case RTE_PTYPE_L4_TCP: + smask |= TCP_PTYPE; + break; + case RTE_PTYPE_L4_UDP: + smask |= UDP_PTYPE; + break; + } + } + + return smask; +} + +static inline uint64_t +_mbuf_tx_offload(uint64_t il2, uint64_t il3, uint64_t il4, uint64_t tso, + uint64_t ol3, uint64_t ol2) +{ + return il2 | il3 << 7 | il4 << 16 | tso << 24 | ol3 << 40 | ol2 << 49; 
+} + +static inline void +fill_pkt_hdr_len(struct rte_mbuf *m, uint32_t l2, uint32_t l3, uint32_t l4) +{ + m->tx_offload = _mbuf_tx_offload(l2, l3, l4, 0, 0, 0); +} + +static inline int +is_ipv4_frag(const struct ipv4_hdr *iph) +{ + const uint16_t mask = rte_cpu_to_be_16(~IPV4_HDR_DF_FLAG); + + return ((mask & iph->fragment_offset) != 0); +} + +static inline uint32_t +get_tcp_header_size(struct rte_mbuf *m, uint32_t l2_len, uint32_t l3_len) +{ + const struct tcp_hdr *tcp; + + tcp = rte_pktmbuf_mtod_offset(m, struct tcp_hdr *, l2_len + l3_len); + return (tcp->data_off >> 4) * 4; +} + +static inline void +adjust_ipv4_pktlen(struct rte_mbuf *m, uint32_t l2_len) +{ + uint32_t plen, trim; + const struct ipv4_hdr *iph; + + iph = rte_pktmbuf_mtod_offset(m, const struct ipv4_hdr *, l2_len); + plen = rte_be_to_cpu_16(iph->total_length) + l2_len; + if (plen < m->pkt_len) { + trim = m->pkt_len - plen; + rte_pktmbuf_trim(m, trim); + } +} + +static inline void +adjust_ipv6_pktlen(struct rte_mbuf *m, uint32_t l2_len) +{ + uint32_t plen, trim; + const struct ipv6_hdr *iph; + + iph = rte_pktmbuf_mtod_offset(m, const struct ipv6_hdr *, l2_len); + plen = rte_be_to_cpu_16(iph->payload_len) + sizeof(*iph) + l2_len; + if (plen < m->pkt_len) { + trim = m->pkt_len - plen; + rte_pktmbuf_trim(m, trim); + } +} + +static inline void +tcp_stat_update(struct tldk_ctx *lc, const struct rte_mbuf *m, + uint32_t l2_len, uint32_t l3_len) +{ + const struct tcp_hdr *th; + + th = rte_pktmbuf_mtod_offset(m, struct tcp_hdr *, l2_len + l3_len); + lc->tcp_stat.flags[th->tcp_flags]++; +} + +static inline uint32_t +get_ipv4_hdr_len(struct rte_mbuf *m, uint32_t l2, uint32_t proto, uint32_t frag) +{ + const struct ipv4_hdr *iph; + int32_t dlen, len; + + dlen = rte_pktmbuf_data_len(m); + dlen -= l2; + + iph = rte_pktmbuf_mtod_offset(m, const struct ipv4_hdr *, l2); + len = (iph->version_ihl & IPV4_HDR_IHL_MASK) * IPV4_IHL_MULTIPLIER; + + if (frag != 0 && is_ipv4_frag(iph)) { + m->packet_type &= ~RTE_PTYPE_L4_MASK; + m->packet_type |= RTE_PTYPE_L4_FRAG; + } + + if (len > dlen || (proto <= IPPROTO_MAX && iph->next_proto_id != proto)) + m->packet_type = RTE_PTYPE_UNKNOWN; + + return len; +} + +static inline int +ipv6x_hdr(uint32_t proto) +{ + return (proto == IPPROTO_HOPOPTS || + proto == IPPROTO_ROUTING || + proto == IPPROTO_FRAGMENT || + proto == IPPROTO_AH || + proto == IPPROTO_NONE || + proto == IPPROTO_DSTOPTS); +} + +static inline uint32_t +get_ipv6x_hdr_len(struct rte_mbuf *m, uint32_t l2, uint32_t nproto, + uint32_t fproto) +{ + const struct ip6_ext *ipx; + int32_t dlen, len, ofs; + + len = sizeof(struct ipv6_hdr); + + dlen = rte_pktmbuf_data_len(m); + dlen -= l2; + + ofs = l2 + len; + ipx = rte_pktmbuf_mtod_offset(m, const struct ip6_ext *, ofs); + + while (ofs > 0 && len < dlen) { + + switch (nproto) { + case IPPROTO_HOPOPTS: + case IPPROTO_ROUTING: + case IPPROTO_DSTOPTS: + ofs = (ipx->ip6e_len + 1) << 3; + break; + case IPPROTO_AH: + ofs = (ipx->ip6e_len + 2) << 2; + break; + case IPPROTO_FRAGMENT: + /* + * tso_segsz is not used by RX, so use it as temporary + * buffer to store the fragment offset. + */ + m->tso_segsz = ofs; + ofs = sizeof(struct ip6_frag); + m->packet_type &= ~RTE_PTYPE_L4_MASK; + m->packet_type |= RTE_PTYPE_L4_FRAG; + break; + default: + ofs = 0; + } + + if (ofs > 0) { + nproto = ipx->ip6e_nxt; + len += ofs; + ipx += ofs / sizeof(*ipx); + } + } + + /* unrecognized or invalid packet. 
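+	 * Either the extension-header chain ended on something other than the
+	 * expected L4 protocol (fproto), or the parsed header lengths run past
+	 * the data available in the first segment.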
*/ + if ((ofs == 0 && nproto != fproto) || len > dlen) + m->packet_type = RTE_PTYPE_UNKNOWN; + + return len; +} + +static inline uint32_t +get_ipv6_hdr_len(struct rte_mbuf *m, uint32_t l2, uint32_t fproto) +{ + const struct ipv6_hdr *iph; + + iph = rte_pktmbuf_mtod_offset(m, const struct ipv6_hdr *, + sizeof(struct ether_hdr)); + + if (iph->proto == fproto) + return sizeof(struct ipv6_hdr); + else if (ipv6x_hdr(iph->proto) != 0) + return get_ipv6x_hdr_len(m, l2, iph->proto, fproto); + + m->packet_type = RTE_PTYPE_UNKNOWN; + return 0; +} + +static inline void +fill_eth_tcp_hdr_len(struct rte_mbuf *m) +{ + uint32_t dlen, l2_len, l3_len, l4_len; + uint16_t etp; + const struct ether_hdr *eth; + + dlen = rte_pktmbuf_data_len(m); + + /* check that first segment is at least 54B long. */ + if (dlen < sizeof(struct ether_hdr) + sizeof(struct ipv4_hdr) + + sizeof(struct tcp_hdr)) { + m->packet_type = RTE_PTYPE_UNKNOWN; + return; + } + + l2_len = sizeof(*eth); + + eth = rte_pktmbuf_mtod(m, const struct ether_hdr *); + etp = eth->ether_type; + if (etp == rte_be_to_cpu_16(ETHER_TYPE_VLAN)) + l2_len += sizeof(struct vlan_hdr); + + if (etp == rte_be_to_cpu_16(ETHER_TYPE_IPv4)) { + m->packet_type = RTE_PTYPE_L4_TCP | + RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L2_ETHER; + l3_len = get_ipv4_hdr_len(m, l2_len, IPPROTO_TCP, 1); + l4_len = get_tcp_header_size(m, l2_len, l3_len); + fill_pkt_hdr_len(m, l2_len, l3_len, l4_len); + adjust_ipv4_pktlen(m, l2_len); + } else if (etp == rte_be_to_cpu_16(ETHER_TYPE_IPv6) && + dlen >= l2_len + sizeof(struct ipv6_hdr) + + sizeof(struct tcp_hdr)) { + m->packet_type = RTE_PTYPE_L4_TCP | + RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L2_ETHER; + l3_len = get_ipv6_hdr_len(m, l2_len, IPPROTO_TCP); + l4_len = get_tcp_header_size(m, l2_len, l3_len); + fill_pkt_hdr_len(m, l2_len, l3_len, l4_len); + adjust_ipv6_pktlen(m, l2_len); + } else + m->packet_type = RTE_PTYPE_UNKNOWN; +} + +/* + * HW can recognize L2/L3 with/without extensions/L4 (ixgbe/igb/fm10k) + */ +static uint16_t +type0_tcp_rx_callback(__rte_unused uint8_t port, __rte_unused uint16_t queue, + struct rte_mbuf *pkt[], uint16_t nb_pkts, + __rte_unused uint16_t max_pkts, __rte_unused void *user_param) +{ + uint32_t j, tp; + uint32_t l4_len, l3_len, l2_len; + const struct ether_hdr *eth; + + l2_len = sizeof(*eth); + + for (j = 0; j != nb_pkts; j++) { + + BE_PKT_DUMP(pkt[j]); + + tp = pkt[j]->packet_type & (RTE_PTYPE_L4_MASK | + RTE_PTYPE_L3_MASK | RTE_PTYPE_L2_MASK); + + switch (tp) { + /* non fragmented tcp packets. 
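+		 * The HW has already classified L2/L3/L4, and plain IPv4/IPv6
+		 * have fixed-size L3 headers, so only the TCP header length
+		 * needs to be read from the packet itself.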
*/ + case (RTE_PTYPE_L4_TCP | RTE_PTYPE_L3_IPV4 | + RTE_PTYPE_L2_ETHER): + l4_len = get_tcp_header_size(pkt[j], l2_len, + sizeof(struct ipv4_hdr)); + fill_pkt_hdr_len(pkt[j], l2_len, + sizeof(struct ipv4_hdr), l4_len); + adjust_ipv4_pktlen(pkt[j], l2_len); + break; + case (RTE_PTYPE_L4_TCP | RTE_PTYPE_L3_IPV6 | + RTE_PTYPE_L2_ETHER): + l4_len = get_tcp_header_size(pkt[j], l2_len, + sizeof(struct ipv6_hdr)); + fill_pkt_hdr_len(pkt[j], l2_len, + sizeof(struct ipv6_hdr), l4_len); + adjust_ipv6_pktlen(pkt[j], l2_len); + break; + case (RTE_PTYPE_L4_TCP | RTE_PTYPE_L3_IPV4_EXT | + RTE_PTYPE_L2_ETHER): + l3_len = get_ipv4_hdr_len(pkt[j], l2_len, + IPPROTO_TCP, 0); + l4_len = get_tcp_header_size(pkt[j], l2_len, l3_len); + fill_pkt_hdr_len(pkt[j], l2_len, l3_len, l4_len); + adjust_ipv4_pktlen(pkt[j], l2_len); + break; + case (RTE_PTYPE_L4_TCP | RTE_PTYPE_L3_IPV6_EXT | + RTE_PTYPE_L2_ETHER): + l3_len = get_ipv6_hdr_len(pkt[j], l2_len, IPPROTO_TCP); + l4_len = get_tcp_header_size(pkt[j], l2_len, l3_len); + fill_pkt_hdr_len(pkt[j], l2_len, l3_len, l4_len); + adjust_ipv6_pktlen(pkt[j], l2_len); + break; + default: + /* treat packet types as invalid. */ + pkt[j]->packet_type = RTE_PTYPE_UNKNOWN; + break; + } + } + + return nb_pkts; +} + +/* + * HW can recognize L2/L3/L4 and fragments (i40e). + */ +static uint16_t +type1_tcp_rx_callback(__rte_unused uint8_t port, __rte_unused uint16_t queue, + struct rte_mbuf *pkt[], uint16_t nb_pkts, + __rte_unused uint16_t max_pkts, void *user_param) +{ + uint32_t j, tp; + struct tldk_ctx *tcx; + uint32_t l4_len, l3_len, l2_len; + const struct ether_hdr *eth; + + tcx = user_param; + l2_len = sizeof(*eth); + + for (j = 0; j != nb_pkts; j++) { + + BE_PKT_DUMP(pkt[j]); + + tp = pkt[j]->packet_type & (RTE_PTYPE_L4_MASK | + RTE_PTYPE_L3_MASK | RTE_PTYPE_L2_MASK); + + switch (tp) { + case (RTE_PTYPE_L4_TCP | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L2_ETHER): + l3_len = get_ipv4_hdr_len(pkt[j], l2_len, + IPPROTO_TCP, 0); + l4_len = get_tcp_header_size(pkt[j], l2_len, l3_len); + fill_pkt_hdr_len(pkt[j], l2_len, l3_len, l4_len); + adjust_ipv4_pktlen(pkt[j], l2_len); + tcp_stat_update(tcx, pkt[j], l2_len, l3_len); + break; + case (RTE_PTYPE_L4_TCP | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L2_ETHER): + l3_len = get_ipv6_hdr_len(pkt[j], l2_len, IPPROTO_TCP); + l4_len = get_tcp_header_size(pkt[j], l2_len, l3_len); + fill_pkt_hdr_len(pkt[j], l2_len, l3_len, l4_len); + adjust_ipv6_pktlen(pkt[j], l2_len); + tcp_stat_update(tcx, pkt[j], l2_len, l3_len); + break; + default: + /* treat packet types as invalid. 
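+			 * tle_tcp_rx_bulk() hands such mbufs back unprocessed
+			 * and be_rx() frees them.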
*/ + pkt[j]->packet_type = RTE_PTYPE_UNKNOWN; + break; + } + + } + + return nb_pkts; +} + +static uint16_t +typen_tcp_rx_callback(__rte_unused uint8_t port, __rte_unused uint16_t queue, + struct rte_mbuf *pkt[], uint16_t nb_pkts, + __rte_unused uint16_t max_pkts, __rte_unused void *user_param) +{ + uint32_t j; + + for (j = 0; j != nb_pkts; j++) { + + BE_PKT_DUMP(pkt[j]); + fill_eth_tcp_hdr_len(pkt[j]); + } + + return nb_pkts; +} + +int +setup_rx_cb(const struct tldk_dev *td, struct tldk_ctx *tcx) +{ + int32_t rc; + uint32_t i, n, smask; + void *cb; + const struct ptype2cb *ptype2cb; + + static const struct ptype2cb tcp_ptype2cb[] = { + { + .mask = ETHER_PTYPE | IPV4_PTYPE | IPV4_EXT_PTYPE | + IPV6_PTYPE | IPV6_EXT_PTYPE | TCP_PTYPE, + .name = "HW l2/l3x/l4-tcp ptype", + .fn = type0_tcp_rx_callback, + }, + { + .mask = ETHER_PTYPE | IPV4_PTYPE | IPV6_PTYPE | + TCP_PTYPE, + .name = "HW l2/l3/l4-tcp ptype", + .fn = type1_tcp_rx_callback, + }, + { + .mask = 0, + .name = "tcp no HW ptype", + .fn = typen_tcp_rx_callback, + }, + }; + + smask = get_ptypes(td); + + ptype2cb = tcp_ptype2cb; + n = RTE_DIM(tcp_ptype2cb); + + for (i = 0; i != n; i++) { + if ((smask & ptype2cb[i].mask) == ptype2cb[i].mask) { + cb = rte_eth_add_rx_callback(td->cf.port, td->cf.queue, + ptype2cb[i].fn, tcx); + rc = -rte_errno; + RTE_LOG(ERR, USER1, + "%s(port=%u), setup RX callback \"%s\" " + "returns %p;\n", + __func__, td->cf.port, ptype2cb[i].name, cb); + return ((cb == NULL) ? rc : 0); + } + } + + /* no proper callback found. */ + RTE_LOG(ERR, USER1, + "%s(port=%u) failed to find an appropriate callback;\n", + __func__, td->cf.port); + return -ENOENT; +} + +int +be_lcore_setup(struct tldk_ctx *tcx) +{ + uint32_t i; + int32_t rc; + + RTE_LOG(NOTICE, USER1, "%s:(lcore=%u, ctx=%p) start\n", + __func__, tcx->cf->lcore, tcx->ctx); + + rc = 0; + for (i = 0; i != tcx->nb_dev && rc == 0; i++) { + RTE_LOG(NOTICE, USER1, "%s:%u(port=%u, q=%u)\n", + __func__, i, tcx->dev[i].cf.port, tcx->dev[i].cf.queue); + + rc = setup_rx_cb(&tcx->dev[i], tcx); + if (rc < 0) + return rc; + } + + return rc; +} + +static inline void +be_rx(struct tldk_dev *dev) +{ + uint32_t j, k, n; + struct rte_mbuf *pkt[MAX_PKT_BURST]; + struct rte_mbuf *rp[MAX_PKT_BURST]; + int32_t rc[MAX_PKT_BURST]; + + n = rte_eth_rx_burst(dev->cf.port, + dev->cf.queue, pkt, RTE_DIM(pkt)); + + if (n != 0) { + dev->rx_stat.in += n; + BE_TRACE("%s(%u): rte_eth_rx_burst(%u, %u) returns %u\n", + __func__, dev->cf.id, dev->cf.port, + dev->cf.queue, n); + + k = tle_tcp_rx_bulk(dev->dev, pkt, rp, rc, n); + + dev->rx_stat.up += k; + dev->rx_stat.drop += n - k; + BE_TRACE("%s: tle_tcp_rx_bulk(%p, %u) returns %u\n", + __func__, dev->dev, n, k); + + for (j = 0; j != n - k; j++) { + BE_TRACE("%s:%d(port=%u) rp[%u]={%p, %d};\n", + __func__, __LINE__, dev->cf.port, + j, rp[j], rc[j]); + rte_pktmbuf_free(rp[j]); + } + } +} + +static inline void +be_tx(struct tldk_dev *dev) +{ + uint32_t j = 0, k, n; + struct rte_mbuf **mb; + + n = dev->tx_buf.num; + k = RTE_DIM(dev->tx_buf.pkt) - n; + mb = dev->tx_buf.pkt; + + if (k >= RTE_DIM(dev->tx_buf.pkt) / 2) { + j = tle_tcp_tx_bulk(dev->dev, mb + n, k); + n += j; + dev->tx_stat.down += j; + } + + if (n == 0) + return; + + BE_TRACE("%s: tle_tcp_tx_bulk(%p) returns %u,\n" + "total pkts to send: %u\n", + __func__, dev->dev, j, n); + + for (j = 0; j != n; j++) + BE_PKT_DUMP(mb[j]); + + k = rte_eth_tx_burst(dev->cf.port, + dev->cf.queue, mb, n); + + dev->tx_stat.out += k; + dev->tx_stat.drop += n - k; + BE_TRACE("%s: rte_eth_tx_burst(%u, %u, %u) returns 
%u\n", + __func__, dev->cf.port, + dev->cf.queue, n, k); + + dev->tx_buf.num = n - k; + if (k != 0) + for (j = k; j != n; j++) + mb[j - k] = mb[j]; +} + +void +be_lcore_tcp(struct tldk_ctx *tcx) +{ + uint32_t i; + + if (tcx == NULL) + return; + + for (i = 0; i != tcx->nb_dev; i++) { + be_rx(&tcx->dev[i]); + be_tx(&tcx->dev[i]); + } + tle_tcp_process(tcx->ctx, TCP_MAX_PROCESS); +} + +void +be_lcore_clear(struct tldk_ctx *tcx) +{ + uint32_t i, j; + + if (tcx == NULL) + return; + + RTE_LOG(NOTICE, USER1, "%s(lcore=%u, ctx: %p) finish\n", + __func__, tcx->cf->lcore, tcx->ctx); + for (i = 0; i != tcx->nb_dev; i++) { + RTE_LOG(NOTICE, USER1, "%s:%u(port=%u, q=%u, lcore=%u, dev=%p) " + "rx_stats={" + "in=%" PRIu64 ",up=%" PRIu64 ",drop=%" PRIu64 "}, " + "tx_stats={" + "in=%" PRIu64 ",up=%" PRIu64 ",drop=%" PRIu64 "};\n", + __func__, i, tcx->dev[i].cf.port, tcx->dev[i].cf.queue, + tcx->cf->lcore, + tcx->dev[i].dev, + tcx->dev[i].rx_stat.in, + tcx->dev[i].rx_stat.up, + tcx->dev[i].rx_stat.drop, + tcx->dev[i].tx_stat.down, + tcx->dev[i].tx_stat.out, + tcx->dev[i].tx_stat.drop); + } + + RTE_LOG(NOTICE, USER1, "tcp_stat={\n"); + for (i = 0; i != RTE_DIM(tcx->tcp_stat.flags); i++) { + if (tcx->tcp_stat.flags[i] != 0) + RTE_LOG(NOTICE, USER1, "[flag=%#x]==%" PRIu64 ";\n", + i, tcx->tcp_stat.flags[i]); + } + RTE_LOG(NOTICE, USER1, "};\n"); + + for (i = 0; i != tcx->nb_dev; i++) + for (j = 0; j != tcx->dev[i].tx_buf.num; j++) + rte_pktmbuf_free(tcx->dev[i].tx_buf.pkt[j]); + +} + +void +be_stop_port(uint32_t port) +{ + struct rte_eth_stats stats; + + RTE_LOG(NOTICE, USER1, "%s: stoping port %u\n", __func__, port); + + rte_eth_stats_get(port, &stats); + RTE_LOG(NOTICE, USER1, "port %u stats={\n" + "ipackets=%" PRIu64 ";" + "ibytes=%" PRIu64 ";" + "ierrors=%" PRIu64 ";" + "imissed=%" PRIu64 ";\n" + "opackets=%" PRIu64 ";" + "obytes=%" PRIu64 ";" + "oerrors=%" PRIu64 ";\n" + "}\n", + port, + stats.ipackets, + stats.ibytes, + stats.ierrors, + stats.imissed, + stats.opackets, + stats.obytes, + stats.oerrors); + rte_eth_dev_stop(port); +} + +int +be_lcore_main(void *arg) +{ + int32_t rc; + uint32_t lid, i; + struct tldk_ctx *tcx; + struct lcore_ctxs_list *lc_ctx; + + lc_ctx = arg; + lid = rte_lcore_id(); + + RTE_LOG(NOTICE, USER1, "%s(lcore=%u) start\n", __func__, lid); + + rc = 0; + while (force_quit == 0) { + for (i = 0; i < lc_ctx->nb_ctxs; i++) { + tcx = lc_ctx->ctxs[i]; + be_lcore_tcp(tcx); + } + } + + RTE_LOG(NOTICE, USER1, "%s(lcore=%u) finish\n", __func__, lid); + + return rc; +} diff --git a/app/nginx/src/tldk/be.h b/app/nginx/src/tldk/be.h new file mode 100644 index 0000000..900dfa8 --- /dev/null +++ b/app/nginx/src/tldk/be.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2017 Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef __BE_H__ +#define __BE_H__ + +#include "ngx_tldk.h" + +extern volatile int force_quit; + +int be_lpm4_dst_lookup(void *data, const struct in_addr *addr, + struct tle_dest *res); +int be_lpm6_dst_lookup(void *data, const struct in6_addr *addr, + struct tle_dest *res); +int be_lcore_lpm_init(struct tldk_ctx *tcx, uint32_t sid, + const struct tldk_ctx_conf *cf); +int be_port_init(tldk_conf_t *cf); +int be_dst_init(struct tldk_ctx *tcx, const tldk_conf_t *cf); +int be_add_dev(struct tldk_ctx *tcx, const tldk_conf_t *cf); +int be_mpool_init(struct tldk_ctx *tcx); +int be_queue_init(struct tldk_ctx *tcx, const tldk_conf_t *cf); +int be_check_lcore(uint32_t lc); + +void +be_lcore_tcp(struct tldk_ctx *tcx); + +int be_lcore_main(void *arg); +int be_lcore_setup(struct tldk_ctx *tcx); +void be_lcore_clear(struct tldk_ctx *tcx); + +void be_stop_port(uint32_t port); + +#endif /*__BE_H__ */ diff --git a/app/nginx/src/tldk/debug.h b/app/nginx/src/tldk/debug.h new file mode 100644 index 0000000..2f19dcb --- /dev/null +++ b/app/nginx/src/tldk/debug.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2017 Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _TLDK_DEBUG_H_ +#define _TLDK_DEBUG_H_ + +#include <rte_cycles.h> + +#define FUNC_STAT(v, c) do { \ + static uint64_t nb_call, nb_data; \ + nb_call++; \ + nb_data += (v); \ + if ((nb_call & ((c) - 1)) == 0) { \ + printf("%s#%d@%u: nb_call=%lu, avg(" #v ")=%#Lf\n", \ + __func__, __LINE__, rte_lcore_id(), nb_call, \ + (long double)nb_data / nb_call); \ + nb_call = 0; \ + nb_data = 0; \ + } \ +} while (0) + +#define FUNC_TM_STAT(v, c) do { \ + static uint64_t nb_call, nb_data; \ + static uint64_t cts, pts, sts; \ + cts = rte_rdtsc(); \ + if (pts != 0) \ + sts += cts - pts; \ + pts = cts; \ + nb_call++; \ + nb_data += (v); \ + if ((nb_call & ((c) - 1)) == 0) { \ + printf("%s#%d@%u: nb_call=%lu, " \ + "avg(" #v ")=%#Lf, " \ + "avg(cycles)=%#Lf, " \ + "avg(cycles/" #v ")=%#Lf\n", \ + __func__, __LINE__, rte_lcore_id(), nb_call, \ + (long double)nb_data / nb_call, \ + (long double)sts / nb_call, \ + (long double)sts / nb_data); \ + nb_call = 0; \ + nb_data = 0; \ + sts = 0; \ + } \ +} while (0) + +#define COND_FUNC_STAT(e, v, c) do { \ + if (e) { \ + FUNC_STAT(v, c); \ + } \ +} while (0) + +#endif /* _TLDK_DEBUG_H_ */ diff --git a/app/nginx/src/tldk/module.c b/app/nginx/src/tldk/module.c new file mode 100644 index 0000000..4ddea36 --- /dev/null +++ b/app/nginx/src/tldk/module.c @@ -0,0 +1,497 @@ +/* + * Copyright (c) 2017 Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <rte_ethdev.h> +#include <rte_lpm6.h> +#include <rte_lpm.h> + +#include <ngx_config.h> +#include <ngx_core.h> + +#include "ngx_tldk.h" +#include "be.h" +#include "tldk_sock.h" + +extern ngx_module_t ngx_tldk_module; + +/* map from ngx_worker to corresponding TLDK ctx */ +struct tldk_ctx wrk2ctx[RTE_MAX_LCORE] = {}; + +/* per be lcore tldk_ctx(s) */ +static struct lcore_ctxs_list lc_ctxs[RTE_MAX_LCORE]; + +volatile int force_quit; + +static void * +tldk_module_create_conf(ngx_cycle_t *cycle) +{ + tldk_conf_t *cf; + + cf = ngx_pcalloc(cycle->pool, sizeof(*cf)); + if (cf == NULL) + return NULL; + return cf; +} + +static char * +tldk_module_init_conf(ngx_cycle_t *cycle, void *conf) +{ + tldk_conf_t *cf; + + cf = conf; + (void)cf; + return NGX_CONF_OK; +} + +static void +fini_context(struct tldk_ctx *tcx) +{ + tle_ctx_destroy(tcx->ctx); + rte_lpm_free(tcx->lpm4); + rte_lpm6_free(tcx->lpm6); + rte_mempool_free(tcx->mpool); + rte_mempool_free(tcx->frag_mpool); + memset(tcx, 0, sizeof(*tcx)); +} + +static int +init_context(struct tldk_ctx *tcx, const struct tldk_ctx_conf *cf, + tldk_conf_t *cft) +{ + uint32_t lc, rc, sid; + struct tle_ctx_param cprm; + + lc = cf->lcore; + sid = rte_lcore_to_socket_id(lc); + rc = be_lcore_lpm_init(tcx, sid, cf); + if (rc != 0) + return rc; + + memset(&cprm, 0, sizeof(cprm)); + cprm.socket_id = sid; + cprm.proto = TLE_PROTO_TCP; + cprm.max_streams = cf->nb_stream; + cprm.max_stream_rbufs = cf->nb_rbuf; + cprm.max_stream_sbufs = cf->nb_sbuf; + if (cf->be_in_worker != 0) + cprm.flags |= TLE_CTX_FLAG_ST; + cprm.timewait = cf->tcp_timewait; + cprm.lookup4 = be_lpm4_dst_lookup; + cprm.lookup4_data = tcx; + cprm.lookup6 = be_lpm6_dst_lookup; + cprm.lookup6_data = tcx; + cprm.secret_key.u64[0] = rte_rand(); + cprm.secret_key.u64[1] = rte_rand(); + + tcx->ctx = tle_ctx_create(&cprm); + RTE_LOG(NOTICE, USER1, + "%s: tle_ctx_create(lcore=%u) for worker=%lu returns %p;\n", + __func__, lc, cf->worker, tcx->ctx); + if (tcx->ctx == NULL) { + rte_lpm_free(tcx->lpm4); + rte_lpm6_free(tcx->lpm6); + return -ENOMEM; + } + + tcx->cf = cf; + + /* create mempool for the context */ + rc = be_mpool_init(tcx); + if (rc != 0) + return rc; + + /* initialize queues of the device given in the context */ + rc = be_queue_init(tcx, cft); + if (rc != 0) + return rc; + + /* create devices of the context */ + rc = be_add_dev(tcx, cft); + if (rc != 0) + return rc; + + /* + *1.create LPMs and add to the context + *2.add routes for the given destinations to the context + */ + rc = be_dst_init(tcx, cft); + if (rc != 0) + return rc; + + return 0; +} + +void +tldk_module_fini(ngx_cycle_t *cycle) +{ + tldk_conf_t *cf; + uint32_t i, wrk; + + cf = (tldk_conf_t *)ngx_get_conf(cycle->conf_ctx, ngx_tldk_module); + + /* signal all launched slave lcores to stop */ + force_quit = 1; + + /* wait all slave lcores to be stopped */ + for (i = 0; i != cf->nb_ctx; i++) + rte_eal_wait_lcore(cf->ctx[i].lcore); + + /* finish all TLDK contexts */ + for (i = 0; i != cf->nb_ctx; i++) { + wrk = cf->ctx[i].worker; + /* free up all tx pkt buffers of the tldk_dev 'ses + * of the tldk_ctx + */ + be_lcore_clear(wrk2ctx + wrk); + fini_context(wrk2ctx + wrk); + } + + /* stop all ports */ + for (i = 0; i != cf->nb_port; i++) + be_stop_port(cf->port[i].id); +} + +/* configuration sanity check */ +static int +process_conf(tldk_conf_t *cf) +{ + uint32_t i, j, port_id, mask; + uint16_t queue_id; + const struct tldk_ctx_conf *ctx; + struct tldk_port_conf *pcf; + + /* + * count the number of queues associated + * with each 
port by iterating through all tldk_ctx'ses. + * if same queue of the port is used in multiple tldk_ctx'ses + * error will be returned. + */ + for (i = 0; i < cf->nb_ctx; i++) { + ctx = &cf->ctx[i]; + for (j = 0; j < ctx->nb_dev; j++) { + port_id = ctx->dev[j].port; + queue_id = ctx->dev[j].queue; + pcf = &cf->port[port_id]; + + if (queue_id >= MAX_PORT_QUEUE) + return -EINVAL; + + mask = 1 << queue_id; + + if (pcf->queue_map & mask) + /* tldk_port_conf already has the queue */ + return -EEXIST; + + pcf->queue_map |= mask; + if (pcf->nb_queues <= queue_id) + pcf->nb_queues = queue_id + 1; + } + } + + return 0; +} + +static ngx_int_t +tldk_module_init(ngx_cycle_t *cycle) +{ + int32_t rc; + uint32_t i, j, wrk, num, lc, ctx_lim; + tldk_conf_t *cf; + struct tldk_ctx *tcx; + + cf = (tldk_conf_t *)ngx_get_conf(cycle->conf_ctx, ngx_tldk_module); + + rc = rte_eal_init(cf->eal_argc, cf->eal_argv); + if (rc < 0) { + RTE_LOG(ERR, USER1, + "%s: rte_eal_init failed with error code: %d\n", + __func__, rc); + return NGX_ERROR; + } + + rc = process_conf(cf); + if (rc < 0) { + RTE_LOG(ERR, USER1, + "%s: process_conf failed with error code: %d\n", + __func__, rc); + return NGX_ERROR; + } + + /* port initialization */ + rc = be_port_init(cf); + if (rc != 0) { + RTE_LOG(ERR, USER1, + "%s: be_port_init failed with error code: %d\n", + __func__, rc); + + return NGX_ERROR; + } + + /* initialise TLDK contexts */ + for (i = 0; i != cf->nb_ctx; i++) { + wrk = cf->ctx[i].worker; + rc = init_context(wrk2ctx + wrk, cf->ctx + i, cf); + if (rc != 0) + break; + } + + if (i != cf->nb_ctx) { + for (j = 0; j != i; j++) { + wrk = cf->ctx[j].worker; + fini_context(wrk2ctx + wrk); + } + RTE_LOG(ERR, USER1, + "%s: init_context failed with error code: %d\n", + __func__, rc); + return NGX_ERROR; + } + + /* start the ports */ + for (i = 0; i != cf->nb_port; i++) { + RTE_LOG(NOTICE, USER1, "%s: starting port %u\n", + __func__, cf->port[i].id); + + rc = rte_eth_dev_start(cf->port[i].id); + if (rc != 0) { + RTE_LOG(ERR, USER1, + "%s: rte_eth_dev_start(%u) returned " + "error code: %d\n", + __func__, cf->port[i].id, rc); + goto freectx; + } + } + + /* accumulate all tldk_ctx(s) that belongs to one be lcore */ + for (i = 0; i != cf->nb_ctx; i++) { + tcx = &wrk2ctx[cf->ctx[i].worker]; + /* setup rx callbacks */ + rc = be_lcore_setup(tcx); + if (rc != 0) { + RTE_LOG(ERR, USER1, + "%s: be_lcore_setup failed with error " + "code: %d\n", __func__, rc); + goto freectx; + } + + if (tcx->cf->be_in_worker) + continue; + + lc = cf->ctx[i].lcore; + num = lc_ctxs[lc].nb_ctxs; + ctx_lim = RTE_DIM(lc_ctxs[lc].ctxs); + + if (num < ctx_lim) { + lc_ctxs[lc].ctxs[num] = tcx; + lc_ctxs[lc].nb_ctxs++; + } else { + RTE_LOG(ERR, USER1, + "%s: cannot assign more than supported %u " + "ctx(s) for the given lcore %u\n", + __func__, ctx_lim, lc); + goto freectx; + } + } + + /* + * launch slave lcores with lcore_main_tcp to handle + * multiple tldk_ctx(s) of that lcore. 
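+ * Contexts flagged be_in_worker never reach this loop; they are serviced
+ * from within the nginx worker process instead (see tldk_process_fini).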
+ */ + for (i = 0; i < RTE_MAX_LCORE; i++) { + if (lc_ctxs[i].nb_ctxs != 0) { + if (be_check_lcore(i) != 0 || + rte_eal_remote_launch(be_lcore_main, + &lc_ctxs[i], i) != 0) { + RTE_LOG(ERR, USER1, + "%s: failed to launch " + "be_lcore_main for core =%u\n", + __func__, i); + goto freectx; + } + } + } + + return NGX_OK; + +freectx: + tldk_module_fini(cycle); + return NGX_ERROR; +} + +static int +tldk_open_listening(ngx_cycle_t *cycle, struct tldk_ctx *tcx) +{ + uint32_t i; + ngx_listening_t *ls; + char host[NI_MAXHOST]; + char srv[NI_MAXSERV]; + + ls = cycle->listening.elts; + for (i = 0; i != cycle->listening.nelts; i++) { + + if (ls[i].ignore != 0 || ls[i].listen == 0) + continue; + + ngx_close_socket(ls[i].fd); + ls[i].fd = -1; + + getnameinfo(ls[i].sockaddr, ls[i].socklen, + host, sizeof(host), srv, sizeof(srv), + NI_NUMERICHOST | NI_NUMERICSERV); + + ls[i].fd = tldk_open_bind_listen(tcx, + ls[i].sockaddr->sa_family, ls[i].type, + ls[i].sockaddr, ls[i].socklen, + ls[i].backlog); + + RTE_LOG(NOTICE, USER1, "%s(worker=%lu): " + "listen() for %s:%s returns %d, errno=%d;\n", + __func__, ngx_worker, host, srv, ls[i].fd, errno); + + if (ls[i].fd == -1) + return NGX_ERROR; + } + + return NGX_OK; +} + +static void +tldk_process_fini(ngx_cycle_t *cycle) +{ + struct tldk_ctx *tcx; + tcx = wrk2ctx + ngx_worker; + + tldk_stbl_fini(); + if (tcx->cf->be_in_worker != 0) + be_lcore_clear(tcx); +} + + +static ngx_int_t +tldk_process_init(ngx_cycle_t *cycle) +{ + ngx_event_conf_t *ecf; + int32_t rc; + + if (ngx_process != NGX_PROCESS_WORKER) + return NGX_OK; + + rc = tldk_stbl_init(cycle, wrk2ctx + ngx_worker); + if (rc != 0) + return NGX_ERROR; + + rc = tldk_open_listening(cycle, wrk2ctx + ngx_worker); + if (rc != 0) + return NGX_ERROR; + + /* use tldk event module from now on*/ + ecf = ngx_event_get_conf(cycle->conf_ctx, ngx_event_core_module); + ecf->use = ngx_tldk_event_module.ctx_index; + + return NGX_OK; +} + +static char * +tldk_conf_block(ngx_conf_t *cf, ngx_command_t *cmd, void *conf) +{ + char *rv; + ngx_conf_t save; + + save = *cf; + cf->handler = tldk_block_parse; + cf->handler_conf = conf; + rv = ngx_conf_parse(cf, NULL); + *cf = save; + + return rv; +} + +static char * +tldk_ctx_block(ngx_conf_t *cf, ngx_command_t *cmd, void *conf) +{ + char *rv; + tldk_conf_t *tcf; + struct tldk_ctx_conf *tcx; + ngx_conf_t save; + + tcf = (tldk_conf_t *)((void **)conf)[0]; + if (tcf->nb_ctx >= RTE_DIM(tcf->ctx)) + return NGX_CONF_ERROR; + + /* setup default non-zero values, if any */ + tcx = tcf->ctx + tcf->nb_ctx; + tcx->tcp_timewait = TLE_TCP_TIMEWAIT_DEFAULT; + + save = *cf; + cf->handler = tldk_ctx_parse; + cf->handler_conf = conf; + rv = ngx_conf_parse(cf, NULL); + *cf = save; + + if (rv == NGX_CONF_OK) + tcf->nb_ctx++; + + return rv; +} + +/* + * define NGX TLDK module. 
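+ * It exposes two top-level block directives, tldk_main (EAL and port level
+ * settings) and tldk_ctx (one TLDK context per worker), plus the module
+ * init/exit hooks below.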
+ */ + +static ngx_command_t tldk_commands[] = { + + { + .name = ngx_string("tldk_main"), + .type = NGX_MAIN_CONF|NGX_CONF_BLOCK|NGX_CONF_NOARGS, + .set = tldk_conf_block, + }, + { + .name = ngx_string("tldk_ctx"), + .type = NGX_MAIN_CONF|NGX_CONF_BLOCK|NGX_CONF_NOARGS, + .set = tldk_ctx_block, + }, + ngx_null_command, +}; + +static ngx_core_module_t tldk_module_ctx = { + ngx_string("tldk"), + tldk_module_create_conf, + tldk_module_init_conf +}; + +ngx_module_t ngx_tldk_module = { + NGX_MODULE_V1, + &tldk_module_ctx, /* module context */ + tldk_commands, /* module directives */ + NGX_CORE_MODULE, /* module type */ + NULL, /* init master */ + tldk_module_init, /* init module */ + tldk_process_init, /* init process */ + NULL, /* init thread */ + NULL, /* exit thread */ + tldk_process_fini, /* exit process */ + tldk_module_fini, /* exit master */ + NGX_MODULE_V1_PADDING +}; diff --git a/app/nginx/src/tldk/ngx_tldk.h b/app/nginx/src/tldk/ngx_tldk.h new file mode 100644 index 0000000..01ac556 --- /dev/null +++ b/app/nginx/src/tldk/ngx_tldk.h @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2017 Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef __NGX_TLDK_H__ +#define __NGX_TLDK_H__ + +#include <stdint.h> +#include <stddef.h> +#include <inttypes.h> + +#include <ngx_config.h> +#include <ngx_core.h> + +#include <rte_config.h> +#include <rte_eal.h> +#include <rte_common.h> +#include <rte_ether.h> + +#include <tle_ctx.h> +#include <tle_event.h> + +#define MAX_PKT_BURST 0x20 + +#define MAX_PORT_QUEUE \ + (sizeof(((struct tldk_port_conf *)NULL)->queue_map) * CHAR_BIT) + +#define MAX_CTX_PER_LOCRE 32 + +struct tldk_port_conf { + uint32_t id; + uint32_t nb_queues; + uint32_t queue_map; + uint32_t mtu; + uint32_t rx_offload; + uint32_t tx_offload; + uint32_t ipv4; + struct in6_addr ipv6; + struct ether_addr mac; +}; + +struct tldk_dev_conf { + uint32_t id; + uint32_t port; + uint32_t queue; +}; + +struct tldk_dest_conf { + uint32_t dev; + uint32_t mtu; + uint32_t prfx; + uint16_t family; + union { + struct in_addr ipv4; + struct in6_addr ipv6; + }; + struct ether_addr mac; +}; + +#define TLDK_MAX_DEST 0x10 + +struct tldk_ctx_conf { + ngx_uint_t worker; + uint32_t lcore; + uint32_t nb_mbuf; + uint32_t nb_stream; + uint32_t nb_rbuf; + uint32_t nb_sbuf; + uint32_t nb_dev; + uint32_t nb_dest; + uint32_t be_in_worker; + uint32_t tcp_timewait; /* TCP TIME_WAIT value in milliseconds */ + struct tldk_dev_conf dev[RTE_MAX_ETHPORTS]; + struct tldk_dest_conf dest[TLDK_MAX_DEST]; +}; + +typedef struct tldk_conf tldk_conf_t; + +struct tldk_conf { + uint32_t eal_argc; + char *eal_argv[NGX_CONF_MAX_ARGS]; + char eal_cmd[PATH_MAX]; + uint32_t nb_port; + struct tldk_port_conf port[RTE_MAX_ETHPORTS]; + uint32_t nb_ctx; + struct tldk_ctx_conf ctx[RTE_MAX_LCORE]; +}; + +extern char *tldk_block_parse(ngx_conf_t *, ngx_command_t *, void *); +extern char *tldk_ctx_parse(ngx_conf_t *, ngx_command_t *, void *); + +struct pkt_buf { + uint32_t num; + struct rte_mbuf *pkt[2 * MAX_PKT_BURST]; +}; + +struct tldk_dev { + struct tle_dev *dev; + struct tldk_dev_conf cf; + struct { + uint64_t in; + uint64_t up; + uint64_t drop; + } rx_stat; + struct { + uint64_t down; + uint64_t out; + uint64_t drop; + } tx_stat; + struct pkt_buf tx_buf; +}; + +#define LCORE_MAX_DST (UINT8_MAX + 1) + +struct tldk_ctx { + const struct tldk_ctx_conf *cf; + struct rte_lpm *lpm4; + struct rte_lpm6 *lpm6; + struct tle_ctx *ctx; + struct rte_mempool *mpool; + struct rte_mempool *frag_mpool; + uint32_t nb_dev; + struct tldk_dev dev[RTE_MAX_ETHPORTS]; + uint32_t dst4_num; + uint32_t dst6_num; + struct tle_dest dst4[LCORE_MAX_DST]; + struct tle_dest dst6[LCORE_MAX_DST]; + struct { + uint64_t flags[UINT8_MAX + 1]; + } tcp_stat; +} __rte_cache_aligned; + +extern struct tldk_ctx wrk2ctx[RTE_MAX_LCORE]; + +struct lcore_ctxs_list { + uint32_t nb_ctxs; + struct tldk_ctx *ctxs[MAX_CTX_PER_LOCRE]; +}; + +/* helper macros */ +#define DUMMY_MACRO do {} while (0) + +#ifdef BE_DEBUG +#define BE_TRACE(fmt, arg...) printf(fmt, ##arg) +#define BE_PKT_DUMP(p) rte_pktmbuf_dump(stdout, (p), 74) +#else +#define BE_TRACE(fmt, arg...) DUMMY_MACRO +#define BE_PKT_DUMP(p) DUMMY_MACRO +#endif + +#ifdef FE_DEBUG +#define FE_TRACE(fmt, arg...) printf(fmt, ##arg) +#define FE_PKT_DUMP(p) rte_pktmbuf_dump(stdout, (p), 74) +#else +#define FE_TRACE(fmt, arg...) DUMMY_MACRO +#define FE_PKT_DUMP(p) DUMMY_MACRO +#endif + + +#endif /* __NGX_TLDK_H__ */ diff --git a/app/nginx/src/tldk/parse.c b/app/nginx/src/tldk/parse.c new file mode 100644 index 0000000..5d71d9e --- /dev/null +++ b/app/nginx/src/tldk/parse.c @@ -0,0 +1,456 @@ +/* + * Copyright (c) 2017 Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <ngx_config.h> +#include <ngx_core.h> + +#include <ngx_tldk.h> + +union parse_val { + uint64_t u64; + struct { + uint16_t family; + union { + struct in_addr addr4; + struct in6_addr addr6; + }; + } in; + struct ether_addr mac; + rte_cpuset_t cpuset; +}; + +struct key_handler { + const char *name; + int (*func)(const char *, void *); +}; + +static int +parse_uint_val(const char *val, void *prm) +{ + union parse_val *rv; + unsigned long v; + char *end; + + rv = prm; + errno = 0; + v = strtoul(val, &end, 0); + if (errno != 0 || end[0] != 0 || v > UINT32_MAX) + return -EINVAL; + + rv->u64 = v; + return 0; +} + +static int +parse_ipv4_val(const char *val, void *prm) +{ + union parse_val *rv; + + rv = prm; + if (inet_pton(AF_INET, val, &rv->in.addr4) != 1) + return -EINVAL; + rv->in.family = AF_INET; + return 0; +} + +static int +parse_ipv6_val(const char *val, void *prm) +{ + union parse_val *rv; + + rv = prm; + if (inet_pton(AF_INET6, val, &rv->in.addr6) != 1) + return -EINVAL; + rv->in.family = AF_INET6; + return 0; +} + +static int +parse_ip_val(const char *val, void *prm) +{ + if (parse_ipv6_val(val, prm) != 0 && + parse_ipv4_val(val, prm) != 0) + return -EINVAL; + return 0; +} + +#define PARSE_UINT8x16(s, v, l) \ +do { \ + char *end; \ + unsigned long t; \ + errno = 0; \ + t = strtoul((s), &end, 16); \ + if (errno != 0 || end[0] != (l) || t > UINT8_MAX) \ + return -EINVAL; \ + (s) = end + 1; \ + (v) = t; \ +} while (0) + +static int +parse_mac_val(const char *val, void *prm) +{ + union parse_val *rv; + const char *s; + + rv = prm; + s = val; + + PARSE_UINT8x16(s, rv->mac.addr_bytes[0], ':'); + PARSE_UINT8x16(s, rv->mac.addr_bytes[1], ':'); + PARSE_UINT8x16(s, rv->mac.addr_bytes[2], ':'); + PARSE_UINT8x16(s, rv->mac.addr_bytes[3], ':'); + PARSE_UINT8x16(s, rv->mac.addr_bytes[4], ':'); + PARSE_UINT8x16(s, rv->mac.addr_bytes[5], 0); + return 0; +} + +static char * +tldk_port_parse(ngx_conf_t *cf, struct tldk_port_conf *prt) +{ + uint32_t i, j; + ngx_str_t *v; + + static const struct key_handler kh[] = { + { + .name = "port", + .func = parse_uint_val, + }, + { + .name = "mtu", + .func = parse_uint_val, + }, + { + .name = "rx_offload", + .func = parse_uint_val, + }, + { + .name = "tx_offload", + .func = 
parse_uint_val, + }, + { + .name = "ipv4", + .func = parse_ipv4_val, + }, + { + .name = "ipv6", + .func = parse_ipv6_val, + }, + }; + + union parse_val pvl[RTE_DIM(kh)]; + + memset(pvl, 0, sizeof(pvl)); + pvl[1].u64 = ETHER_MAX_LEN - ETHER_CRC_LEN; + + if (cf->args->nelts % 2 != 0) + return NGX_CONF_ERROR; + + v = cf->args->elts; + for (i = 0; i != cf->args->nelts; i += 2) { + + for (j = 0; j != RTE_DIM(kh); j++) { + if (ngx_strcmp(v[i].data, kh[j].name) == 0) { + if (kh[j].func((const char *)v[i + 1].data, + pvl + j) < 0) + return NGX_CONF_ERROR; + else + break; + } + } + + /* unknow key */ + if (j == RTE_DIM(kh)) + return NGX_CONF_ERROR; + } + + memset(prt, 0, sizeof(*prt)); + + prt->id = pvl[0].u64; + prt->mtu = pvl[1].u64; + prt->rx_offload = pvl[2].u64; + prt->tx_offload = pvl[3].u64; + prt->ipv4 = pvl[4].in.addr4.s_addr; + prt->ipv6 = pvl[5].in.addr6; + + return NGX_CONF_OK; +} + +static char * +tldk_dev_parse(ngx_conf_t *cf, struct tldk_dev_conf *dev, + tldk_conf_t *tcf) +{ + uint32_t i, j; + ngx_str_t *v; + + static const struct key_handler kh[] = { + { + .name = "dev", + .func = parse_uint_val, + }, + { + .name = "port", + .func = parse_uint_val, + }, + { + .name = "queue", + .func = parse_uint_val, + }, + }; + + union parse_val pvl[RTE_DIM(kh)]; + + memset(pvl, 0, sizeof(pvl)); + + if (cf->args->nelts % 2 != 0) + return NGX_CONF_ERROR; + + v = cf->args->elts; + for (i = 0; i != cf->args->nelts; i += 2) { + + for (j = 0; j != RTE_DIM(kh); j++) { + if (ngx_strcmp(v[i].data, kh[j].name) == 0) { + if (kh[j].func((const char *)v[i + 1].data, + pvl + j) < 0) + return NGX_CONF_ERROR; + else + break; + } + } + + /* unknow key */ + if (j == RTE_DIM(kh)) + return NGX_CONF_ERROR; + } + + memset(dev, 0, sizeof(*dev)); + + dev->id = pvl[0].u64; + dev->port = pvl[1].u64; + dev->queue = pvl[2].u64; + + return NGX_CONF_OK; +} + +static char * +tldk_dest_parse(ngx_conf_t *cf, struct tldk_dest_conf *dst) +{ + uint32_t i, j; + ngx_str_t *v; + + static const struct key_handler kh[] = { + { + .name = "dev", + .func = parse_uint_val, + }, + { + .name = "mtu", + .func = parse_uint_val, + }, + { + .name = "masklen", + .func = parse_uint_val, + }, + { + .name = "addr", + .func = parse_ip_val, + }, + { + .name = "mac", + .func = parse_mac_val, + }, + }; + + union parse_val pvl[RTE_DIM(kh)]; + + memset(pvl, 0, sizeof(pvl)); + pvl[1].u64 = ETHER_MAX_LEN - ETHER_CRC_LEN; + + if (cf->args->nelts % 2 != 1 || cf->args->nelts == 1) + return NGX_CONF_ERROR; + + v = cf->args->elts; + for (i = 1; i != cf->args->nelts; i += 2) { + + for (j = 0; j != RTE_DIM(kh); j++) { + if (ngx_strcmp(v[i].data, kh[j].name) == 0) { + if (kh[j].func((const char *)v[i + 1].data, + pvl + j) < 0) + return NGX_CONF_ERROR; + else + break; + } + } + + /* unknow key */ + if (j == RTE_DIM(kh)) + return NGX_CONF_ERROR; + } + + memset(dst, 0, sizeof(*dst)); + + dst->dev = pvl[0].u64; + dst->mtu = pvl[1].u64; + dst->prfx = pvl[2].u64; + + dst->family = pvl[3].in.family; + if (pvl[3].in.family == AF_INET) + dst->ipv4 = pvl[3].in.addr4; + else + dst->ipv6 = pvl[3].in.addr6; + + memcpy(&dst->mac, &pvl[4].mac, sizeof(dst->mac)); + + return NGX_CONF_OK; +} + +char * +tldk_block_parse(ngx_conf_t *cf, ngx_command_t *dummy, void *conf) +{ + uint32_t i, len, n; + tldk_conf_t *tcf; + ngx_str_t *v; + char *rv, *s; + struct tldk_port_conf prt; + + tcf = (tldk_conf_t *)((void **)conf)[0]; + v = cf->args->elts; + + if (ngx_strcmp(v[0].data, "eal_cmd") == 0) { + + if (cf->args->nelts == 1 || + cf->args->nelts > RTE_DIM(tcf->eal_argv)) + return 
NGX_CONF_ERROR; + + s = tcf->eal_cmd; + len = sizeof(tcf->eal_cmd); + for (i = 0; i != cf->args->nelts; i++) { + n = snprintf(s, len, "%s", v[i].data) + 1; + if (n > len) + return NGX_CONF_ERROR; + tcf->eal_argv[i] = s; + s += n; + len -= n; + } + + tcf->eal_argc = i; + return NGX_CONF_OK; + + } else if (ngx_strcmp(v[0].data, "port") == 0) { + + rv = tldk_port_parse(cf, &prt); + if (rv == NGX_CONF_OK) { + + /* too many ports */ + if (tcf->nb_port >= RTE_DIM(tcf->port)) + return NGX_CONF_ERROR; + + /* copy stuff */ + tcf->port[tcf->nb_port++] = prt; + } + return rv; + } + + return NGX_CONF_ERROR; +} + +char * +tldk_ctx_parse(ngx_conf_t *cf, ngx_command_t *dummy, void *conf) +{ + char *rv; + ngx_str_t *v; + tldk_conf_t *tcf; + struct tldk_ctx_conf *tcx; + union parse_val pvl; + + tcf = (tldk_conf_t *)((void **)conf)[0]; + tcx = tcf->ctx + tcf->nb_ctx; + v = cf->args->elts; + + if (ngx_strcmp(v[0].data, "worker") == 0) { + if (cf->args->nelts != 2 || + parse_uint_val((const char *)v[1].data, + &pvl) < 0) + return NGX_CONF_ERROR; + tcx->worker = pvl.u64; + } else if (ngx_strcmp(v[0].data, "lcore") == 0) { + if (cf->args->nelts != 2 || + parse_uint_val((const char *)v[1].data, + &pvl) < 0) + return NGX_CONF_ERROR; + tcx->lcore = pvl.u64; + } else if (ngx_strcmp(v[0].data, "mbufs") == 0) { + if (cf->args->nelts != 2 || + parse_uint_val((const char *)v[1].data, + &pvl) < 0) + return NGX_CONF_ERROR; + tcx->nb_mbuf = pvl.u64; + } else if (ngx_strcmp(v[0].data, "streams") == 0) { + if (cf->args->nelts != 2 || + parse_uint_val((const char *)v[1].data, + &pvl) < 0) + return NGX_CONF_ERROR; + tcx->nb_stream = pvl.u64; + } else if (ngx_strcmp(v[0].data, "rbufs") == 0) { + if (cf->args->nelts != 2 || + parse_uint_val((const char *)v[1].data, + &pvl) < 0) + return NGX_CONF_ERROR; + tcx->nb_rbuf = pvl.u64; + } else if (ngx_strcmp(v[0].data, "sbufs") == 0) { + if (cf->args->nelts != 2 || + parse_uint_val((const char *)v[1].data, + &pvl) < 0) + return NGX_CONF_ERROR; + tcx->nb_sbuf = pvl.u64; + } else if (ngx_strcmp(v[0].data, "be_in_worker") == 0) { + if (cf->args->nelts != 1) + return NGX_CONF_ERROR; + tcx->be_in_worker = 1; + } else if (ngx_strcmp(v[0].data, "tcp_timewait") == 0) { + if (cf->args->nelts != 2 || + parse_uint_val((const char *)v[1].data, + &pvl) < 0) + return NGX_CONF_ERROR; + tcx->tcp_timewait = pvl.u64; + } else if (ngx_strcmp(v[0].data, "dev") == 0) { + if (tcx->nb_dev >= RTE_DIM(tcx->dev)) + return NGX_CONF_ERROR; + rv = tldk_dev_parse(cf, tcx->dev + tcx->nb_dev, tcf); + if (rv != NGX_CONF_OK) + return rv; + tcx->nb_dev++; + return rv; + } else if (ngx_strcmp(v[0].data, "dest") == 0) { + if (tcx->nb_dest >= RTE_DIM(tcx->dest)) + return NGX_CONF_ERROR; + rv = tldk_dest_parse(cf, tcx->dest + tcx->nb_dest); + if (rv != NGX_CONF_OK) + return rv; + tcx->nb_dest++; + } + + return NGX_CONF_OK; +} diff --git a/app/nginx/src/tldk/tldk_event.c b/app/nginx/src/tldk/tldk_event.c new file mode 100644 index 0000000..4630326 --- /dev/null +++ b/app/nginx/src/tldk/tldk_event.c @@ -0,0 +1,276 @@ +/* + * Copyright (c) 2017 Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <ngx_tldk.h> +#include <tldk_sock.h> + +#include <rte_cycles.h> + +#include "be.h" +#include "debug.h" + +#define EVENT_BULK 32 + +enum { + EV_ACCEPT, + EV_RECV, + EV_SEND, + EV_ERR, + EV_NUM +}; + +struct tldk_event_stat { + uint64_t nb_get[EV_NUM]; + uint64_t nb_post[EV_NUM]; +}; + +static struct tldk_event_stat event_stat; + +extern ngx_event_module_t tldk_event_module; + +/* + * TLDK event module implementation + */ + +static ngx_int_t +tldk_add_event(ngx_event_t *ev, ngx_int_t event, ngx_uint_t flags) +{ + struct tldk_sock *ts; + ngx_connection_t *c; + + c = ev->data; + + FE_TRACE("%s(ev=%p,event=%#lx,flags=%#lx): fd=%d;\n", + __func__, ev, event, flags, c->fd); + + ts = sd_to_sock(c->fd); + if (ts == NULL) + return NGX_OK; + + if (event == NGX_READ_EVENT) { + tle_event_active(ts->rxev, TLE_SEV_DOWN); + tle_event_active(ts->erev, TLE_SEV_DOWN); + ts->rev = ev; + } else if (event == NGX_WRITE_EVENT) { + tle_event_active(ts->txev, TLE_SEV_DOWN); + tle_event_active(ts->erev, TLE_SEV_DOWN); + ts->wev = ev; + } + + ev->active = 1; + return NGX_OK; +} + +static ngx_int_t +tldk_del_event(ngx_event_t *ev, ngx_int_t event, ngx_uint_t flags) +{ + struct tldk_sock *ts; + ngx_connection_t *c; + + c = ev->data; + + FE_TRACE("%s(ev=%p,event=%#lx,flags=%#lx): fd=%d;\n", + __func__, ev, event, flags, c->fd); + + ev->active = 0; + if ((flags & NGX_CLOSE_EVENT) != 0) + return NGX_OK; + + ts = sd_to_sock(c->fd); + if (ts == NULL) + return NGX_OK; + + if (event == NGX_READ_EVENT) { + tle_event_down(ts->rxev); + tle_event_down(ts->erev); + ts->rev = NULL; + } else if (event == NGX_WRITE_EVENT) { + tle_event_down(ts->txev); + tle_event_down(ts->erev); + ts->wev = NULL; + } + + return NGX_OK; +} + +static inline void +post_event(ngx_event_t *ev, ngx_queue_t *q, ngx_uint_t flags, uint32_t type) +{ + if (ev != NULL && ev->active == 1) { + ev->ready = 1; + event_stat.nb_post[type]++; + if ((flags & NGX_POST_EVENTS) != 0) { + ngx_post_event(ev, q); + } else + ev->handler(ev); + } +} + +static ngx_int_t +tldk_process_events(ngx_cycle_t *cycle, ngx_msec_t timer, ngx_uint_t flags) +{ + uint32_t i, n, ne, nr, ns, nt; + uint64_t tme, tms, tmw; + struct tldk_sock *te[EVENT_BULK]; + struct tldk_sock *tr[EVENT_BULK]; + struct tldk_sock *ts[EVENT_BULK]; + struct tldk_sock *tt[EVENT_BULK]; + struct tldk_ctx *tcx; + + FE_TRACE("%s(cycle=%p,timer=%lu,flags=%#lx);\n", + __func__, cycle, timer, flags); + + tcx = wrk2ctx + ngx_worker; + + tms = rte_get_tsc_cycles(); + tme = (timer == NGX_TIMER_INFINITE) ? 
timer : + timer * (rte_get_tsc_hz() + MS_PER_S - 1) / MS_PER_S; + tmw = 0; + n = 0; + + do { + if (tcx->cf->be_in_worker != 0) + be_lcore_tcp(tcx); + + ns = tle_evq_get(stbl.syneq, (const void **)(uintptr_t)ts, + RTE_DIM(ts)); + nr = tle_evq_get(stbl.rxeq, (const void **)(uintptr_t)tr, + RTE_DIM(tr)); + nt = tle_evq_get(stbl.txeq, (const void **)(uintptr_t)tt, + RTE_DIM(tt)); + ne = tle_evq_get(stbl.ereq, (const void **)(uintptr_t)te, + RTE_DIM(te)); + n = ne + nr + ns + nt; + + if (n != 0) { + event_stat.nb_get[EV_ACCEPT] += ns; + event_stat.nb_get[EV_RECV] += nr; + event_stat.nb_get[EV_SEND] += nt; + event_stat.nb_get[EV_ERR] += ne; + break; + } + + if (tcx->cf->be_in_worker == 0) + //sched_yield(); + rte_delay_us(1); + + tmw += rte_get_tsc_cycles() - tms; + + } while (tmw < tme && ngx_quit == 0 && ngx_terminate == 0); + + if ((flags & NGX_UPDATE_TIME) != 0 || ngx_event_timer_alarm) + ngx_time_update(); + + if (n == 0) + return NGX_OK; + + for (i = 0; i != ns; i++) + post_event(ts[i]->rev, &ngx_posted_accept_events, flags, + EV_ACCEPT); + + for (i = 0; i != nr; i++) + post_event(tr[i]->rev, &ngx_posted_events, flags, EV_RECV); + + for (i = 0; i != nt; i++) + post_event(tt[i]->wev, &ngx_posted_events, flags, EV_SEND); + + for (i = 0; i != ne; i++) { + te[i]->posterr++; + post_event(te[i]->rev, &ngx_posted_events, flags, EV_ERR); + post_event(te[i]->wev, &ngx_posted_events, flags, EV_ERR); + } + + return NGX_OK; +} + +static ngx_int_t +tldk_init_events(ngx_cycle_t *cycle, ngx_msec_t timer) +{ + FE_TRACE("%s(cycle=%p,timer=%lu);\n", + __func__, cycle, timer); + + /* overwrite event actions for worker process */ + ngx_event_actions = tldk_event_module.actions; + ngx_event_flags = NGX_USE_LEVEL_EVENT; + + ngx_io = ngx_os_io; + return NGX_OK; +} + +void +tldk_dump_event_stats(void) +{ + static const char * const name[EV_NUM] = { + "ACCEPT", + "RECV", + "SEND", + "ERR", + }; + + uint32_t i; + + RTE_LOG(NOTICE, USER1, "%s(worker=%lu)={\n", __func__, ngx_worker); + for (i = 0; i != RTE_DIM(name); i++) + RTE_LOG(NOTICE, USER1, + "%s[GET, POST]={%" PRIu64 ", %" PRIu64 "};\n", + name[i], event_stat.nb_get[i], event_stat.nb_post[i]); + RTE_LOG(NOTICE, USER1, "};\n"); +} + +static void +tldk_done_events(ngx_cycle_t *cycle) +{ +} + +static ngx_str_t tldk_name = ngx_string("tldk"); + +ngx_event_module_t tldk_event_module = { + .name = &tldk_name, + .actions = { + .add = tldk_add_event, + .del = tldk_del_event, + .enable = tldk_add_event, + .disable = tldk_del_event, + .process_events = tldk_process_events, + .init = tldk_init_events, + .done = tldk_done_events, + }, +}; + +ngx_module_t ngx_tldk_event_module = { + NGX_MODULE_V1, + &tldk_event_module, /* module context */ + NULL, /* module directives */ + NGX_EVENT_MODULE, /* module type */ + NULL, /* init master */ + NULL, /* init module */ + NULL, /* init process */ + NULL, /* init thread */ + NULL, /* exit thread */ + NULL, /* exit process */ + NULL, /* exit master */ + NGX_MODULE_V1_PADDING +}; diff --git a/app/nginx/src/tldk/tldk_sock.c b/app/nginx/src/tldk/tldk_sock.c new file mode 100644 index 0000000..7831fc3 --- /dev/null +++ b/app/nginx/src/tldk/tldk_sock.c @@ -0,0 +1,549 @@ +/* + * Copyright (c) 2017 Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <ngx_tldk.h> +#include <tldk_sock.h> + +#include <rte_malloc.h> +#include <rte_errno.h> + +struct tldk_sock_stat { + uint64_t nb_accept; + uint64_t nb_close; + uint64_t nb_readv; + uint64_t nb_recv; + uint64_t nb_setopts; + uint64_t nb_shutdown; + uint64_t nb_writev; +}; + +static struct tldk_sock_stat sock_stat; + +/* One socket/file table per worker */ +struct tldk_stbl stbl = { + .snum = 0, +}; + +static int (*real_accept4)(int, struct sockaddr *, socklen_t *, int); +static int (*real_close)(int); +static ssize_t (*real_readv)(int, const struct iovec *, int); +static ssize_t (*real_recv)(int, void *, size_t, int); +static int (*real_setsockopt)(int, int, int, const void *, socklen_t); +static int (*real_shutdown)(int, int); +static ssize_t (*real_writev)(int, const struct iovec *, int); + +static inline uint32_t +get_socks(struct tldk_sock_list *list, struct tldk_sock *rs[], + uint32_t num) +{ + struct tldk_sock *s; + uint32_t i, n; + + n = RTE_MIN(list->num, num); + for (i = 0, s = LIST_FIRST(&list->head); + i != n; + i++, s = LIST_NEXT(s, link)) { + rs[i] = s; + } + + /* we retrieved all free entries */ + if (s == NULL) + LIST_INIT(&list->head); + else + LIST_FIRST(&list->head) = s; + + list->num -= n; + return n; +} + +static inline struct tldk_sock * +get_sock(struct tldk_sock_list *list) +{ + struct tldk_sock *s; + + if (get_socks(list, &s, 1) != 1) + return NULL; + + return s; +} + +static inline void +put_socks(struct tldk_sock_list *list, struct tldk_sock *fs[], uint32_t num) +{ + uint32_t i; + + for (i = 0; i != num; i++) + LIST_INSERT_HEAD(&list->head, fs[i], link); + list->num += num; +} + +static inline void +put_sock(struct tldk_sock_list *list, struct tldk_sock *s) +{ + put_socks(list, &s, 1); +} + +static inline void +rem_sock(struct tldk_sock_list *list, struct tldk_sock *s) +{ + LIST_REMOVE(s, link); + list->num--; +} + +static void +term_sock(struct tldk_sock *ts) +{ + tle_event_idle(ts->erev); + tle_event_idle(ts->rxev); + tle_event_idle(ts->txev); + tle_tcp_stream_close(ts->s); + ts->s = NULL; + ts->posterr = 0; +} + +static int32_t +close_sock(struct tldk_sock *ts) +{ + if (ts->s == NULL) + return EBADF; + term_sock(ts); + rem_sock(&stbl.use, ts); + put_sock(&stbl.free, ts); + return 0; +} + +static void +dump_sock_stats(void) +{ + RTE_LOG(NOTICE, USER1, "%s(worker=%lu)={\n" + "nb_accept=%" PRIu64 ";\n" + "nb_close=%" PRIu64 ";\n" + "nb_readv=%" PRIu64 ";\n" + "nb_recv=%" PRIu64 ";\n" + "nb_setopts=%" PRIu64 ";\n" + "nb_shutdown=%" PRIu64 ";\n" + "nb_writev=%" PRIu64 ";\n" 
+ "};\n", + __func__, + ngx_worker, + sock_stat.nb_accept, + sock_stat.nb_close, + sock_stat.nb_readv, + sock_stat.nb_recv, + sock_stat.nb_setopts, + sock_stat.nb_shutdown, + sock_stat.nb_writev); +} + +void +tldk_stbl_fini(void) +{ + dump_sock_stats(); + tldk_dump_event_stats(); + rte_free(stbl.sd); + tle_evq_destroy(stbl.txeq); + tle_evq_destroy(stbl.rxeq); + tle_evq_destroy(stbl.ereq); + tle_evq_destroy(stbl.syneq); +} + +#define INIT_FUNC(func) do { \ + real_##func = dlsym(RTLD_NEXT, #func); \ + RTE_ASSERT(real_##func); \ +} while (0) + +static void __attribute__((constructor)) +stub_init(void) +{ + INIT_FUNC(accept4); + INIT_FUNC(close); + INIT_FUNC(readv); + INIT_FUNC(recv); + INIT_FUNC(setsockopt); + INIT_FUNC(shutdown); + INIT_FUNC(writev); +} + +#undef INIT_FUNC + +int +tldk_stbl_init(const ngx_cycle_t *cycle, const struct tldk_ctx *tc) +{ + uint32_t i, lc, sid, sn; + size_t sz; + struct tle_evq_param eprm; + struct rlimit rlim; + + lc = tc->cf->lcore; + sn = tc->cf->nb_stream; + sid = rte_lcore_to_socket_id(lc); + + if (sn < cycle->listening.nelts + cycle->connection_n) + return -EINVAL; + + if (getrlimit(RLIMIT_NOFILE, &rlim) != 0) + return -errno; + + stbl.nosd = rlim.rlim_max; + + /* allocate event queues */ + + memset(&eprm, 0, sizeof(eprm)); + eprm.socket_id = sid; + eprm.max_events = sn; + + stbl.syneq = tle_evq_create(&eprm); + stbl.ereq = tle_evq_create(&eprm); + stbl.rxeq = tle_evq_create(&eprm); + stbl.txeq = tle_evq_create(&eprm); + + RTE_LOG(NOTICE, USER1, "%s(lcore=%u, worker=%lu): " + "synevq=%p, erevq=%p, rxevq=%p, txevq=%p\n", + __func__, lc, ngx_worker, + stbl.syneq, stbl.ereq, stbl.rxeq, stbl.txeq); + if (stbl.syneq == NULL || stbl.ereq == NULL || stbl.rxeq == NULL || + stbl.txeq == NULL) + return -ENOMEM; + + LIST_INIT(&stbl.lstn.head); + LIST_INIT(&stbl.free.head); + LIST_INIT(&stbl.use.head); + + sz = sn * sizeof(*stbl.sd); + stbl.sd = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE, + rte_lcore_to_socket_id(lc)); + + if (stbl.sd == NULL) { + RTE_LOG(ERR, USER1, "%s(lcore=%u, worker=%lu): " + "failed to allocate %zu bytes\n", + __func__, lc, ngx_worker, sz); + return -ENOMEM; + } + + stbl.snum = sn; + + /* init listen socks */ + for (i = 0; i != cycle->listening.nelts; i++) { + stbl.sd[i].rxev = tle_event_alloc(stbl.syneq, stbl.sd + i); + stbl.sd[i].txev = tle_event_alloc(stbl.txeq, stbl.sd + i); + stbl.sd[i].erev = tle_event_alloc(stbl.ereq, stbl.sd + i); + put_sock(&stbl.lstn, stbl.sd + i); + } + + /* init worker connection socks */ + for (; i != sn; i++) { + stbl.sd[i].rxev = tle_event_alloc(stbl.rxeq, stbl.sd + i); + stbl.sd[i].txev = tle_event_alloc(stbl.txeq, stbl.sd + i); + stbl.sd[i].erev = tle_event_alloc(stbl.ereq, stbl.sd + i); + put_sock(&stbl.free, stbl.sd + i); + } + + return 0; +} + +int +tldk_open_bind_listen(struct tldk_ctx *tcx, int domain, int type, + const struct sockaddr *addr, socklen_t addrlen, int backlog) +{ + int32_t rc; + struct tldk_sock *ts; + struct tle_tcp_stream_param sprm; + + ts = get_sock(&stbl.lstn); + if (ts == NULL) { + errno = ENOBUFS; + return -1; + } + + tle_event_active(ts->erev, TLE_SEV_DOWN); + tle_event_active(ts->rxev, TLE_SEV_DOWN); + tle_event_active(ts->txev, TLE_SEV_DOWN); + + /* setup stream parameters */ + + memset(&sprm, 0, sizeof(sprm)); + + sprm.cfg.err_ev = ts->erev; + sprm.cfg.recv_ev = ts->rxev; + sprm.cfg.send_ev = ts->txev; + + memcpy(&sprm.addr.local, addr, addrlen); + sprm.addr.remote.ss_family = sprm.addr.local.ss_family; + + ts->s = tle_tcp_stream_open(tcx->ctx, &sprm); + if (ts->s != NULL) + rc 
= tle_tcp_stream_listen(ts->s);
+	else
+		rc = -rte_errno;
+
+	if (rc != 0) {
+		term_sock(ts);
+		put_sock(&stbl.lstn, ts);
+		errno = -rc;
+		return -1;
+	}
+
+	return SOCK_TO_SD(ts);
+}
+
+/*
+ * socket API
+ */
+
+int
+close(int sd)
+{
+	int32_t rc;
+	struct tldk_sock *ts;
+
+	FE_TRACE("worker#%lu: %s(%d);\n",
+		ngx_worker, __func__, sd);
+
+	ts = sd_to_sock(sd);
+	if (ts == NULL)
+		return real_close(sd);
+
+	sock_stat.nb_close++;
+
+	rc = close_sock(ts);
+	if (rc != 0) {
+		errno = -rc;
+		return -1;
+	}
+	return 0;
+}
+
+int
+shutdown(int sd, int how)
+{
+	struct tldk_sock *ts;
+
+	FE_TRACE("worker#%lu: %s(%d, %#x);\n",
+		ngx_worker, __func__, sd, how);
+
+	ts = sd_to_sock(sd);
+	if (ts == NULL)
+		return real_shutdown(sd, how);
+
+	sock_stat.nb_shutdown++;
+
+	errno = ENOTSUP;
+	return -1;
+}
+
+
+int
+accept4(int sd, struct sockaddr *addr, socklen_t *addrlen, int flags)
+{
+	uint32_t n, slen;
+	struct tle_stream *s;
+	struct tldk_sock *cs, *ts;
+	struct tle_tcp_stream_cfg prm;
+	struct tle_tcp_stream_addr sa;
+
+	FE_TRACE("worker#%lu: %s(%d, %p, %p, %#x);\n",
+		ngx_worker, __func__, sd, addr, addrlen, flags);
+
+	ts = sd_to_sock(sd);
+	if (ts == NULL)
+		return real_accept4(sd, addr, addrlen, flags);
+	else if (ts->s == NULL) {
+		errno = EBADF;
+		return -1;
+	}
+
+	sock_stat.nb_accept++;
+
+	n = ts->acpt.num;
+	if (n == 0) {
+		n = tle_tcp_stream_accept(ts->s, ts->acpt.buf,
+			RTE_DIM(ts->acpt.buf));
+		if (n == 0) {
+			errno = EAGAIN;
+			return -1;
+		}
+	}
+
+	s = ts->acpt.buf[n - 1];
+	ts->acpt.num = n - 1;
+
+	tle_event_raise(ts->rxev);
+
+	cs = get_sock(&stbl.free);
+	if (cs == NULL) {
+		tle_tcp_stream_close(s);
+		errno = ENOBUFS;
+		return -1;
+	}
+
+	cs->s = s;
+	put_sock(&stbl.use, cs);
+
+	tle_event_active(cs->erev, TLE_SEV_DOWN);
+	tle_event_active(cs->rxev, TLE_SEV_DOWN);
+	tle_event_active(cs->txev, TLE_SEV_DOWN);
+
+	memset(&prm, 0, sizeof(prm));
+	prm.recv_ev = cs->rxev;
+	prm.send_ev = cs->txev;
+	prm.err_ev = cs->erev;
+	tle_tcp_stream_update_cfg(&s, &prm, 1);
+
+	if (tle_tcp_stream_get_addr(s, &sa) == 0) {
+
+		if (sa.remote.ss_family == AF_INET)
+			slen = sizeof(struct sockaddr_in);
+		else if (sa.remote.ss_family == AF_INET6)
+			slen = sizeof(struct sockaddr_in6);
+		else
+			slen = 0;
+
+		slen = RTE_MIN(slen, *addrlen);
+		memcpy(addr, &sa.remote, slen);
+		*addrlen = slen;
+	}
+
+	return SOCK_TO_SD(cs);
+}
+
+ssize_t
+recv(int sd, void *buf, size_t len, int flags)
+{
+	ssize_t sz;
+	struct tldk_sock *ts;
+	struct iovec iv;
+
+	FE_TRACE("worker#%lu: %s(%d, %p, %zu, %#x);\n",
+		ngx_worker, __func__, sd, buf, len, flags);
+
+	ts = sd_to_sock(sd);
+	if (ts == NULL)
+		return real_recv(sd, buf, len, flags);
+	else if (ts->s == NULL) {
+		errno = EBADF;
+		return -1;
+	}
+
+	sock_stat.nb_recv++;
+
+	iv.iov_base = buf;
+	iv.iov_len = len;
+
+	sz = tle_tcp_stream_readv(ts->s, &iv, 1);
+	if (sz < 0)
+		errno = rte_errno;
+	else if (sz == 0 && ts->posterr == 0) {
+		errno = EAGAIN;
+		sz = -1;
+	}
+
+	FE_TRACE("worker#%lu: %s(%d, %p, %zu, %#x) returns %zd;\n",
+		ngx_worker, __func__, sd, buf, len, flags, sz);
+	return sz;
+}
+
+ssize_t
+readv(int sd, const struct iovec *iov, int iovcnt)
+{
+	ssize_t sz;
+	struct tldk_sock *ts;
+	struct tldk_ctx *tcx;
+
+	FE_TRACE("worker#%lu: %s(%d, %p, %d);\n",
+		ngx_worker, __func__, sd, iov, iovcnt);
+
+	tcx = wrk2ctx + ngx_worker;
+	ts = sd_to_sock(sd);
+	if (ts == NULL)
+		return real_readv(sd, iov, iovcnt);
+	else if (ts->s == NULL || tcx == NULL) {
+		errno = EBADF;
+		return -1;
+	}
+
+	sock_stat.nb_readv++;
+
+	sz = tle_tcp_stream_readv(ts->s, iov, iovcnt);
+	if
(sz < 0) + errno = rte_errno; + else if (sz == 0 && ts->posterr == 0) { + errno = EAGAIN; + sz = -1; + } + + FE_TRACE("worker#%lu: %s(%d, %p, %d) returns %zd;\n", + ngx_worker, __func__, sd, iov, iovcnt, sz); + return sz; +} + +ssize_t +writev(int sd, const struct iovec *iov, int iovcnt) +{ + ssize_t sz; + struct tldk_sock *ts; + struct tldk_ctx *tcx; + + FE_TRACE("worker#%lu: %s(%d, %p, %d);\n", + ngx_worker, __func__, sd, iov, iovcnt); + + tcx = wrk2ctx + ngx_worker; + ts = sd_to_sock(sd); + if (ts == NULL) + return real_writev(sd, iov, iovcnt); + else if (ts->s == NULL || tcx == NULL) { + errno = EBADF; + return -1; + } + + sock_stat.nb_writev++; + + sz = tle_tcp_stream_writev(ts->s, tcx->mpool, iov, iovcnt); + if (sz < 0) + errno = rte_errno; + + FE_TRACE("worker#%lu: %s(%d, %p, %d) returns %zd;\n", + ngx_worker, __func__, sd, iov, iovcnt, sz); + return sz; +} + +int +setsockopt(int sd, int level, int optname, const void *optval, socklen_t optlen) +{ + struct tldk_sock *ts; + + FE_TRACE("worker#%lu: %s(%d, %#x, %#x, %p, %d);\n", + ngx_worker, __func__, sd, level, optname, optval, optlen); + + ts = sd_to_sock(sd); + if (ts == NULL) + return real_setsockopt(sd, level, optname, optval, optlen); + else if (ts->s == NULL) { + errno = EBADF; + return -1; + } + + return 0; +} diff --git a/app/nginx/src/tldk/tldk_sock.h b/app/nginx/src/tldk/tldk_sock.h new file mode 100644 index 0000000..b9140b5 --- /dev/null +++ b/app/nginx/src/tldk/tldk_sock.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2017 Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#ifndef __TLDK_SOCK_H__
+#define __TLDK_SOCK_H__
+
+#include <stdint.h>
+#include <stddef.h>
+#include <inttypes.h>
+
+#include <ngx_config.h>
+#include <ngx_core.h>
+#include <ngx_event.h>
+
+#include <tle_tcp.h>
+
+extern ngx_module_t ngx_tldk_event_module;
+
+extern int tldk_stbl_init(const ngx_cycle_t *, const struct tldk_ctx *);
+extern void tldk_stbl_fini(void);
+
+extern int
+tldk_open_bind_listen(struct tldk_ctx *tcx, int domain, int type,
+	const struct sockaddr *addr, socklen_t addrlen, int backlog);
+
+extern void
+tldk_dump_event_stats(void);
+
+#define TLDK_ACCEPT_BULK 0x10
+
+struct tldk_sock {
+	LIST_ENTRY(tldk_sock) link;
+	struct tle_stream *s;
+	struct tle_event *erev;
+	struct tle_event *rxev;
+	struct tle_event *txev;
+	ngx_event_t *rev;
+	ngx_event_t *wev;
+	uint16_t posterr;
+	struct {
+		uint32_t num;
+		struct tle_stream *buf[TLDK_ACCEPT_BULK];
+	} acpt;
+};
+
+struct tldk_sock_list {
+	uint32_t num;
+	LIST_HEAD(, tldk_sock) head;
+};
+
+struct tldk_stbl {
+	struct tle_evq *syneq;
+	struct tle_evq *ereq;
+	struct tle_evq *rxeq;
+	struct tle_evq *txeq;
+	struct tldk_sock_list free;
+	struct tldk_sock_list lstn;
+	struct tldk_sock_list use;
+	int32_t nosd;
+	uint32_t snum;
+	struct tldk_sock *sd;
+};
+
+
+#define SOCK_TO_SD(s)	(stbl.nosd + ((s) - stbl.sd))
+#define SD_TO_SOCK(d)	(stbl.sd + ((d) - stbl.nosd))
+
+/* One socket/file table per worker */
+extern struct tldk_stbl stbl;
+
+static inline struct tldk_sock *
+sd_to_sock(int32_t sd)
+{
+	uint32_t n;
+
+	n = sd - stbl.nosd;
+	if (n >= stbl.snum)
+		return NULL;
+
+	return stbl.sd + n;
+}
+
+
+
+#endif /* __TLDK_SOCK_H__ */
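
All three directive parsers in parse.c above (tldk_port_parse, tldk_dev_parse, tldk_dest_parse) share one dispatch pattern: configuration tokens arrive as "key value" pairs, each key selects a handler from a static key_handler table, and the parsed result lands in the parse_val slot with the same index as the table entry, so unknown keys are rejected when the inner loop runs off the end. The standalone sketch below reproduces that pattern outside nginx; the token array and the two-entry table are illustrative stand-ins for cf->args and the module's real tables.

#include <errno.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

union parse_val { uint64_t u64; };

struct key_handler {
	const char *name;
	int (*func)(const char *, void *);
};

/* same contract as the module's parse_uint_val(): full-string uint32 */
static int
parse_uint_val(const char *val, void *prm)
{
	union parse_val *rv = prm;
	unsigned long v;
	char *end;

	errno = 0;
	v = strtoul(val, &end, 0);
	if (errno != 0 || end[0] != 0 || v > UINT32_MAX)
		return -EINVAL;
	rv->u64 = v;
	return 0;
}

int
main(void)
{
	static const struct key_handler kh[] = {
		{ .name = "port", .func = parse_uint_val },
		{ .name = "mtu", .func = parse_uint_val },
	};
	/* stands in for cf->args->elts: "port 0 mtu 1514" */
	const char *tok[] = { "port", "0", "mtu", "1514" };
	union parse_val pvl[2];
	uint32_t i, j;

	memset(pvl, 0, sizeof(pvl));
	for (i = 0; i != 4; i += 2) {
		for (j = 0; j != 2; j++) {
			if (strcmp(tok[i], kh[j].name) == 0) {
				if (kh[j].func(tok[i + 1], pvl + j) < 0)
					return EXIT_FAILURE;
				break;
			}
		}
		/* unknown key */
		if (j == 2)
			return EXIT_FAILURE;
	}
	printf("port=%" PRIu64 " mtu=%" PRIu64 "\n", pvl[0].u64, pvl[1].u64);
	return 0;
}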
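
tldk_process_events() in tldk_event.c converts the millisecond timer that nginx passes in into a TSC-cycle budget before entering its busy-poll loop, then accumulates elapsed cycles in tmw until the budget expires or events arrive. A minimal sketch of that conversion, assuming a fixed 2.4 GHz rate in place of rte_get_tsc_hz() (MS_PER_S is 1000); the "+ MS_PER_S - 1" term biases the per-millisecond cycle count upward, so the loop never polls for less than the requested timeout.

#include <inttypes.h>
#include <stdio.h>

#define MS_PER_S 1000ULL

int
main(void)
{
	uint64_t tsc_hz = 2400000000ULL;	/* assumed 2.4 GHz TSC */
	uint64_t timer_ms = 3;			/* nginx timer argument */

	/* same arithmetic as the tme computation in tldk_process_events() */
	uint64_t tme = timer_ms * (tsc_hz + MS_PER_S - 1) / MS_PER_S;

	printf("%" PRIu64 " ms -> %" PRIu64 " cycles\n", timer_ms, tme);
	return 0;
}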
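
tldk_sock.c takes over close(), recv(), readv(), writev(), shutdown(), setsockopt(), and accept4() by defining them itself and, in the constructor stub_init(), capturing the next definition in the symbol lookup order via dlsym(RTLD_NEXT, ...), so any descriptor the table does not own still reaches libc. Below is a minimal sketch of that interposition for close() alone; the build line and LD_PRELOAD usage are assumptions for illustration, not something the source states (cc -shared -fPIC shim.c -o shim.so -ldl, then LD_PRELOAD=./shim.so ...).

#define _GNU_SOURCE
#include <assert.h>
#include <dlfcn.h>
#include <unistd.h>

static int (*real_close)(int);

/* runs at load time, like stub_init() above */
static void __attribute__((constructor))
shim_init(void)
{
	real_close = dlsym(RTLD_NEXT, "close");
	assert(real_close != NULL);
}

int
close(int fd)
{
	/* a real shim would first test fd against its own descriptor
	 * range (cf. sd_to_sock()) and handle owned descriptors itself */
	return real_close(fd);
}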
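
The descriptor scheme in tldk_sock.h keeps TLDK sockets and kernel file descriptors in disjoint ranges: SOCK_TO_SD() numbers streams upward from stbl.nosd (the RLIMIT_NOFILE hard limit read in tldk_stbl_init()), and sd_to_sock() inverts the mapping with a single unsigned comparison that rejects both kernel fds (the subtraction wraps to a huge value) and out-of-range TLDK descriptors. A self-contained sketch with illustrative numbers in place of the real table:

#include <stdint.h>
#include <stdio.h>

struct sock { int dummy; };

static struct sock sd_tbl[4];	/* stands in for stbl.sd */
static int32_t nosd = 1024;	/* stands in for stbl.nosd */
static uint32_t snum = 4;	/* stands in for stbl.snum */

static struct sock *
sd_to_sock(int32_t sd)
{
	/* unsigned subtraction also rejects sd < nosd in one compare */
	uint32_t n = sd - nosd;

	if (n >= snum)
		return NULL;
	return sd_tbl + n;
}

int
main(void)
{
	printf("fd 5    -> %p (kernel fd, falls through)\n",
		(void *)sd_to_sock(5));
	printf("sd 1026 -> %p (TLDK sock #2)\n", (void *)sd_to_sock(1026));
	printf("sd 2048 -> %p (out of range)\n", (void *)sd_to_sock(2048));
	return 0;
}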