aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJianfeng Tan <henry.tjf@antfin.com>2019-11-18 06:59:50 +0000
committerJianfeng Tan <henry.tjf@antfin.com>2020-03-05 01:31:33 +0800
commit78c896b3b3127515478090c19447e27dc406427e (patch)
treed6d67d4683e9ca0409f9984a834547a572fb5310
parente4380f4866091fd92a7a57667dd938a99144f9cd (diff)
Signed-off-by: Jianfeng Tan <henry.tjf@antfin.com> Signed-off-by: Jielong Zhou <jielong.zjl@antfin.com> Signed-off-by: Jian Zhang <wuzai.zj@antfin.com> Signed-off-by: Chen Zhao <winters.zc@antfin.com> Change-Id: I55c39de4c6cd30f991f35631eb507f770230f08e
-rw-r--r--.gitreview1
-rw-r--r--Makefile39
-rw-r--r--README200
-rw-r--r--afl/lower_constructor_priority.diff16
-rwxr-xr-xafl/run.sh1
-rw-r--r--afl/seeds/seed.txtbin0 -> 90 bytes
-rw-r--r--angora/dpdk_abilist.txt1756
-rw-r--r--angora/run.sh1
-rw-r--r--angora/seeds/seed.txt1
-rw-r--r--dpdk/Makefile101
-rw-r--r--dpdk/dpdk-v18.11_patches/0001-eal-don-t-start-the-interrupt-mp-thread.patch35
-rw-r--r--dpdk/dpdk-v18.11_patches/0002-eal-prioritize-constructor.patch25
-rw-r--r--dpdk/dpdk-v18.11_patches/0003-mbuf-add-single-linked-list.patch33
-rw-r--r--dpdk/dpdk-v18.11_patches/0004-net-virtio-user-add-rss-update-for-virtio-user.patch43
-rw-r--r--dpdk/dpdk-v18.11_patches/0005-net-virtio-user-support-raw-socket-as-backend.patch645
-rw-r--r--dpdk/dpdk-v18.11_patches/0006-mempool-add-dynamic-mempool-support.patch247
-rw-r--r--dpdk/dpdk-v18.11_patches/0007-mbuf-add-dynamic-mbuf-mempool-support.patch305
-rw-r--r--dpdk/dpdk-v18.11_patches/0008-mempool-prioritize-constructor.patch30
-rw-r--r--dpdk/dpdk-v18.11_patches/0009-net-virtio-fill-desc-limit.patch42
-rw-r--r--examples/Makefile2
-rw-r--r--examples/l4fwd/main.c1
-rw-r--r--examples/l4fwd/port.h11
-rw-r--r--lib/Makefile1
-rw-r--r--lib/libtle_glue/Makefile62
-rw-r--r--lib/libtle_glue/arp.c935
-rw-r--r--lib/libtle_glue/be.c256
-rw-r--r--lib/libtle_glue/config.h71
-rw-r--r--lib/libtle_glue/ctx.c535
-rw-r--r--lib/libtle_glue/ctx.h147
-rw-r--r--lib/libtle_glue/epoll.c577
-rw-r--r--lib/libtle_glue/fd.c122
-rw-r--r--lib/libtle_glue/fd.h113
-rw-r--r--lib/libtle_glue/gateway.h96
-rw-r--r--lib/libtle_glue/icmp.c297
-rw-r--r--lib/libtle_glue/init.c155
-rw-r--r--lib/libtle_glue/internal.h152
-rw-r--r--lib/libtle_glue/log.h77
-rw-r--r--lib/libtle_glue/ndp.h33
-rw-r--r--lib/libtle_glue/packetdrill.c544
-rw-r--r--lib/libtle_glue/packetdrill.h111
-rw-r--r--lib/libtle_glue/poll.c138
-rw-r--r--lib/libtle_glue/port.c246
-rw-r--r--lib/libtle_glue/rxcb.c834
-rw-r--r--lib/libtle_glue/rxtx.c573
-rw-r--r--lib/libtle_glue/select.c201
-rw-r--r--lib/libtle_glue/sock.h154
-rw-r--r--lib/libtle_glue/socket.c720
-rw-r--r--lib/libtle_glue/sym.c129
-rw-r--r--lib/libtle_glue/sym.h118
-rw-r--r--lib/libtle_glue/tcp.c558
-rw-r--r--lib/libtle_glue/tle_glue.h114
-rw-r--r--lib/libtle_glue/udp.c419
-rw-r--r--lib/libtle_glue/util.c60
-rw-r--r--lib/libtle_glue/util.h377
-rw-r--r--lib/libtle_glue/zerocopy.h59
-rw-r--r--lib/libtle_l4p/Makefile1
-rw-r--r--lib/libtle_l4p/ctx.c349
-rw-r--r--lib/libtle_l4p/ctx.h38
-rw-r--r--lib/libtle_l4p/misc.h66
-rw-r--r--lib/libtle_l4p/net_misc.h21
-rw-r--r--lib/libtle_l4p/port_statmap.h127
-rw-r--r--lib/libtle_l4p/stream.h55
-rw-r--r--lib/libtle_l4p/stream_table.c65
-rw-r--r--lib/libtle_l4p/stream_table.h490
-rw-r--r--lib/libtle_l4p/syncookie.h4
-rw-r--r--lib/libtle_l4p/tcp_ctl.h68
-rw-r--r--lib/libtle_l4p/tcp_misc.h34
-rw-r--r--lib/libtle_l4p/tcp_ofo.c39
-rw-r--r--lib/libtle_l4p/tcp_ofo.h14
-rw-r--r--lib/libtle_l4p/tcp_rxq.h4
-rw-r--r--lib/libtle_l4p/tcp_rxtx.c1445
-rw-r--r--lib/libtle_l4p/tcp_rxtx.h252
-rw-r--r--lib/libtle_l4p/tcp_stream.c395
-rw-r--r--lib/libtle_l4p/tcp_stream.h37
-rw-r--r--lib/libtle_l4p/tcp_timer.h40
-rw-r--r--lib/libtle_l4p/tcp_tx_seg.h12
-rw-r--r--lib/libtle_l4p/tcp_txq.h29
-rw-r--r--lib/libtle_l4p/tle_ctx.h41
-rw-r--r--lib/libtle_l4p/tle_event.h2
-rw-r--r--lib/libtle_l4p/tle_stats.h101
-rw-r--r--lib/libtle_l4p/tle_tcp.h60
-rw-r--r--lib/libtle_l4p/tle_udp.h49
-rw-r--r--lib/libtle_l4p/udp_rxtx.c186
-rw-r--r--lib/libtle_l4p/udp_stream.c347
-rw-r--r--lib/libtle_l4p/udp_stream.h9
-rw-r--r--lib/libtle_timer/timer.c43
-rw-r--r--mk/tle.app.mk4
-rw-r--r--mk/tle.lib.mk4
-rw-r--r--test/Makefile4
-rw-r--r--test/gtest/Makefile1
-rw-r--r--test/gtest/test_tle_ctx.cpp1
-rw-r--r--test/gtest/test_tle_tcp_stream.cpp4
-rw-r--r--test/gtest/test_tle_tcp_stream.h4
-rw-r--r--test/gtest/test_tle_udp_destroy.cpp1
-rw-r--r--test/gtest/test_tle_udp_stream_gen.cpp53
-rw-r--r--test/gtest/test_tle_udp_stream_gen.h2
-rw-r--r--test/packetdrill/COPYING339
-rw-r--r--test/packetdrill/Makefile2
-rw-r--r--test/packetdrill/Makefile.FreeBSD2
-rw-r--r--test/packetdrill/Makefile.Linux2
-rw-r--r--test/packetdrill/Makefile.NetBSD2
-rw-r--r--test/packetdrill/Makefile.OpenBSD2
-rw-r--r--test/packetdrill/Makefile.common63
-rw-r--r--test/packetdrill/README58
-rw-r--r--test/packetdrill/assert.h10
-rw-r--r--test/packetdrill/capability.h102
-rw-r--r--test/packetdrill/checksum.c239
-rw-r--r--test/packetdrill/checksum.h54
-rw-r--r--test/packetdrill/checksum_test.c140
-rw-r--r--test/packetdrill/code.c777
-rw-r--r--test/packetdrill/code.h122
-rw-r--r--test/packetdrill/config.c605
-rw-r--r--test/packetdrill/config.h204
-rwxr-xr-xtest/packetdrill/configure3
-rw-r--r--test/packetdrill/contrib/packetdrill.el45
-rw-r--r--test/packetdrill/contrib/packetdrill.vim125
-rw-r--r--test/packetdrill/epoll.c55
-rw-r--r--test/packetdrill/epoll.h62
-rw-r--r--test/packetdrill/ethernet.h75
-rw-r--r--test/packetdrill/fd_state.h64
-rw-r--r--test/packetdrill/file.c55
-rw-r--r--test/packetdrill/file.h52
-rw-r--r--test/packetdrill/fmemopen.c81
-rw-r--r--test/packetdrill/fmemopen.h37
-rw-r--r--test/packetdrill/gre.h102
-rw-r--r--test/packetdrill/gre_packet.c56
-rw-r--r--test/packetdrill/gre_packet.h45
-rw-r--r--test/packetdrill/hash.c430
-rw-r--r--test/packetdrill/hash.h43
-rw-r--r--test/packetdrill/hash_map.c162
-rw-r--r--test/packetdrill/hash_map.h56
-rw-r--r--test/packetdrill/header.h93
-rw-r--r--test/packetdrill/icmp.h97
-rw-r--r--test/packetdrill/icmp_packet.c406
-rw-r--r--test/packetdrill/icmp_packet.h55
-rw-r--r--test/packetdrill/icmpv6.h81
-rw-r--r--test/packetdrill/ip.h108
-rw-r--r--test/packetdrill/ip_address.c379
-rw-r--r--test/packetdrill/ip_address.h131
-rw-r--r--test/packetdrill/ip_packet.c221
-rw-r--r--test/packetdrill/ip_packet.h80
-rw-r--r--test/packetdrill/ip_prefix.c148
-rw-r--r--test/packetdrill/ip_prefix.h69
-rw-r--r--test/packetdrill/ipv6.h92
-rw-r--r--test/packetdrill/lexer.l280
-rw-r--r--test/packetdrill/link_layer.c104
-rw-r--r--test/packetdrill/link_layer.h38
-rw-r--r--test/packetdrill/logging.c51
-rw-r--r--test/packetdrill/logging.h46
-rw-r--r--test/packetdrill/mpls.h113
-rw-r--r--test/packetdrill/mpls_packet.c77
-rw-r--r--test/packetdrill/mpls_packet.h57
-rw-r--r--test/packetdrill/net_utils.c172
-rw-r--r--test/packetdrill/net_utils.h56
-rw-r--r--test/packetdrill/netdev.c502
-rw-r--r--test/packetdrill/netdev.h99
-rw-r--r--test/packetdrill/open_memstream.c142
-rw-r--r--test/packetdrill/open_memstream.h37
-rw-r--r--test/packetdrill/packet.c327
-rw-r--r--test/packetdrill/packet.h425
-rw-r--r--test/packetdrill/packet_checksum.c116
-rw-r--r--test/packetdrill/packet_checksum.h33
-rw-r--r--test/packetdrill/packet_parser.c625
-rw-r--r--test/packetdrill/packet_parser.h53
-rw-r--r--test/packetdrill/packet_parser_test.c484
-rw-r--r--test/packetdrill/packet_socket.h69
-rw-r--r--test/packetdrill/packet_socket_linux.c280
-rw-r--r--test/packetdrill/packet_socket_pcap.c290
-rw-r--r--test/packetdrill/packet_to_string.c303
-rw-r--r--test/packetdrill/packet_to_string.h44
-rw-r--r--test/packetdrill/packet_to_string_test.c301
-rw-r--r--test/packetdrill/packetdrill.c113
-rw-r--r--test/packetdrill/packetdrill.h108
-rw-r--r--test/packetdrill/parse.h62
-rw-r--r--test/packetdrill/parser.y1739
-rw-r--r--test/packetdrill/pipe.c55
-rw-r--r--test/packetdrill/pipe.h54
-rw-r--r--test/packetdrill/platforms.h121
-rw-r--r--test/packetdrill/run.c695
-rw-r--r--test/packetdrill/run.h197
-rw-r--r--test/packetdrill/run_command.c55
-rw-r--r--test/packetdrill/run_command.h38
-rw-r--r--test/packetdrill/run_packet.c1934
-rw-r--r--test/packetdrill/run_packet.h61
-rw-r--r--test/packetdrill/run_system_call.c3561
-rw-r--r--test/packetdrill/run_system_call.h104
-rw-r--r--test/packetdrill/script.c745
-rw-r--r--test/packetdrill/script.h308
-rw-r--r--test/packetdrill/sctp.h40
-rw-r--r--test/packetdrill/so_testing.c169
-rw-r--r--test/packetdrill/so_testing.h55
-rw-r--r--test/packetdrill/socket.c80
-rw-r--r--test/packetdrill/socket.h311
-rw-r--r--test/packetdrill/symbols.h42
-rw-r--r--test/packetdrill/symbols_freebsd.c310
-rw-r--r--test/packetdrill/symbols_linux.c502
-rw-r--r--test/packetdrill/symbols_netbsd.c320
-rw-r--r--test/packetdrill/symbols_openbsd.c281
-rw-r--r--test/packetdrill/system.c52
-rw-r--r--test/packetdrill/system.h35
-rw-r--r--test/packetdrill/tcp.h339
-rw-r--r--test/packetdrill/tcp_options.c70
-rw-r--r--test/packetdrill/tcp_options.h129
-rw-r--r--test/packetdrill/tcp_options_iterator.c169
-rw-r--r--test/packetdrill/tcp_options_iterator.h53
-rw-r--r--test/packetdrill/tcp_options_to_string.c167
-rw-r--r--test/packetdrill/tcp_options_to_string.h40
-rw-r--r--test/packetdrill/tcp_packet.c166
-rw-r--r--test/packetdrill/tcp_packet.h51
-rw-r--r--test/packetdrill/tests/bsd/fast_retransmit/fr-4pkt-sack-bsd.pkt38
-rw-r--r--test/packetdrill/tests/linux/README7
-rw-r--r--test/packetdrill/tests/linux/blocking/blocking-accept.pkt15
-rw-r--r--test/packetdrill/tests/linux/blocking/blocking-read.pkt25
-rw-r--r--test/packetdrill/tests/linux/close/close-read-data-fin.pkt38
-rw-r--r--test/packetdrill/tests/linux/close/close-so-linger-onoff-1-linger-0-rst.pkt28
-rw-r--r--test/packetdrill/tests/linux/close/close-unread-data-rst.pkt38
-rw-r--r--test/packetdrill/tests/linux/connect/http-get-nonblocking-ts.pkt34
-rw-r--r--test/packetdrill/tests/linux/early_retransmit/er-delayed-2pkt-sack.pkt27
-rw-r--r--test/packetdrill/tests/linux/early_retransmit/er-delayed-3pkt-sack.pkt28
-rw-r--r--test/packetdrill/tests/linux/early_retransmit/er-delayed-filled-3pkt-sack.pkt31
-rw-r--r--test/packetdrill/tests/linux/early_retransmit/er-delayed-get-ack-3pkt-sack.pkt35
-rw-r--r--test/packetdrill/tests/linux/early_retransmit/er-quick-2pkt-sack.pkt27
-rw-r--r--test/packetdrill/tests/linux/early_retransmit/er-quick-3pkt-sack.pkt28
-rw-r--r--test/packetdrill/tests/linux/fast_recovery/prr-ss-ack-below-snd_una-reno.pkt51
-rw-r--r--test/packetdrill/tests/linux/fast_retransmit/fr-4pkt-sack-linux.pkt35
-rw-r--r--test/packetdrill/tests/linux/icmp/icmp-all-types.pkt71
-rw-r--r--test/packetdrill/tests/linux/inet_diag/inet-diag-ipv4-mapped-ipv6.pkt29
-rw-r--r--test/packetdrill/tests/linux/inet_diag/inet-diag-ipv4.pkt28
-rw-r--r--test/packetdrill/tests/linux/inet_diag/inet-diag-ipv6.pkt29
-rw-r--r--test/packetdrill/tests/linux/init_rto/init_rto_passive_open.pkt17
-rwxr-xr-xtest/packetdrill/tests/linux/initial_window/iw10-base-case.pkt21
-rwxr-xr-xtest/packetdrill/tests/linux/initial_window/iw10-short-response.pkt21
-rw-r--r--test/packetdrill/tests/linux/ioctl/ioctl-siocinq-fin.pkt30
-rw-r--r--test/packetdrill/tests/linux/listen/listen-incoming-ack.pkt20
-rw-r--r--test/packetdrill/tests/linux/listen/listen-incoming-no-tcp-flags.pkt21
-rw-r--r--test/packetdrill/tests/linux/listen/listen-incoming-rst.pkt22
-rw-r--r--test/packetdrill/tests/linux/listen/listen-incoming-syn-ack.pkt20
-rw-r--r--test/packetdrill/tests/linux/listen/listen-incoming-syn-rst.pkt22
-rw-r--r--test/packetdrill/tests/linux/listen/listen-unbound.pkt5
-rw-r--r--test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-client-ts.pkt17
-rw-r--r--test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-client.pkt14
-rw-r--r--test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-server-advmss-ipv4.pkt29
-rw-r--r--test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-server-advmss-ts-ipv4.pkt30
-rw-r--r--test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-server-ts.pkt20
-rw-r--r--test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-server.pkt17
-rw-r--r--test/packetdrill/tests/linux/mss/mss-setsockopt-tcp_maxseg-client.pkt24
-rw-r--r--test/packetdrill/tests/linux/mss/mss-setsockopt-tcp_maxseg-server.pkt27
-rw-r--r--test/packetdrill/tests/linux/pmtu_discovery/pmtud-10pkt-1460-to-1160.pkt54
-rw-r--r--test/packetdrill/tests/linux/pmtu_discovery/pmtud-1pkt-1460-to-1160.pkt36
-rw-r--r--test/packetdrill/tests/linux/receiver_rtt/rcv-rtt-with-timestamps-new.pkt57
-rw-r--r--test/packetdrill/tests/linux/receiver_rtt/rcv-rtt-without-timestamps-new.pkt62
-rwxr-xr-xtest/packetdrill/tests/linux/run_tests.sh6
-rw-r--r--test/packetdrill/tests/linux/sack/sack-shift-sacked-1-2-3-fack.pkt47
-rw-r--r--test/packetdrill/tests/linux/sack/sack-shift-sacked-1-2:6-fack.pkt39
-rw-r--r--test/packetdrill/tests/linux/shutdown/shutdown-rd-close.pkt29
-rw-r--r--test/packetdrill/tests/linux/shutdown/shutdown-rd-wr-close.pkt45
-rw-r--r--test/packetdrill/tests/linux/shutdown/shutdown-rdwr-close.pkt26
-rw-r--r--test/packetdrill/tests/linux/shutdown/shutdown-wr-close.pkt29
-rw-r--r--test/packetdrill/tests/linux/undo/undo-fr-ack-then-dsack-on-ack-below-snd_una.pkt55
-rw-r--r--test/packetdrill/tests/linux/undo/undo-fr-acks-dropped-then-dsack.pkt44
-rw-r--r--test/packetdrill/tests/tldk/delay_ack/delay-ack-tldk.pkt26
-rw-r--r--test/packetdrill/tests/tldk/fast_retransmit/fr-4pkt-tldk.pkt35
-rw-r--r--test/packetdrill/tests/tldk/keep_alive/keep-alive-after-accept-tldk.pkt50
-rw-r--r--test/packetdrill/tests/tldk/keep_alive/keep-alive-before-connect-tldk.pkt37
-rw-r--r--test/packetdrill/tests/tldk/keep_alive/keep-alive-enable-disable-tldk.pkt26
-rw-r--r--test/packetdrill/tests/tldk/out_of_order/ofo-simple-3pkt-tldk.pkt27
-rw-r--r--test/packetdrill/tests/tldk/tso/tso-segment-split.pkt63
-rw-r--r--test/packetdrill/tun.h117
-rw-r--r--test/packetdrill/types.c44
-rw-r--r--test/packetdrill/types.h207
-rw-r--r--test/packetdrill/uapi_linux.h296
-rw-r--r--test/packetdrill/udp.h44
-rw-r--r--test/packetdrill/udp_packet.c91
-rw-r--r--test/packetdrill/udp_packet.h44
-rw-r--r--test/packetdrill/unaligned.h53
-rw-r--r--test/packetdrill/wire_client.c302
-rw-r--r--test/packetdrill/wire_client.h69
-rw-r--r--test/packetdrill/wire_client_netdev.c167
-rw-r--r--test/packetdrill/wire_client_netdev.h37
-rw-r--r--test/packetdrill/wire_conn.c254
-rw-r--r--test/packetdrill/wire_conn.h88
-rw-r--r--test/packetdrill/wire_protocol.c49
-rw-r--r--test/packetdrill/wire_protocol.h66
-rw-r--r--test/packetdrill/wire_server.c537
-rw-r--r--test/packetdrill/wire_server.h36
-rw-r--r--test/packetdrill/wire_server_netdev.c204
-rw-r--r--test/packetdrill/wire_server_netdev.h47
-rw-r--r--test/packetdrill/wrap.c125
-rw-r--r--test/packetdrill/wrap.h32
289 files changed, 45114 insertions, 1313 deletions
diff --git a/.gitreview b/.gitreview
index 3559d4a..418bfa7 100644
--- a/.gitreview
+++ b/.gitreview
@@ -2,3 +2,4 @@
host=gerrit.fd.io
port=29418
project=tldk
+defaultbranch=dev-next-socket
diff --git a/Makefile b/Makefile
index 474ada6..10c276d 100644
--- a/Makefile
+++ b/Makefile
@@ -22,6 +22,7 @@ endif
RTE_TARGET ?= x86_64-native-linuxapp-gcc
+DIRS-y += dpdk
DIRS-y += lib
DIRS-y += examples
DIRS-y += test
@@ -32,11 +33,18 @@ MAKEFLAGS += --no-print-directory
O ?= $(TLDK_ROOT)/${RTE_TARGET}
BASE_OUTPUT ?= $(abspath $(O))
+DPDK_LIBS_PATH := $(TLDK_ROOT)/dpdk/install/lib
+TLDK_LIBS_PATH := $(TLDK_ROOT)/$(RTE_TARGET)/lib
+LIBS :=
+
.PHONY: all
all: $(DIRS-y)
.PHONY: clean
-clean: $(DIRS-y)
+clean:
+ @make clean -C test/packetdrill
+ @rm -rf $(RTE_TARGET)
+ @rm -rf libtldk.so libtldk.a
.PHONY: $(DIRS-y)
$(DIRS-y): $(RTE_SDK)/mk/rte.vars.mk
@@ -48,8 +56,37 @@ $(DIRS-y): $(RTE_SDK)/mk/rte.vars.mk
CUR_SUBDIR=$(CUR_SUBDIR)/$(@) \
S=$(CURDIR)/$(@) \
RTE_TARGET=$(RTE_TARGET) \
+ EXTRA_CFLAGS="-fPIC" \
$(filter-out $(DIRS-y),$(MAKECMDGOALS))
+test: libtldk.a libtldk.so
+
+libtldk.so: lib
+ $(eval LIBS = $(wildcard $(DPDK_LIBS_PATH)/librte*.a $(TLDK_LIBS_PATH)/*.a))
+ @gcc -shared -o libtldk.so -L$(DPDK_LIBS_PATH) -L$(TLDK_LIBS_PATH) \
+ -Wl,--whole-archive $(LIBS) -Wl,--no-whole-archive \
+ -lpthread -ldl -lnuma
+
+define repack
+@echo -- repack $1 ---
+@rm -rf tmpxyz; rm -f $1; mkdir tmpxyz; cd tmpxyz; \
+ for f in $(LIBS) ; do \
+ fn=$$(basename $$f) ; \
+ echo $$fn ; \
+ mkdir $$fn"_obj" ; \
+ cd $$fn"_obj" ; \
+ ar x $$f ; \
+ cd .. ; \
+ done; \
+ar cru ../$1 $$(find */*.o | paste -sd " " -); cd ..; rm -rf tmpxyz
+endef
+
+libtldk.a: lib
+ $(eval LIBS = $(wildcard $(DPDK_LIBS_PATH)/librte*.a))
+ $(call repack,libdpdk.a)
+ $(eval LIBS = $(wildcard $(DPDK_LIBS_PATH)/librte*.a $(TLDK_LIBS_PATH)/*.a))
+ $(call repack,libtldk.a)
+
$(RTE_SDK)/mk/rte.vars.mk:
ifeq ($(RTE_SDK),$(LOCAL_RTE_SDK))
@make RTE_TARGET=$(RTE_TARGET) config all -C $(TLDK_ROOT)/dpdk/
diff --git a/README b/README
index 2ca150b..792bdef 100644
--- a/README
+++ b/README
@@ -1,7 +1,5 @@
1. OVERVIEW
- TLDK project scope is as follows:
-
1) To implement a set of libraries for L4 protocol processing (UDP, TCP etc.)
for both IPv4 and IPv6.
@@ -16,8 +14,7 @@
code for setup, manage and perform actual IO over underlying devices are
all out of scope of these libraries.
- The only information these libraries need to know about the
- underlying devices:
+ The only information these libraries need about the underlying devices:
- supported HW offloads
- MTU and L3/L2 addresses
That allows the libraries to fill L2/L3 headers and mbuf metadata
@@ -36,12 +33,22 @@
The library uses siphash logic from the below source
https://github.com/veorq/SipHash
+2. APIs
+
+ TLDK provides three series of APIs:
+ - TLDK native APIs, provided by libtle_l4p.
+ - Posix APIs, provided by libtle_glue with PRELOAD compile macro.
+ - Posix APIs with changed symbol names, provided by libtle_glue without PRELOAD macro.
+
+
+3. INSTALLATION GUIDE
-2. INSTALLATION GUIDE
+ - Original guide
+ ----------------
1) Obtain latest supported DPDK version and build it.
(refer to http://dpdk.org for information how to download and build it).
- Currently supported(tested) DPDK versions: 18.11 LTS.
+ Currently supported(tested) DPDK versions: 16.11 LTS, 17.11 LTS, 18.02.
2) Make sure that RTE_SDK and RTE_TARGET DPDK related environment variables
are setup correctly.
3) Go to the TLDK root directory and type: 'make all'.
@@ -58,6 +65,29 @@
make all
./x86_64-native-linuxapp-gcc/app/l4fwd ...
+
+ - For preload use
+ -----------------
+
+ Debug:
+
+ $ make DPDK_DEBUG=y EXTRA_CFLAGS="-g -O0 -fPIC -DPRELOAD" all
+
+ Release:
+
+ $ make EXTRA_CFLAGS="-g -fPIC -DPRELOAD" all
+
+ - For TLDK API use
+ ------------------
+
+ Debug:
+
+ $ make DPDK_DEBUG=y EXTRA_CFLAGS="-g -O0 -fPIC" all
+
+ Release:
+
+ $ make EXTRA_CFLAGS="-g -O3 -fPIC" all
+
3. CONTENTS
$(TLDK_ROOT)
@@ -74,6 +104,8 @@
| +--libtle_l4p - implementation of the TCP/UDP packet processing
| |
| +--libtle_timer - implementation of the timer library
+ | |
+ | +--libtle_glue - socket glue layer with arp, icmp, epoll, etc
|
+----examples
| |
@@ -88,3 +120,159 @@
| | (googletest)
| |
| +--timer - UT for libtle_timer (standalone app)
+ | |
+ | +--packetdrill - UT for stack (standalone app)
+
+
+5. Features
+
+ Done:
+ - posix interface
+ - loopback device
+ - regression test
+ - multi-thread
+ - lightweight mem
+ - tcp_info (paritial)
+ - fd management
+ - arp request/reply
+ - icmp reply
+ - interrupt mode
+ - blocking recv/send
+ - TSO
+ - UFO
+
+ TODO:
+ - fuzzing
+ - SACK
+ - RACK
+ - zerocopy APIs
+ - batching APIs
+ - multi-process
+ - numa awareness
+ - context recycle on thread exit
+
+5. Thread model
+
+ - Multi-process is still not fully supported.
+
+ - Symmetric multi-thread
+
+ (app thread) (app thread) (app thread)
+ \ \ \
+ / / /
+ \ \ \
+ --------------------------------------------------------
+ | FD management, Socket APIs (FE) |
+ --------------------------------------------------------
+
+ ----------- ----------- -----------
+ | | | | | |
+ | ctx | | ctx | | ctx |
+ | | | | | |
+ ----------- ----------- -----------
+ \__ | __/
+ \__ | __/
+ \__ | __/
+ \__ | __/
+ -------------------------
+ | (RSS) NIC (FDIR) |
+ -------------------------
+
+ - Lookaside multi-thread
+
+ (app thread) (app thread) (io thread)
+ \ \ \
+ / / /
+ \ \ \
+ ------------------------------------------------------
+ | FD management, Socket APIs (FE) |
+ ------------------------------------------------------
+ /
+ \
+ /
+ ------------------------------------------------------
+ | |
+ | ctx |
+ | |
+ ------------------------------------------------------
+ |
+ |
+ -------------------------
+ | NIC |
+ -------------------------
+
+6. How to run
+
+ We have two setups which need their own preparation.
+
+ - virtio-user: test with virtio-user + vhost-kernel;
+ - physical NIC: test with physical NIC bound to vfio.
+
+ If you are using physical NIC:
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ a. Set Linux boot options (Only needed if you will use physical NIC)
+ a1. Add below configuration into GRUB_CMDLINE_LINUX in /etc/default/grub
+ "intel_iommu=on iommu=pt"
+
+ a2. Update grub
+ $ sudo grub2-mkconfig -o /boot/grub2/grub.cfg
+
+ If you want to use 1GB hugepages, you can also add below content in the
+ boot cmdline:
+ "default_hugepagesz=1G hugepagesz=1G hugepages=2"
+
+ b. Adjust RLIMIT_MEMLOCK (Only needed if you will use physical NIC)
+ Add below two lines into /etc/security/limits.conf
+ "* soft memlock 4194304
+ * hard memlock 4194304"
+
+ c. Reboot system
+
+ d. Bind NIC to vfio-pci
+
+ $ sudo modprobe vfio-pci
+ $ sudo ./usertools/dpdk-devbind.py -b vfio-pci 0000:01:00.1
+ $ sudo chmod 666 /dev/vfio/16 (16 is just an example)
+
+ If you are using virtio-user:
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ a. Prepare vhost-kernel
+
+ $ sudo modprobe vhost-net
+ (if you don't have those modules, have to compile by yourself)
+ $ sudo chmod 666 /dev/vhost-net
+ $ sudo tunctl -u <your username>
+
+ b. Prepare the vNIC
+
+ $ export DPDK_VNIC="--vdev=virtio_user0,path=/dev/vhost-net,queue_size=1024,iface=tap0"
+
+ For both cases, we need to:
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ $ sudo chmod 777 /dev/hugepages
+ $ export DPDK_IP=1.1.1.1
+
+ Note: for specific test example, you can refer to the example commit log.
+
+7. How to run packetdrill tests:
+
+ Compile it in LOOK_ASIDE_BACKEND mode:
+
+ $ make PACKETDRILL=y EXTRA_CFLAGS="-g -O0 -fPIC -march=native -DLOOK_ASIDE_BACKEND " all
+
+ To run it:
+
+ $ test/packetdrill/packetdrill --local_ip=192.168.0.2 \
+ --remote_ip=192.0.2.1 --so_filename=`pwd`/libtldk.so \
+ test/packetdrill/tests/tldk/fast_retransmit/fr-4pkt-tldk.pkt
+
+8. Tested Examples
+
+ - examples/client
+ - examples/server
+ - wget (epoll)
+ - curl (poll)
+ - haproxy (multi-thread mode)
diff --git a/afl/lower_constructor_priority.diff b/afl/lower_constructor_priority.diff
new file mode 100644
index 0000000..b1eba07
--- /dev/null
+++ b/afl/lower_constructor_priority.diff
@@ -0,0 +1,16 @@
+diff --git a/llvm_mode/afl-llvm-rt.o.c b/llvm_mode/afl-llvm-rt.o.c
+index debde20..69e2e4c 100644
+--- a/llvm_mode/afl-llvm-rt.o.c
++++ b/llvm_mode/afl-llvm-rt.o.c
+@@ -39,9 +39,9 @@
+ the LLVM-generated runtime initialization pass, not before. */
+
+ #ifdef USE_TRACE_PC
+-# define CONST_PRIO 5
++# define CONST_PRIO 2005
+ #else
+-# define CONST_PRIO 0
++# define CONST_PRIO 2000
+ #endif /* ^USE_TRACE_PC */
+
+ #include <sys/mman.h>
diff --git a/afl/run.sh b/afl/run.sh
new file mode 100755
index 0000000..23213f6
--- /dev/null
+++ b/afl/run.sh
@@ -0,0 +1 @@
+`pwd`/AFLplusplus/afl-fuzz -m 4096 -i seeds -o output ../x86_64-native-linuxapp-gcc/app/tcp_lo 127.0.0.1 1234 @@
diff --git a/afl/seeds/seed.txt b/afl/seeds/seed.txt
new file mode 100644
index 0000000..1ed4d76
--- /dev/null
+++ b/afl/seeds/seed.txt
Binary files differ
diff --git a/angora/dpdk_abilist.txt b/angora/dpdk_abilist.txt
new file mode 100644
index 0000000..f02f7c7
--- /dev/null
+++ b/angora/dpdk_abilist.txt
@@ -0,0 +1,1756 @@
+fun:pci_find_max_end_va=uninstrumented
+fun:pci_parse_one_sysfs_resource=uninstrumented
+fun:pci_update_device=uninstrumented
+fun:rte_pci_get_iommu_class=uninstrumented
+fun:rte_pci_ioport_map=uninstrumented
+fun:rte_pci_ioport_read=uninstrumented
+fun:rte_pci_ioport_unmap=uninstrumented
+fun:rte_pci_ioport_write=uninstrumented
+fun:rte_pci_map_device=uninstrumented
+fun:rte_pci_read_config=uninstrumented
+fun:rte_pci_scan=uninstrumented
+fun:rte_pci_unmap_device=uninstrumented
+fun:rte_pci_write_config=uninstrumented
+fun:pci_name_set=uninstrumented
+fun:rte_pci_add_device=uninstrumented
+fun:rte_pci_dump=uninstrumented
+fun:rte_pci_get_sysfs_path=uninstrumented
+fun:rte_pci_insert_device=uninstrumented
+fun:rte_pci_match=uninstrumented
+fun:rte_pci_probe=uninstrumented
+fun:rte_pci_register=uninstrumented
+fun:rte_pci_unregister=uninstrumented
+fun:pci_uio_map_resource=uninstrumented
+fun:pci_uio_remap_resource=uninstrumented
+fun:pci_uio_unmap_resource=uninstrumented
+fun:rte_pci_dev_iterate=uninstrumented
+fun:pci_uio_alloc_resource=uninstrumented
+fun:pci_uio_free_resource=uninstrumented
+fun:pci_uio_ioport_map=uninstrumented
+fun:pci_uio_ioport_read=uninstrumented
+fun:pci_uio_ioport_unmap=uninstrumented
+fun:pci_uio_ioport_write=uninstrumented
+fun:pci_uio_map_resource_by_index=uninstrumented
+fun:pci_uio_read_config=uninstrumented
+fun:pci_uio_write_config=uninstrumented
+fun:pci_vfio_ioport_map=uninstrumented
+fun:pci_vfio_ioport_read=uninstrumented
+fun:pci_vfio_ioport_unmap=uninstrumented
+fun:pci_vfio_ioport_write=uninstrumented
+fun:pci_vfio_is_enabled=uninstrumented
+fun:pci_vfio_map_resource=uninstrumented
+fun:pci_vfio_read_config=uninstrumented
+fun:pci_vfio_unmap_resource=uninstrumented
+fun:pci_vfio_write_config=uninstrumented
+fun:rte_vdev_add_custom_scan=uninstrumented
+fun:rte_vdev_find_device=uninstrumented
+fun:rte_vdev_init=uninstrumented
+fun:rte_vdev_register=uninstrumented
+fun:rte_vdev_remove_custom_scan=uninstrumented
+fun:rte_vdev_uninit=uninstrumented
+fun:rte_vdev_unregister=uninstrumented
+fun:rte_vdev_dev_iterate=uninstrumented
+fun:cmdline_free=uninstrumented
+fun:cmdline_in=uninstrumented
+fun:cmdline_interact=uninstrumented
+fun:cmdline_new=uninstrumented
+fun:cmdline_poll=uninstrumented
+fun:cmdline_printf=uninstrumented
+fun:cmdline_quit=uninstrumented
+fun:cmdline_set_prompt=uninstrumented
+fun:cmdline_write_char=uninstrumented
+fun:cirbuf_add_buf_head=uninstrumented
+fun:cirbuf_add_buf_tail=uninstrumented
+fun:cirbuf_add_head=uninstrumented
+fun:cirbuf_add_head_safe=uninstrumented
+fun:cirbuf_add_tail=uninstrumented
+fun:cirbuf_add_tail_safe=uninstrumented
+fun:cirbuf_align_left=uninstrumented
+fun:cirbuf_align_right=uninstrumented
+fun:cirbuf_del_buf_head=uninstrumented
+fun:cirbuf_del_buf_tail=uninstrumented
+fun:cirbuf_del_head=uninstrumented
+fun:cirbuf_del_head_safe=uninstrumented
+fun:cirbuf_del_tail=uninstrumented
+fun:cirbuf_del_tail_safe=uninstrumented
+fun:cirbuf_get_buf_head=uninstrumented
+fun:cirbuf_get_buf_tail=uninstrumented
+fun:cirbuf_get_head=uninstrumented
+fun:cirbuf_get_tail=uninstrumented
+fun:cirbuf_init=uninstrumented
+fun:cmdline_complete=uninstrumented
+fun:cmdline_isendofcommand=uninstrumented
+fun:cmdline_isendoftoken=uninstrumented
+fun:cmdline_parse=uninstrumented
+fun:cmdline_get_help_etheraddr=uninstrumented
+fun:cmdline_parse_etheraddr=uninstrumented
+fun:cmdline_get_help_ipaddr=uninstrumented
+fun:cmdline_parse_ipaddr=uninstrumented
+fun:cmdline_get_help_num=uninstrumented
+fun:cmdline_parse_num=uninstrumented
+fun:cmdline_get_help_portlist=uninstrumented
+fun:cmdline_parse_portlist=uninstrumented
+fun:cmdline_complete_get_elt_string=uninstrumented
+fun:cmdline_complete_get_nb_string=uninstrumented
+fun:cmdline_get_help_string=uninstrumented
+fun:cmdline_parse_string=uninstrumented
+fun:rdline_add_history=uninstrumented
+fun:rdline_char_in=uninstrumented
+fun:rdline_clear_history=uninstrumented
+fun:rdline_get_buffer=uninstrumented
+fun:rdline_get_history_item=uninstrumented
+fun:rdline_init=uninstrumented
+fun:rdline_newline=uninstrumented
+fun:rdline_quit=uninstrumented
+fun:rdline_redisplay=uninstrumented
+fun:rdline_reset=uninstrumented
+fun:rdline_restart=uninstrumented
+fun:rdline_stop=uninstrumented
+fun:cmdline_file_new=uninstrumented
+fun:cmdline_stdin_exit=uninstrumented
+fun:cmdline_stdin_new=uninstrumented
+fun:vt100_init=uninstrumented
+fun:vt100_parser=uninstrumented
+fun:eal_create_runtime_dir=uninstrumented
+fun:eal_parse_sysfs_value=uninstrumented
+fun:eal_proc_type_detect=uninstrumented
+fun:rte_eal_check_module=uninstrumented
+fun:rte_eal_cleanup=uninstrumented
+fun:rte_eal_create_uio_dev=uninstrumented
+fun:rte_eal_get_configuration=uninstrumented
+fun:rte_eal_get_runtime_dir=uninstrumented
+fun:rte_eal_has_hugepages=uninstrumented
+fun:rte_eal_has_pci=uninstrumented
+fun:rte_eal_init=uninstrumented
+fun:rte_eal_iopl_init=uninstrumented
+fun:rte_eal_iova_mode=uninstrumented
+fun:rte_eal_lcore_role=uninstrumented
+fun:rte_eal_mbuf_user_pool_ops=uninstrumented
+fun:rte_eal_process_type=uninstrumented
+fun:rte_eal_vfio_intr_mode=uninstrumented
+fun:rte_set_application_usage_hook=uninstrumented
+fun:rte_eal_alarm_cancel=uninstrumented
+fun:rte_eal_alarm_init=uninstrumented
+fun:rte_eal_alarm_set=uninstrumented
+fun:rte_bus_dump=uninstrumented
+fun:rte_bus_find=uninstrumented
+fun:rte_bus_find_by_device=uninstrumented
+fun:rte_bus_find_by_device_name=uninstrumented
+fun:rte_bus_find_by_name=uninstrumented
+fun:rte_bus_get_iommu_class=uninstrumented
+fun:rte_bus_probe=uninstrumented
+fun:rte_bus_register=uninstrumented
+fun:rte_bus_scan=uninstrumented
+fun:rte_bus_sigbus_handler=uninstrumented
+fun:rte_bus_unregister=uninstrumented
+fun:rte_class_find=uninstrumented
+fun:rte_class_find_by_name=uninstrumented
+fun:rte_class_register=uninstrumented
+fun:rte_class_unregister=uninstrumented
+fun:rte_cpu_check_supported=uninstrumented
+fun:rte_cpu_is_supported=uninstrumented
+fun:local_dev_probe=uninstrumented
+fun:local_dev_remove=uninstrumented
+fun:rte_dev_event_callback_process=uninstrumented
+fun:rte_dev_event_callback_register=uninstrumented
+fun:rte_dev_event_callback_unregister=uninstrumented
+fun:rte_dev_is_probed=uninstrumented
+fun:rte_dev_iterator_init=uninstrumented
+fun:rte_dev_iterator_next=uninstrumented
+fun:rte_dev_probe=uninstrumented
+fun:rte_dev_remove=uninstrumented
+fun:rte_eal_hotplug_add=uninstrumented
+fun:rte_eal_hotplug_remove=uninstrumented
+fun:rte_devargs_add=uninstrumented
+fun:rte_devargs_dump=uninstrumented
+fun:rte_devargs_insert=uninstrumented
+fun:rte_devargs_layers_parse=uninstrumented
+fun:rte_devargs_next=uninstrumented
+fun:rte_devargs_parse=uninstrumented
+fun:rte_devargs_parsef=uninstrumented
+fun:rte_devargs_remove=uninstrumented
+fun:rte_devargs_type_count=uninstrumented
+fun:rte_strerror=uninstrumented
+fun:rte_fbarray_attach=uninstrumented
+fun:rte_fbarray_destroy=uninstrumented
+fun:rte_fbarray_detach=uninstrumented
+fun:rte_fbarray_dump_metadata=uninstrumented
+fun:rte_fbarray_find_contig_free=uninstrumented
+fun:rte_fbarray_find_contig_used=uninstrumented
+fun:rte_fbarray_find_idx=uninstrumented
+fun:rte_fbarray_find_next_free=uninstrumented
+fun:rte_fbarray_find_next_n_free=uninstrumented
+fun:rte_fbarray_find_next_n_used=uninstrumented
+fun:rte_fbarray_find_next_used=uninstrumented
+fun:rte_fbarray_find_prev_free=uninstrumented
+fun:rte_fbarray_find_prev_n_free=uninstrumented
+fun:rte_fbarray_find_prev_n_used=uninstrumented
+fun:rte_fbarray_find_prev_used=uninstrumented
+fun:rte_fbarray_find_rev_contig_free=uninstrumented
+fun:rte_fbarray_find_rev_contig_used=uninstrumented
+fun:rte_fbarray_get=uninstrumented
+fun:rte_fbarray_init=uninstrumented
+fun:rte_fbarray_is_used=uninstrumented
+fun:rte_fbarray_set_free=uninstrumented
+fun:rte_fbarray_set_used=uninstrumented
+fun:rte_hexdump=uninstrumented
+fun:rte_memdump=uninstrumented
+fun:rte_hypervisor_get_name=uninstrumented
+fun:rte_eal_get_lcore_state=uninstrumented
+fun:rte_eal_mp_remote_launch=uninstrumented
+fun:rte_eal_mp_wait_lcore=uninstrumented
+fun:rte_eal_wait_lcore=uninstrumented
+fun:rte_eal_cpu_init=uninstrumented
+fun:rte_socket_count=uninstrumented
+fun:rte_socket_id_by_idx=uninstrumented
+fun:eal_log_set_default=uninstrumented
+fun:rte_log=uninstrumented
+fun:rte_log_cur_msg_loglevel=uninstrumented
+fun:rte_log_cur_msg_logtype=uninstrumented
+fun:rte_log_dump=uninstrumented
+fun:rte_log_get_global_level=uninstrumented
+fun:rte_log_get_level=uninstrumented
+fun:rte_log_register=uninstrumented
+fun:rte_log_register_type_and_pick_level=uninstrumented
+fun:rte_log_save_pattern=uninstrumented
+fun:rte_log_save_regexp=uninstrumented
+fun:rte_log_set_global_level=uninstrumented
+fun:rte_log_set_level=uninstrumented
+fun:rte_log_set_level_pattern=uninstrumented
+fun:rte_log_set_level_regexp=uninstrumented
+fun:rte_openlog_stream=uninstrumented
+fun:rte_vlog=uninstrumented
+fun:eal_memalloc_is_contig=uninstrumented
+fun:eal_memalloc_mem_alloc_validate=uninstrumented
+fun:eal_memalloc_mem_alloc_validator_register=uninstrumented
+fun:eal_memalloc_mem_alloc_validator_unregister=uninstrumented
+fun:eal_memalloc_mem_event_callback_register=uninstrumented
+fun:eal_memalloc_mem_event_callback_unregister=uninstrumented
+fun:eal_memalloc_mem_event_notify=uninstrumented
+fun:eal_get_virtual_area=uninstrumented
+fun:rte_dump_physmem_layout=uninstrumented
+fun:rte_eal_get_physmem_size=uninstrumented
+fun:rte_eal_memory_init=uninstrumented
+fun:rte_mem_alloc_validator_register=uninstrumented
+fun:rte_mem_alloc_validator_unregister=uninstrumented
+fun:rte_mem_check_dma_mask=uninstrumented
+fun:rte_mem_check_dma_mask_thread_unsafe=uninstrumented
+fun:rte_mem_event_callback_register=uninstrumented
+fun:rte_mem_event_callback_unregister=uninstrumented
+fun:rte_mem_iova2virt=uninstrumented
+fun:rte_mem_lock_page=uninstrumented
+fun:rte_mem_set_dma_mask=uninstrumented
+fun:rte_mem_virt2memseg=uninstrumented
+fun:rte_mem_virt2memseg_list=uninstrumented
+fun:rte_memory_get_nchannel=uninstrumented
+fun:rte_memory_get_nrank=uninstrumented
+fun:rte_memseg_contig_walk=uninstrumented
+fun:rte_memseg_contig_walk_thread_unsafe=uninstrumented
+fun:rte_memseg_get_fd=uninstrumented
+fun:rte_memseg_get_fd_offset=uninstrumented
+fun:rte_memseg_get_fd_offset_thread_unsafe=uninstrumented
+fun:rte_memseg_get_fd_thread_unsafe=uninstrumented
+fun:rte_memseg_list_walk=uninstrumented
+fun:rte_memseg_list_walk_thread_unsafe=uninstrumented
+fun:rte_memseg_walk=uninstrumented
+fun:rte_memseg_walk_thread_unsafe=uninstrumented
+fun:rte_eal_memzone_init=uninstrumented
+fun:rte_memzone_dump=uninstrumented
+fun:rte_memzone_free=uninstrumented
+fun:rte_memzone_lookup=uninstrumented
+fun:rte_memzone_reserve=uninstrumented
+fun:rte_memzone_reserve_aligned=uninstrumented
+fun:rte_memzone_reserve_bounded=uninstrumented
+fun:rte_memzone_walk=uninstrumented
+fun:eal_adjust_config=uninstrumented
+fun:eal_check_common_options=uninstrumented
+fun:eal_common_usage=uninstrumented
+fun:eal_option_device_parse=uninstrumented
+fun:eal_parse_common_option=uninstrumented
+fun:eal_plugins_init=uninstrumented
+fun:eal_reset_internal_config=uninstrumented
+fun:rte_eal_primary_proc_alive=uninstrumented
+fun:rte_mp_action_register=uninstrumented
+fun:rte_mp_action_unregister=uninstrumented
+fun:rte_mp_channel_init=uninstrumented
+fun:rte_mp_reply=uninstrumented
+fun:rte_mp_request_async=uninstrumented
+fun:rte_mp_request_sync=uninstrumented
+fun:rte_mp_sendmsg=uninstrumented
+fun:rte_strscpy=uninstrumented
+fun:rte_strsplit=uninstrumented
+fun:rte_dump_tailq=uninstrumented
+fun:rte_eal_tailq_lookup=uninstrumented
+fun:rte_eal_tailq_register=uninstrumented
+fun:rte_eal_tailqs_init=uninstrumented
+fun:eal_cpuset_socket_id=uninstrumented
+fun:eal_thread_dump_affinity=uninstrumented
+fun:rte_ctrl_thread_create=uninstrumented
+fun:rte_lcore_has_role=uninstrumented
+fun:rte_socket_id=uninstrumented
+fun:rte_thread_get_affinity=uninstrumented
+fun:rte_thread_set_affinity=uninstrumented
+fun:rte_delay_us_block=uninstrumented
+fun:rte_delay_us_callback_register=uninstrumented
+fun:rte_delay_us_sleep=uninstrumented
+fun:rte_get_tsc_hz=uninstrumented
+fun:set_tsc_freq=uninstrumented
+fun:rte_uuid_compare=uninstrumented
+fun:rte_uuid_is_null=uninstrumented
+fun:rte_uuid_parse=uninstrumented
+fun:rte_uuid_unparse=uninstrumented
+fun:rte_cpu_getauxval=uninstrumented
+fun:rte_cpu_strcmp_auxval=uninstrumented
+fun:__rte_panic=uninstrumented
+fun:rte_dump_registers=uninstrumented
+fun:rte_dump_stack=uninstrumented
+fun:rte_exit=uninstrumented
+fun:dev_sigbus_handler_register=uninstrumented
+fun:dev_sigbus_handler_unregister=uninstrumented
+fun:rte_dev_event_monitor_start=uninstrumented
+fun:rte_dev_event_monitor_stop=uninstrumented
+fun:rte_dev_hotplug_handle_disable=uninstrumented
+fun:rte_dev_hotplug_handle_enable=uninstrumented
+fun:eal_hugepage_info_init=uninstrumented
+fun:eal_hugepage_info_read=uninstrumented
+fun:rte_eal_intr_init=uninstrumented
+fun:rte_epoll_ctl=uninstrumented
+fun:rte_epoll_wait=uninstrumented
+fun:rte_intr_allow_others=uninstrumented
+fun:rte_intr_callback_register=uninstrumented
+fun:rte_intr_callback_unregister=uninstrumented
+fun:rte_intr_cap_multiple=uninstrumented
+fun:rte_intr_disable=uninstrumented
+fun:rte_intr_dp_is_en=uninstrumented
+fun:rte_intr_efd_disable=uninstrumented
+fun:rte_intr_efd_enable=uninstrumented
+fun:rte_intr_enable=uninstrumented
+fun:rte_intr_free_epoll_fd=uninstrumented
+fun:rte_intr_rx_ctl=uninstrumented
+fun:rte_intr_tls_epfd=uninstrumented
+fun:eal_cpu_core_id=uninstrumented
+fun:eal_cpu_detected=uninstrumented
+fun:eal_cpu_socket_id=uninstrumented
+fun:rte_eal_log_init=uninstrumented
+fun:eal_memalloc_alloc_seg=uninstrumented
+fun:eal_memalloc_alloc_seg_bulk=uninstrumented
+fun:eal_memalloc_free_seg=uninstrumented
+fun:eal_memalloc_free_seg_bulk=uninstrumented
+fun:eal_memalloc_get_seg_fd=uninstrumented
+fun:eal_memalloc_get_seg_fd_offset=uninstrumented
+fun:eal_memalloc_init=uninstrumented
+fun:eal_memalloc_set_seg_fd=uninstrumented
+fun:eal_memalloc_sync_with_primary=uninstrumented
+fun:rte_eal_hugepage_attach=uninstrumented
+fun:rte_eal_hugepage_init=uninstrumented
+fun:rte_eal_memseg_init=uninstrumented
+fun:rte_eal_using_phys_addrs=uninstrumented
+fun:rte_mem_virt2iova=uninstrumented
+fun:rte_mem_virt2phy=uninstrumented
+fun:eal_thread_init_master=uninstrumented
+fun:eal_thread_loop=uninstrumented
+fun:rte_eal_remote_launch=uninstrumented
+fun:rte_sys_gettid=uninstrumented
+fun:rte_thread_setname=uninstrumented
+fun:get_tsc_freq=uninstrumented
+fun:rte_eal_hpet_init=uninstrumented
+fun:rte_eal_timer_init=uninstrumented
+fun:rte_get_hpet_cycles=uninstrumented
+fun:rte_get_hpet_hz=uninstrumented
+fun:rte_vfio_clear_group=uninstrumented
+fun:rte_vfio_container_create=uninstrumented
+fun:rte_vfio_container_destroy=uninstrumented
+fun:rte_vfio_container_dma_map=uninstrumented
+fun:rte_vfio_container_dma_unmap=uninstrumented
+fun:rte_vfio_container_group_bind=uninstrumented
+fun:rte_vfio_container_group_unbind=uninstrumented
+fun:rte_vfio_dma_map=uninstrumented
+fun:rte_vfio_dma_unmap=uninstrumented
+fun:rte_vfio_enable=uninstrumented
+fun:rte_vfio_get_container_fd=uninstrumented
+fun:rte_vfio_get_group_fd=uninstrumented
+fun:rte_vfio_get_group_num=uninstrumented
+fun:rte_vfio_is_enabled=uninstrumented
+fun:rte_vfio_noiommu_is_enabled=uninstrumented
+fun:rte_vfio_release_device=uninstrumented
+fun:rte_vfio_setup_device=uninstrumented
+fun:vfio_get_default_container_fd=uninstrumented
+fun:vfio_has_supported_extensions=uninstrumented
+fun:vfio_set_iommu_type=uninstrumented
+fun:vfio_mp_sync_setup=uninstrumented
+fun:eal_dev_hotplug_request_to_primary=uninstrumented
+fun:eal_dev_hotplug_request_to_secondary=uninstrumented
+fun:rte_mp_dev_hotplug_init=uninstrumented
+fun:malloc_elem_alloc=uninstrumented
+fun:malloc_elem_can_hold=uninstrumented
+fun:malloc_elem_dump=uninstrumented
+fun:malloc_elem_find_max_iova_contig=uninstrumented
+fun:malloc_elem_free=uninstrumented
+fun:malloc_elem_free_list_index=uninstrumented
+fun:malloc_elem_free_list_insert=uninstrumented
+fun:malloc_elem_free_list_remove=uninstrumented
+fun:malloc_elem_hide_region=uninstrumented
+fun:malloc_elem_init=uninstrumented
+fun:malloc_elem_insert=uninstrumented
+fun:malloc_elem_join_adjacent_free=uninstrumented
+fun:malloc_elem_resize=uninstrumented
+fun:alloc_pages_on_heap=uninstrumented
+fun:malloc_heap_add_external_memory=uninstrumented
+fun:malloc_heap_alloc=uninstrumented
+fun:malloc_heap_alloc_biggest=uninstrumented
+fun:malloc_heap_create=uninstrumented
+fun:malloc_heap_destroy=uninstrumented
+fun:malloc_heap_dump=uninstrumented
+fun:malloc_heap_free=uninstrumented
+fun:malloc_heap_free_pages=uninstrumented
+fun:malloc_heap_get_stats=uninstrumented
+fun:malloc_heap_remove_external_memory=uninstrumented
+fun:malloc_heap_resize=uninstrumented
+fun:malloc_socket_to_heap_id=uninstrumented
+fun:rollback_expand_heap=uninstrumented
+fun:rte_eal_malloc_heap_init=uninstrumented
+fun:register_mp_requests=uninstrumented
+fun:request_sync=uninstrumented
+fun:request_to_primary=uninstrumented
+fun:rte_cpu_get_flag_enabled=uninstrumented
+fun:rte_cpu_get_flag_name=uninstrumented
+fun:get_tsc_freq_arch=uninstrumented
+fun:rte_hypervisor_get=uninstrumented
+fun:rte_keepalive_create=uninstrumented
+fun:rte_keepalive_dispatch_pings=uninstrumented
+fun:rte_keepalive_mark_alive=uninstrumented
+fun:rte_keepalive_mark_sleep=uninstrumented
+fun:rte_keepalive_register_core=uninstrumented
+fun:rte_keepalive_register_relay_callback=uninstrumented
+fun:rte_calloc=uninstrumented
+fun:rte_calloc_socket=uninstrumented
+fun:rte_free=uninstrumented
+fun:rte_malloc=uninstrumented
+fun:rte_malloc_dump_heaps=uninstrumented
+fun:rte_malloc_dump_stats=uninstrumented
+fun:rte_malloc_get_socket_stats=uninstrumented
+fun:rte_malloc_heap_create=uninstrumented
+fun:rte_malloc_heap_destroy=uninstrumented
+fun:rte_malloc_heap_get_socket=uninstrumented
+fun:rte_malloc_heap_memory_add=uninstrumented
+fun:rte_malloc_heap_memory_attach=uninstrumented
+fun:rte_malloc_heap_memory_detach=uninstrumented
+fun:rte_malloc_heap_memory_remove=uninstrumented
+fun:rte_malloc_heap_socket_is_external=uninstrumented
+fun:rte_malloc_set_limit=uninstrumented
+fun:rte_malloc_socket=uninstrumented
+fun:rte_malloc_validate=uninstrumented
+fun:rte_malloc_virt2iova=uninstrumented
+fun:rte_realloc=uninstrumented
+fun:rte_zmalloc=uninstrumented
+fun:rte_zmalloc_socket=uninstrumented
+fun:rte_option_init=uninstrumented
+fun:rte_option_parse=uninstrumented
+fun:rte_option_register=uninstrumented
+fun:rte_reciprocal_value=uninstrumented
+fun:rte_reciprocal_value_u64=uninstrumented
+fun:rte_service_attr_get=uninstrumented
+fun:rte_service_attr_reset_all=uninstrumented
+fun:rte_service_component_register=uninstrumented
+fun:rte_service_component_runstate_set=uninstrumented
+fun:rte_service_component_unregister=uninstrumented
+fun:rte_service_dump=uninstrumented
+fun:rte_service_finalize=uninstrumented
+fun:rte_service_get_by_name=uninstrumented
+fun:rte_service_get_count=uninstrumented
+fun:rte_service_get_name=uninstrumented
+fun:rte_service_init=uninstrumented
+fun:rte_service_lcore_add=uninstrumented
+fun:rte_service_lcore_attr_get=uninstrumented
+fun:rte_service_lcore_attr_reset_all=uninstrumented
+fun:rte_service_lcore_count=uninstrumented
+fun:rte_service_lcore_count_services=uninstrumented
+fun:rte_service_lcore_del=uninstrumented
+fun:rte_service_lcore_list=uninstrumented
+fun:rte_service_lcore_reset_all=uninstrumented
+fun:rte_service_lcore_start=uninstrumented
+fun:rte_service_lcore_stop=uninstrumented
+fun:rte_service_map_lcore_get=uninstrumented
+fun:rte_service_map_lcore_set=uninstrumented
+fun:rte_service_may_be_active=uninstrumented
+fun:rte_service_probe_capability=uninstrumented
+fun:rte_service_run_iter_on_app_lcore=uninstrumented
+fun:rte_service_runstate_get=uninstrumented
+fun:rte_service_runstate_set=uninstrumented
+fun:rte_service_set_runstate_mapped_check=uninstrumented
+fun:rte_service_set_stats_enable=uninstrumented
+fun:rte_service_start_with_defaults=uninstrumented
+fun:eth_dev_to_id=uninstrumented
+fun:eth_find_device=uninstrumented
+fun:rte_eth_devargs_parse_list=uninstrumented
+fun:rte_eth_devargs_parse_representor_ports=uninstrumented
+fun:__rte_eth_dev_profile_init=uninstrumented
+fun:_rte_eth_dev_callback_process=uninstrumented
+fun:_rte_eth_dev_reset=uninstrumented
+fun:rte_eth_add_first_rx_callback=uninstrumented
+fun:rte_eth_add_rx_callback=uninstrumented
+fun:rte_eth_add_tx_callback=uninstrumented
+fun:rte_eth_allmulticast_disable=uninstrumented
+fun:rte_eth_allmulticast_enable=uninstrumented
+fun:rte_eth_allmulticast_get=uninstrumented
+fun:rte_eth_dev_adjust_nb_rx_tx_desc=uninstrumented
+fun:rte_eth_dev_allocate=uninstrumented
+fun:rte_eth_dev_allocated=uninstrumented
+fun:rte_eth_dev_attach_secondary=uninstrumented
+fun:rte_eth_dev_callback_register=uninstrumented
+fun:rte_eth_dev_callback_unregister=uninstrumented
+fun:rte_eth_dev_close=uninstrumented
+fun:rte_eth_dev_configure=uninstrumented
+fun:rte_eth_dev_count=uninstrumented
+fun:rte_eth_dev_count_avail=uninstrumented
+fun:rte_eth_dev_count_total=uninstrumented
+fun:rte_eth_dev_create=uninstrumented
+fun:rte_eth_dev_default_mac_addr_set=uninstrumented
+fun:rte_eth_dev_destroy=uninstrumented
+fun:rte_eth_dev_filter_ctrl=uninstrumented
+fun:rte_eth_dev_filter_supported=uninstrumented
+fun:rte_eth_dev_flow_ctrl_get=uninstrumented
+fun:rte_eth_dev_flow_ctrl_set=uninstrumented
+fun:rte_eth_dev_fw_version_get=uninstrumented
+fun:rte_eth_dev_get_dcb_info=uninstrumented
+fun:rte_eth_dev_get_eeprom=uninstrumented
+fun:rte_eth_dev_get_eeprom_length=uninstrumented
+fun:rte_eth_dev_get_module_eeprom=uninstrumented
+fun:rte_eth_dev_get_module_info=uninstrumented
+fun:rte_eth_dev_get_mtu=uninstrumented
+fun:rte_eth_dev_get_name_by_port=uninstrumented
+fun:rte_eth_dev_get_port_by_name=uninstrumented
+fun:rte_eth_dev_get_reg_info=uninstrumented
+fun:rte_eth_dev_get_sec_ctx=uninstrumented
+fun:rte_eth_dev_get_supported_ptypes=uninstrumented
+fun:rte_eth_dev_get_vlan_offload=uninstrumented
+fun:rte_eth_dev_info_get=uninstrumented
+fun:rte_eth_dev_is_removed=uninstrumented
+fun:rte_eth_dev_is_valid_port=uninstrumented
+fun:rte_eth_dev_l2_tunnel_eth_type_conf=uninstrumented
+fun:rte_eth_dev_l2_tunnel_offload_set=uninstrumented
+fun:rte_eth_dev_mac_addr_add=uninstrumented
+fun:rte_eth_dev_mac_addr_remove=uninstrumented
+fun:rte_eth_dev_owner_delete=uninstrumented
+fun:rte_eth_dev_owner_get=uninstrumented
+fun:rte_eth_dev_owner_new=uninstrumented
+fun:rte_eth_dev_owner_set=uninstrumented
+fun:rte_eth_dev_owner_unset=uninstrumented
+fun:rte_eth_dev_pool_ops_supported=uninstrumented
+fun:rte_eth_dev_priority_flow_ctrl_set=uninstrumented
+fun:rte_eth_dev_probing_finish=uninstrumented
+fun:rte_eth_dev_release_port=uninstrumented
+fun:rte_eth_dev_reset=uninstrumented
+fun:rte_eth_dev_rss_hash_conf_get=uninstrumented
+fun:rte_eth_dev_rss_hash_update=uninstrumented
+fun:rte_eth_dev_rss_reta_query=uninstrumented
+fun:rte_eth_dev_rss_reta_update=uninstrumented
+fun:rte_eth_dev_rx_intr_ctl=uninstrumented
+fun:rte_eth_dev_rx_intr_ctl_q=uninstrumented
+fun:rte_eth_dev_rx_intr_ctl_q_get_fd=uninstrumented
+fun:rte_eth_dev_rx_intr_disable=uninstrumented
+fun:rte_eth_dev_rx_intr_enable=uninstrumented
+fun:rte_eth_dev_rx_offload_name=uninstrumented
+fun:rte_eth_dev_rx_queue_start=uninstrumented
+fun:rte_eth_dev_rx_queue_stop=uninstrumented
+fun:rte_eth_dev_set_eeprom=uninstrumented
+fun:rte_eth_dev_set_link_down=uninstrumented
+fun:rte_eth_dev_set_link_up=uninstrumented
+fun:rte_eth_dev_set_mc_addr_list=uninstrumented
+fun:rte_eth_dev_set_mtu=uninstrumented
+fun:rte_eth_dev_set_rx_queue_stats_mapping=uninstrumented
+fun:rte_eth_dev_set_tx_queue_stats_mapping=uninstrumented
+fun:rte_eth_dev_set_vlan_ether_type=uninstrumented
+fun:rte_eth_dev_set_vlan_offload=uninstrumented
+fun:rte_eth_dev_set_vlan_pvid=uninstrumented
+fun:rte_eth_dev_set_vlan_strip_on_queue=uninstrumented
+fun:rte_eth_dev_socket_id=uninstrumented
+fun:rte_eth_dev_start=uninstrumented
+fun:rte_eth_dev_stop=uninstrumented
+fun:rte_eth_dev_tx_offload_name=uninstrumented
+fun:rte_eth_dev_tx_queue_start=uninstrumented
+fun:rte_eth_dev_tx_queue_stop=uninstrumented
+fun:rte_eth_dev_uc_all_hash_table_set=uninstrumented
+fun:rte_eth_dev_uc_hash_table_set=uninstrumented
+fun:rte_eth_dev_udp_tunnel_port_add=uninstrumented
+fun:rte_eth_dev_udp_tunnel_port_delete=uninstrumented
+fun:rte_eth_dev_vlan_filter=uninstrumented
+fun:rte_eth_devargs_parse=uninstrumented
+fun:rte_eth_dma_zone_reserve=uninstrumented
+fun:rte_eth_find_next=uninstrumented
+fun:rte_eth_find_next_owned_by=uninstrumented
+fun:rte_eth_iterator_cleanup=uninstrumented
+fun:rte_eth_iterator_init=uninstrumented
+fun:rte_eth_iterator_next=uninstrumented
+fun:rte_eth_led_off=uninstrumented
+fun:rte_eth_led_on=uninstrumented
+fun:rte_eth_link_get=uninstrumented
+fun:rte_eth_link_get_nowait=uninstrumented
+fun:rte_eth_macaddr_get=uninstrumented
+fun:rte_eth_mirror_rule_reset=uninstrumented
+fun:rte_eth_mirror_rule_set=uninstrumented
+fun:rte_eth_promiscuous_disable=uninstrumented
+fun:rte_eth_promiscuous_enable=uninstrumented
+fun:rte_eth_promiscuous_get=uninstrumented
+fun:rte_eth_remove_rx_callback=uninstrumented
+fun:rte_eth_remove_tx_callback=uninstrumented
+fun:rte_eth_rx_queue_info_get=uninstrumented
+fun:rte_eth_rx_queue_setup=uninstrumented
+fun:rte_eth_set_queue_rate_limit=uninstrumented
+fun:rte_eth_speed_bitflag=uninstrumented
+fun:rte_eth_stats_get=uninstrumented
+fun:rte_eth_stats_reset=uninstrumented
+fun:rte_eth_switch_domain_alloc=uninstrumented
+fun:rte_eth_switch_domain_free=uninstrumented
+fun:rte_eth_timesync_adjust_time=uninstrumented
+fun:rte_eth_timesync_disable=uninstrumented
+fun:rte_eth_timesync_enable=uninstrumented
+fun:rte_eth_timesync_read_rx_timestamp=uninstrumented
+fun:rte_eth_timesync_read_time=uninstrumented
+fun:rte_eth_timesync_read_tx_timestamp=uninstrumented
+fun:rte_eth_timesync_write_time=uninstrumented
+fun:rte_eth_tx_buffer_count_callback=uninstrumented
+fun:rte_eth_tx_buffer_drop_callback=uninstrumented
+fun:rte_eth_tx_buffer_init=uninstrumented
+fun:rte_eth_tx_buffer_set_err_callback=uninstrumented
+fun:rte_eth_tx_done_cleanup=uninstrumented
+fun:rte_eth_tx_queue_info_get=uninstrumented
+fun:rte_eth_tx_queue_setup=uninstrumented
+fun:rte_eth_xstats_get=uninstrumented
+fun:rte_eth_xstats_get_by_id=uninstrumented
+fun:rte_eth_xstats_get_id_by_name=uninstrumented
+fun:rte_eth_xstats_get_names=uninstrumented
+fun:rte_eth_xstats_get_names_by_id=uninstrumented
+fun:rte_eth_xstats_reset=uninstrumented
+fun:rte_flow_conv=uninstrumented
+fun:rte_flow_copy=uninstrumented
+fun:rte_flow_create=uninstrumented
+fun:rte_flow_destroy=uninstrumented
+fun:rte_flow_error_set=uninstrumented
+fun:rte_flow_expand_rss=uninstrumented
+fun:rte_flow_flush=uninstrumented
+fun:rte_flow_isolate=uninstrumented
+fun:rte_flow_ops_get=uninstrumented
+fun:rte_flow_query=uninstrumented
+fun:rte_flow_validate=uninstrumented
+fun:rte_mtr_capabilities_get=uninstrumented
+fun:rte_mtr_create=uninstrumented
+fun:rte_mtr_destroy=uninstrumented
+fun:rte_mtr_meter_disable=uninstrumented
+fun:rte_mtr_meter_dscp_table_update=uninstrumented
+fun:rte_mtr_meter_enable=uninstrumented
+fun:rte_mtr_meter_profile_add=uninstrumented
+fun:rte_mtr_meter_profile_delete=uninstrumented
+fun:rte_mtr_meter_profile_update=uninstrumented
+fun:rte_mtr_ops_get=uninstrumented
+fun:rte_mtr_policer_actions_update=uninstrumented
+fun:rte_mtr_stats_read=uninstrumented
+fun:rte_mtr_stats_update=uninstrumented
+fun:rte_tm_capabilities_get=uninstrumented
+fun:rte_tm_get_number_of_leaf_nodes=uninstrumented
+fun:rte_tm_hierarchy_commit=uninstrumented
+fun:rte_tm_level_capabilities_get=uninstrumented
+fun:rte_tm_mark_ip_dscp=uninstrumented
+fun:rte_tm_mark_ip_ecn=uninstrumented
+fun:rte_tm_mark_vlan_dei=uninstrumented
+fun:rte_tm_node_add=uninstrumented
+fun:rte_tm_node_capabilities_get=uninstrumented
+fun:rte_tm_node_cman_update=uninstrumented
+fun:rte_tm_node_delete=uninstrumented
+fun:rte_tm_node_parent_update=uninstrumented
+fun:rte_tm_node_resume=uninstrumented
+fun:rte_tm_node_shaper_update=uninstrumented
+fun:rte_tm_node_shared_shaper_update=uninstrumented
+fun:rte_tm_node_shared_wred_context_update=uninstrumented
+fun:rte_tm_node_stats_read=uninstrumented
+fun:rte_tm_node_stats_update=uninstrumented
+fun:rte_tm_node_suspend=uninstrumented
+fun:rte_tm_node_type_get=uninstrumented
+fun:rte_tm_node_wfq_weight_mode_update=uninstrumented
+fun:rte_tm_node_wred_context_update=uninstrumented
+fun:rte_tm_ops_get=uninstrumented
+fun:rte_tm_shaper_profile_add=uninstrumented
+fun:rte_tm_shaper_profile_delete=uninstrumented
+fun:rte_tm_shared_shaper_add_update=uninstrumented
+fun:rte_tm_shared_shaper_delete=uninstrumented
+fun:rte_tm_shared_wred_context_add_update=uninstrumented
+fun:rte_tm_shared_wred_context_delete=uninstrumented
+fun:rte_tm_wred_profile_add=uninstrumented
+fun:rte_tm_wred_profile_delete=uninstrumented
+fun:gro_tcp4_reassemble=uninstrumented
+fun:gro_tcp4_tbl_create=uninstrumented
+fun:gro_tcp4_tbl_destroy=uninstrumented
+fun:gro_tcp4_tbl_pkt_count=uninstrumented
+fun:gro_tcp4_tbl_timeout_flush=uninstrumented
+fun:gro_vxlan_tcp4_reassemble=uninstrumented
+fun:gro_vxlan_tcp4_tbl_create=uninstrumented
+fun:gro_vxlan_tcp4_tbl_destroy=uninstrumented
+fun:gro_vxlan_tcp4_tbl_pkt_count=uninstrumented
+fun:gro_vxlan_tcp4_tbl_timeout_flush=uninstrumented
+fun:rte_gro_ctx_create=uninstrumented
+fun:rte_gro_ctx_destroy=uninstrumented
+fun:rte_gro_get_pkt_count=uninstrumented
+fun:rte_gro_reassemble=uninstrumented
+fun:rte_gro_reassemble_burst=uninstrumented
+fun:rte_gro_timeout_flush=uninstrumented
+fun:rte_hash_add_key=uninstrumented
+fun:rte_hash_add_key_data=uninstrumented
+fun:rte_hash_add_key_with_hash=uninstrumented
+fun:rte_hash_add_key_with_hash_data=uninstrumented
+fun:rte_hash_count=uninstrumented
+fun:rte_hash_create=uninstrumented
+fun:rte_hash_del_key=uninstrumented
+fun:rte_hash_del_key_with_hash=uninstrumented
+fun:rte_hash_find_existing=uninstrumented
+fun:rte_hash_free=uninstrumented
+fun:rte_hash_free_key_with_position=uninstrumented
+fun:rte_hash_get_key_with_position=uninstrumented
+fun:rte_hash_hash=uninstrumented
+fun:rte_hash_iterate=uninstrumented
+fun:rte_hash_lookup=uninstrumented
+fun:rte_hash_lookup_bulk=uninstrumented
+fun:rte_hash_lookup_bulk_data=uninstrumented
+fun:rte_hash_lookup_data=uninstrumented
+fun:rte_hash_lookup_with_hash=uninstrumented
+fun:rte_hash_lookup_with_hash_data=uninstrumented
+fun:rte_hash_reset=uninstrumented
+fun:rte_hash_set_cmp_func=uninstrumented
+fun:rte_fbk_hash_create=uninstrumented
+fun:rte_fbk_hash_find_existing=uninstrumented
+fun:rte_fbk_hash_free=uninstrumented
+fun:ip_frag_find=uninstrumented
+fun:ip_frag_lookup=uninstrumented
+fun:ip_frag_process=uninstrumented
+fun:rte_frag_table_del_expired_entries=uninstrumented
+fun:rte_ip_frag_free_death_row=uninstrumented
+fun:rte_ip_frag_table_create=uninstrumented
+fun:rte_ip_frag_table_destroy=uninstrumented
+fun:rte_ip_frag_table_statistics_dump=uninstrumented
+fun:rte_ipv4_fragment_packet=uninstrumented
+fun:ipv4_frag_reassemble=uninstrumented
+fun:rte_ipv4_frag_reassemble_packet=uninstrumented
+fun:rte_ipv6_fragment_packet=uninstrumented
+fun:ipv6_frag_reassemble=uninstrumented
+fun:rte_ipv6_frag_reassemble_packet=uninstrumented
+fun:rte_kvargs_count=uninstrumented
+fun:rte_kvargs_free=uninstrumented
+fun:rte_kvargs_parse=uninstrumented
+fun:rte_kvargs_parse_delim=uninstrumented
+fun:rte_kvargs_process=uninstrumented
+fun:rte_kvargs_strcmp=uninstrumented
+fun:__rte_pktmbuf_read=uninstrumented
+fun:rte_get_rx_ol_flag_list=uninstrumented
+fun:rte_get_rx_ol_flag_name=uninstrumented
+fun:rte_get_tx_ol_flag_list=uninstrumented
+fun:rte_get_tx_ol_flag_name=uninstrumented
+fun:rte_mbuf_sanity_check=uninstrumented
+fun:rte_pktmbuf_dump=uninstrumented
+fun:rte_pktmbuf_dynamic_pool_create=uninstrumented
+fun:rte_pktmbuf_init=uninstrumented
+fun:rte_pktmbuf_pool_create=uninstrumented
+fun:rte_pktmbuf_pool_create_by_ops=uninstrumented
+fun:rte_pktmbuf_pool_init=uninstrumented
+fun:rte_mbuf_best_mempool_ops=uninstrumented
+fun:rte_mbuf_platform_mempool_ops=uninstrumented
+fun:rte_mbuf_set_platform_mempool_ops=uninstrumented
+fun:rte_mbuf_set_user_mempool_ops=uninstrumented
+fun:rte_mbuf_user_mempool_ops=uninstrumented
+fun:rte_get_ptype_inner_l2_name=uninstrumented
+fun:rte_get_ptype_inner_l3_name=uninstrumented
+fun:rte_get_ptype_inner_l4_name=uninstrumented
+fun:rte_get_ptype_l2_name=uninstrumented
+fun:rte_get_ptype_l3_name=uninstrumented
+fun:rte_get_ptype_l4_name=uninstrumented
+fun:rte_get_ptype_name=uninstrumented
+fun:rte_get_ptype_tunnel_name=uninstrumented
+fun:rte_mempool_audit=uninstrumented
+fun:rte_mempool_avail_count=uninstrumented
+fun:rte_mempool_cache_create=uninstrumented
+fun:rte_mempool_cache_free=uninstrumented
+fun:rte_mempool_calc_obj_size=uninstrumented
+fun:rte_mempool_check_cookies=uninstrumented
+fun:rte_mempool_contig_blocks_check_cookies=uninstrumented
+fun:rte_mempool_create=uninstrumented
+fun:rte_mempool_create_empty=uninstrumented
+fun:rte_mempool_dump=uninstrumented
+fun:rte_mempool_free=uninstrumented
+fun:rte_mempool_in_use_count=uninstrumented
+fun:rte_mempool_list_dump=uninstrumented
+fun:rte_mempool_lookup=uninstrumented
+fun:rte_mempool_mem_iter=uninstrumented
+fun:rte_mempool_obj_iter=uninstrumented
+fun:rte_mempool_populate_anon=uninstrumented
+fun:rte_mempool_populate_default=uninstrumented
+fun:rte_mempool_populate_iova=uninstrumented
+fun:rte_mempool_populate_virt=uninstrumented
+fun:rte_mempool_walk=uninstrumented
+fun:rte_mempool_ops_alloc=uninstrumented
+fun:rte_mempool_ops_calc_mem_size=uninstrumented
+fun:rte_mempool_ops_free=uninstrumented
+fun:rte_mempool_ops_get_count=uninstrumented
+fun:rte_mempool_ops_get_info=uninstrumented
+fun:rte_mempool_ops_populate=uninstrumented
+fun:rte_mempool_register_ops=uninstrumented
+fun:rte_mempool_set_ops_byname=uninstrumented
+fun:rte_mempool_op_calc_mem_size_default=uninstrumented
+fun:rte_mempool_op_populate_default=uninstrumented
+fun:rte_metrics_get_names=uninstrumented
+fun:rte_metrics_get_values=uninstrumented
+fun:rte_metrics_init=uninstrumented
+fun:rte_metrics_reg_name=uninstrumented
+fun:rte_metrics_reg_names=uninstrumented
+fun:rte_metrics_update_value=uninstrumented
+fun:rte_metrics_update_values=uninstrumented
+fun:rte_net_make_rarp_packet=uninstrumented
+fun:rte_net_get_ptype=uninstrumented
+fun:rte_net_skip_ip6_ext=uninstrumented
+fun:rte_net_crc_calc=uninstrumented
+fun:rte_net_crc_set_alg=uninstrumented
+fun:eal_parse_pci_BDF=uninstrumented
+fun:eal_parse_pci_DomBDF=uninstrumented
+fun:pci_map_resource=uninstrumented
+fun:pci_unmap_resource=uninstrumented
+fun:rte_eal_compare_pci_addr=uninstrumented
+fun:rte_pci_addr_cmp=uninstrumented
+fun:rte_pci_addr_parse=uninstrumented
+fun:rte_pci_device_name=uninstrumented
+fun:rte_eth_from_ring=uninstrumented
+fun:rte_eth_from_rings=uninstrumented
+fun:sock_support_features=uninstrumented
+fun:vhost_kernel_open_sock=uninstrumented
+fun:vhost_kernel_set_sock=uninstrumented
+fun:tap_support_features=uninstrumented
+fun:vhost_kernel_open_tap=uninstrumented
+fun:eth_virtio_dev_init=uninstrumented
+fun:virtio_dev_pause=uninstrumented
+fun:virtio_dev_resume=uninstrumented
+fun:virtio_inject_pkts=uninstrumented
+fun:virtio_interrupt_handler=uninstrumented
+fun:vtpci_get_status=uninstrumented
+fun:vtpci_init=uninstrumented
+fun:vtpci_isr=uninstrumented
+fun:vtpci_msix_detect=uninstrumented
+fun:vtpci_negotiate_features=uninstrumented
+fun:vtpci_read_dev_config=uninstrumented
+fun:vtpci_reinit_complete=uninstrumented
+fun:vtpci_reset=uninstrumented
+fun:vtpci_set_status=uninstrumented
+fun:vtpci_write_dev_config=uninstrumented
+fun:virtio_dev_cq_start=uninstrumented
+fun:virtio_dev_rx_queue_done=uninstrumented
+fun:virtio_dev_rx_queue_setup=uninstrumented
+fun:virtio_dev_rx_queue_setup_finish=uninstrumented
+fun:virtio_dev_tx_queue_setup=uninstrumented
+fun:virtio_dev_tx_queue_setup_finish=uninstrumented
+fun:virtio_recv_mergeable_pkts=uninstrumented
+fun:virtio_recv_mergeable_pkts_inorder=uninstrumented
+fun:virtio_recv_pkts=uninstrumented
+fun:virtio_xmit_pkts=uninstrumented
+fun:virtio_xmit_pkts_inorder=uninstrumented
+fun:vq_ring_free_chain=uninstrumented
+fun:vq_ring_free_inorder=uninstrumented
+fun:virtio_rxq_vec_setup=uninstrumented
+fun:virtio_recv_pkts_vec=uninstrumented
+fun:is_vhost_user_by_type=uninstrumented
+fun:virtio_user_dev_init=uninstrumented
+fun:virtio_user_dev_uninit=uninstrumented
+fun:virtio_user_handle_cq=uninstrumented
+fun:virtio_user_handle_mq=uninstrumented
+fun:virtio_user_start_device=uninstrumented
+fun:virtio_user_stop_device=uninstrumented
+fun:virtqueue_detach_unused=uninstrumented
+fun:virtqueue_rxvq_flush=uninstrumented
+fun:rte_ring_create=uninstrumented
+fun:rte_ring_dump=uninstrumented
+fun:rte_ring_free=uninstrumented
+fun:rte_ring_get_memsize=uninstrumented
+fun:rte_ring_init=uninstrumented
+fun:rte_ring_list_dump=uninstrumented
+fun:rte_ring_lookup=uninstrumented
+fun:rte_timer_dump_stats=uninstrumented
+fun:rte_timer_init=uninstrumented
+fun:rte_timer_manage=uninstrumented
+fun:rte_timer_pending=uninstrumented
+fun:rte_timer_reset=uninstrumented
+fun:rte_timer_reset_sync=uninstrumented
+fun:rte_timer_stop=uninstrumented
+fun:rte_timer_stop_sync=uninstrumented
+fun:rte_timer_subsystem_init=uninstrumented
+fun:pci_find_max_end_va=discard
+fun:pci_parse_one_sysfs_resource=discard
+fun:pci_update_device=discard
+fun:rte_pci_get_iommu_class=discard
+fun:rte_pci_ioport_map=discard
+fun:rte_pci_ioport_read=discard
+fun:rte_pci_ioport_unmap=discard
+fun:rte_pci_ioport_write=discard
+fun:rte_pci_map_device=discard
+fun:rte_pci_read_config=discard
+fun:rte_pci_scan=discard
+fun:rte_pci_unmap_device=discard
+fun:rte_pci_write_config=discard
+fun:pci_name_set=discard
+fun:rte_pci_add_device=discard
+fun:rte_pci_dump=discard
+fun:rte_pci_get_sysfs_path=discard
+fun:rte_pci_insert_device=discard
+fun:rte_pci_match=discard
+fun:rte_pci_probe=discard
+fun:rte_pci_register=discard
+fun:rte_pci_unregister=discard
+fun:pci_uio_map_resource=discard
+fun:pci_uio_remap_resource=discard
+fun:pci_uio_unmap_resource=discard
+fun:rte_pci_dev_iterate=discard
+fun:pci_uio_alloc_resource=discard
+fun:pci_uio_free_resource=discard
+fun:pci_uio_ioport_map=discard
+fun:pci_uio_ioport_read=discard
+fun:pci_uio_ioport_unmap=discard
+fun:pci_uio_ioport_write=discard
+fun:pci_uio_map_resource_by_index=discard
+fun:pci_uio_read_config=discard
+fun:pci_uio_write_config=discard
+fun:pci_vfio_ioport_map=discard
+fun:pci_vfio_ioport_read=discard
+fun:pci_vfio_ioport_unmap=discard
+fun:pci_vfio_ioport_write=discard
+fun:pci_vfio_is_enabled=discard
+fun:pci_vfio_map_resource=discard
+fun:pci_vfio_read_config=discard
+fun:pci_vfio_unmap_resource=discard
+fun:pci_vfio_write_config=discard
+fun:rte_vdev_add_custom_scan=discard
+fun:rte_vdev_find_device=discard
+fun:rte_vdev_init=discard
+fun:rte_vdev_register=discard
+fun:rte_vdev_remove_custom_scan=discard
+fun:rte_vdev_uninit=discard
+fun:rte_vdev_unregister=discard
+fun:rte_vdev_dev_iterate=discard
+fun:cmdline_free=discard
+fun:cmdline_in=discard
+fun:cmdline_interact=discard
+fun:cmdline_new=discard
+fun:cmdline_poll=discard
+fun:cmdline_printf=discard
+fun:cmdline_quit=discard
+fun:cmdline_set_prompt=discard
+fun:cmdline_write_char=discard
+fun:cirbuf_add_buf_head=discard
+fun:cirbuf_add_buf_tail=discard
+fun:cirbuf_add_head=discard
+fun:cirbuf_add_head_safe=discard
+fun:cirbuf_add_tail=discard
+fun:cirbuf_add_tail_safe=discard
+fun:cirbuf_align_left=discard
+fun:cirbuf_align_right=discard
+fun:cirbuf_del_buf_head=discard
+fun:cirbuf_del_buf_tail=discard
+fun:cirbuf_del_head=discard
+fun:cirbuf_del_head_safe=discard
+fun:cirbuf_del_tail=discard
+fun:cirbuf_del_tail_safe=discard
+fun:cirbuf_get_buf_head=discard
+fun:cirbuf_get_buf_tail=discard
+fun:cirbuf_get_head=discard
+fun:cirbuf_get_tail=discard
+fun:cirbuf_init=discard
+fun:cmdline_complete=discard
+fun:cmdline_isendofcommand=discard
+fun:cmdline_isendoftoken=discard
+fun:cmdline_parse=discard
+fun:cmdline_get_help_etheraddr=discard
+fun:cmdline_parse_etheraddr=discard
+fun:cmdline_get_help_ipaddr=discard
+fun:cmdline_parse_ipaddr=discard
+fun:cmdline_get_help_num=discard
+fun:cmdline_parse_num=discard
+fun:cmdline_get_help_portlist=discard
+fun:cmdline_parse_portlist=discard
+fun:cmdline_complete_get_elt_string=discard
+fun:cmdline_complete_get_nb_string=discard
+fun:cmdline_get_help_string=discard
+fun:cmdline_parse_string=discard
+fun:rdline_add_history=discard
+fun:rdline_char_in=discard
+fun:rdline_clear_history=discard
+fun:rdline_get_buffer=discard
+fun:rdline_get_history_item=discard
+fun:rdline_init=discard
+fun:rdline_newline=discard
+fun:rdline_quit=discard
+fun:rdline_redisplay=discard
+fun:rdline_reset=discard
+fun:rdline_restart=discard
+fun:rdline_stop=discard
+fun:cmdline_file_new=discard
+fun:cmdline_stdin_exit=discard
+fun:cmdline_stdin_new=discard
+fun:vt100_init=discard
+fun:vt100_parser=discard
+fun:eal_create_runtime_dir=discard
+fun:eal_parse_sysfs_value=discard
+fun:eal_proc_type_detect=discard
+fun:rte_eal_check_module=discard
+fun:rte_eal_cleanup=discard
+fun:rte_eal_create_uio_dev=discard
+fun:rte_eal_get_configuration=discard
+fun:rte_eal_get_runtime_dir=discard
+fun:rte_eal_has_hugepages=discard
+fun:rte_eal_has_pci=discard
+fun:rte_eal_init=discard
+fun:rte_eal_iopl_init=discard
+fun:rte_eal_iova_mode=discard
+fun:rte_eal_lcore_role=discard
+fun:rte_eal_mbuf_user_pool_ops=discard
+fun:rte_eal_process_type=discard
+fun:rte_eal_vfio_intr_mode=discard
+fun:rte_set_application_usage_hook=discard
+fun:rte_eal_alarm_cancel=discard
+fun:rte_eal_alarm_init=discard
+fun:rte_eal_alarm_set=discard
+fun:rte_bus_dump=discard
+fun:rte_bus_find=discard
+fun:rte_bus_find_by_device=discard
+fun:rte_bus_find_by_device_name=discard
+fun:rte_bus_find_by_name=discard
+fun:rte_bus_get_iommu_class=discard
+fun:rte_bus_probe=discard
+fun:rte_bus_register=discard
+fun:rte_bus_scan=discard
+fun:rte_bus_sigbus_handler=discard
+fun:rte_bus_unregister=discard
+fun:rte_class_find=discard
+fun:rte_class_find_by_name=discard
+fun:rte_class_register=discard
+fun:rte_class_unregister=discard
+fun:rte_cpu_check_supported=discard
+fun:rte_cpu_is_supported=discard
+fun:local_dev_probe=discard
+fun:local_dev_remove=discard
+fun:rte_dev_event_callback_process=discard
+fun:rte_dev_event_callback_register=discard
+fun:rte_dev_event_callback_unregister=discard
+fun:rte_dev_is_probed=discard
+fun:rte_dev_iterator_init=discard
+fun:rte_dev_iterator_next=discard
+fun:rte_dev_probe=discard
+fun:rte_dev_remove=discard
+fun:rte_eal_hotplug_add=discard
+fun:rte_eal_hotplug_remove=discard
+fun:rte_devargs_add=discard
+fun:rte_devargs_dump=discard
+fun:rte_devargs_insert=discard
+fun:rte_devargs_layers_parse=discard
+fun:rte_devargs_next=discard
+fun:rte_devargs_parse=discard
+fun:rte_devargs_parsef=discard
+fun:rte_devargs_remove=discard
+fun:rte_devargs_type_count=discard
+fun:rte_strerror=discard
+fun:rte_fbarray_attach=discard
+fun:rte_fbarray_destroy=discard
+fun:rte_fbarray_detach=discard
+fun:rte_fbarray_dump_metadata=discard
+fun:rte_fbarray_find_contig_free=discard
+fun:rte_fbarray_find_contig_used=discard
+fun:rte_fbarray_find_idx=discard
+fun:rte_fbarray_find_next_free=discard
+fun:rte_fbarray_find_next_n_free=discard
+fun:rte_fbarray_find_next_n_used=discard
+fun:rte_fbarray_find_next_used=discard
+fun:rte_fbarray_find_prev_free=discard
+fun:rte_fbarray_find_prev_n_free=discard
+fun:rte_fbarray_find_prev_n_used=discard
+fun:rte_fbarray_find_prev_used=discard
+fun:rte_fbarray_find_rev_contig_free=discard
+fun:rte_fbarray_find_rev_contig_used=discard
+fun:rte_fbarray_get=discard
+fun:rte_fbarray_init=discard
+fun:rte_fbarray_is_used=discard
+fun:rte_fbarray_set_free=discard
+fun:rte_fbarray_set_used=discard
+fun:rte_hexdump=discard
+fun:rte_memdump=discard
+fun:rte_hypervisor_get_name=discard
+fun:rte_eal_get_lcore_state=discard
+fun:rte_eal_mp_remote_launch=discard
+fun:rte_eal_mp_wait_lcore=discard
+fun:rte_eal_wait_lcore=discard
+fun:rte_eal_cpu_init=discard
+fun:rte_socket_count=discard
+fun:rte_socket_id_by_idx=discard
+fun:eal_log_set_default=discard
+fun:rte_log=discard
+fun:rte_log_cur_msg_loglevel=discard
+fun:rte_log_cur_msg_logtype=discard
+fun:rte_log_dump=discard
+fun:rte_log_get_global_level=discard
+fun:rte_log_get_level=discard
+fun:rte_log_register=discard
+fun:rte_log_register_type_and_pick_level=discard
+fun:rte_log_save_pattern=discard
+fun:rte_log_save_regexp=discard
+fun:rte_log_set_global_level=discard
+fun:rte_log_set_level=discard
+fun:rte_log_set_level_pattern=discard
+fun:rte_log_set_level_regexp=discard
+fun:rte_openlog_stream=discard
+fun:rte_vlog=discard
+fun:eal_memalloc_is_contig=discard
+fun:eal_memalloc_mem_alloc_validate=discard
+fun:eal_memalloc_mem_alloc_validator_register=discard
+fun:eal_memalloc_mem_alloc_validator_unregister=discard
+fun:eal_memalloc_mem_event_callback_register=discard
+fun:eal_memalloc_mem_event_callback_unregister=discard
+fun:eal_memalloc_mem_event_notify=discard
+fun:eal_get_virtual_area=discard
+fun:rte_dump_physmem_layout=discard
+fun:rte_eal_get_physmem_size=discard
+fun:rte_eal_memory_init=discard
+fun:rte_mem_alloc_validator_register=discard
+fun:rte_mem_alloc_validator_unregister=discard
+fun:rte_mem_check_dma_mask=discard
+fun:rte_mem_check_dma_mask_thread_unsafe=discard
+fun:rte_mem_event_callback_register=discard
+fun:rte_mem_event_callback_unregister=discard
+fun:rte_mem_iova2virt=discard
+fun:rte_mem_lock_page=discard
+fun:rte_mem_set_dma_mask=discard
+fun:rte_mem_virt2memseg=discard
+fun:rte_mem_virt2memseg_list=discard
+fun:rte_memory_get_nchannel=discard
+fun:rte_memory_get_nrank=discard
+fun:rte_memseg_contig_walk=discard
+fun:rte_memseg_contig_walk_thread_unsafe=discard
+fun:rte_memseg_get_fd=discard
+fun:rte_memseg_get_fd_offset=discard
+fun:rte_memseg_get_fd_offset_thread_unsafe=discard
+fun:rte_memseg_get_fd_thread_unsafe=discard
+fun:rte_memseg_list_walk=discard
+fun:rte_memseg_list_walk_thread_unsafe=discard
+fun:rte_memseg_walk=discard
+fun:rte_memseg_walk_thread_unsafe=discard
+fun:rte_eal_memzone_init=discard
+fun:rte_memzone_dump=discard
+fun:rte_memzone_free=discard
+fun:rte_memzone_lookup=discard
+fun:rte_memzone_reserve=discard
+fun:rte_memzone_reserve_aligned=discard
+fun:rte_memzone_reserve_bounded=discard
+fun:rte_memzone_walk=discard
+fun:eal_adjust_config=discard
+fun:eal_check_common_options=discard
+fun:eal_common_usage=discard
+fun:eal_option_device_parse=discard
+fun:eal_parse_common_option=discard
+fun:eal_plugins_init=discard
+fun:eal_reset_internal_config=discard
+fun:rte_eal_primary_proc_alive=discard
+fun:rte_mp_action_register=discard
+fun:rte_mp_action_unregister=discard
+fun:rte_mp_channel_init=discard
+fun:rte_mp_reply=discard
+fun:rte_mp_request_async=discard
+fun:rte_mp_request_sync=discard
+fun:rte_mp_sendmsg=discard
+fun:rte_strscpy=discard
+fun:rte_strsplit=discard
+fun:rte_dump_tailq=discard
+fun:rte_eal_tailq_lookup=discard
+fun:rte_eal_tailq_register=discard
+fun:rte_eal_tailqs_init=discard
+fun:eal_cpuset_socket_id=discard
+fun:eal_thread_dump_affinity=discard
+fun:rte_ctrl_thread_create=discard
+fun:rte_lcore_has_role=discard
+fun:rte_socket_id=discard
+fun:rte_thread_get_affinity=discard
+fun:rte_thread_set_affinity=discard
+fun:rte_delay_us_block=discard
+fun:rte_delay_us_callback_register=discard
+fun:rte_delay_us_sleep=discard
+fun:rte_get_tsc_hz=discard
+fun:set_tsc_freq=discard
+fun:rte_uuid_compare=discard
+fun:rte_uuid_is_null=discard
+fun:rte_uuid_parse=discard
+fun:rte_uuid_unparse=discard
+fun:rte_cpu_getauxval=discard
+fun:rte_cpu_strcmp_auxval=discard
+fun:__rte_panic=discard
+fun:rte_dump_registers=discard
+fun:rte_dump_stack=discard
+fun:rte_exit=discard
+fun:dev_sigbus_handler_register=discard
+fun:dev_sigbus_handler_unregister=discard
+fun:rte_dev_event_monitor_start=discard
+fun:rte_dev_event_monitor_stop=discard
+fun:rte_dev_hotplug_handle_disable=discard
+fun:rte_dev_hotplug_handle_enable=discard
+fun:eal_hugepage_info_init=discard
+fun:eal_hugepage_info_read=discard
+fun:rte_eal_intr_init=discard
+fun:rte_epoll_ctl=discard
+fun:rte_epoll_wait=discard
+fun:rte_intr_allow_others=discard
+fun:rte_intr_callback_register=discard
+fun:rte_intr_callback_unregister=discard
+fun:rte_intr_cap_multiple=discard
+fun:rte_intr_disable=discard
+fun:rte_intr_dp_is_en=discard
+fun:rte_intr_efd_disable=discard
+fun:rte_intr_efd_enable=discard
+fun:rte_intr_enable=discard
+fun:rte_intr_free_epoll_fd=discard
+fun:rte_intr_rx_ctl=discard
+fun:rte_intr_tls_epfd=discard
+fun:eal_cpu_core_id=discard
+fun:eal_cpu_detected=discard
+fun:eal_cpu_socket_id=discard
+fun:rte_eal_log_init=discard
+fun:eal_memalloc_alloc_seg=discard
+fun:eal_memalloc_alloc_seg_bulk=discard
+fun:eal_memalloc_free_seg=discard
+fun:eal_memalloc_free_seg_bulk=discard
+fun:eal_memalloc_get_seg_fd=discard
+fun:eal_memalloc_get_seg_fd_offset=discard
+fun:eal_memalloc_init=discard
+fun:eal_memalloc_set_seg_fd=discard
+fun:eal_memalloc_sync_with_primary=discard
+fun:rte_eal_hugepage_attach=discard
+fun:rte_eal_hugepage_init=discard
+fun:rte_eal_memseg_init=discard
+fun:rte_eal_using_phys_addrs=discard
+fun:rte_mem_virt2iova=discard
+fun:rte_mem_virt2phy=discard
+fun:eal_thread_init_master=discard
+fun:eal_thread_loop=discard
+fun:rte_eal_remote_launch=discard
+fun:rte_sys_gettid=discard
+fun:rte_thread_setname=discard
+fun:get_tsc_freq=discard
+fun:rte_eal_hpet_init=discard
+fun:rte_eal_timer_init=discard
+fun:rte_get_hpet_cycles=discard
+fun:rte_get_hpet_hz=discard
+fun:rte_vfio_clear_group=discard
+fun:rte_vfio_container_create=discard
+fun:rte_vfio_container_destroy=discard
+fun:rte_vfio_container_dma_map=discard
+fun:rte_vfio_container_dma_unmap=discard
+fun:rte_vfio_container_group_bind=discard
+fun:rte_vfio_container_group_unbind=discard
+fun:rte_vfio_dma_map=discard
+fun:rte_vfio_dma_unmap=discard
+fun:rte_vfio_enable=discard
+fun:rte_vfio_get_container_fd=discard
+fun:rte_vfio_get_group_fd=discard
+fun:rte_vfio_get_group_num=discard
+fun:rte_vfio_is_enabled=discard
+fun:rte_vfio_noiommu_is_enabled=discard
+fun:rte_vfio_release_device=discard
+fun:rte_vfio_setup_device=discard
+fun:vfio_get_default_container_fd=discard
+fun:vfio_has_supported_extensions=discard
+fun:vfio_set_iommu_type=discard
+fun:vfio_mp_sync_setup=discard
+fun:eal_dev_hotplug_request_to_primary=discard
+fun:eal_dev_hotplug_request_to_secondary=discard
+fun:rte_mp_dev_hotplug_init=discard
+fun:malloc_elem_alloc=discard
+fun:malloc_elem_can_hold=discard
+fun:malloc_elem_dump=discard
+fun:malloc_elem_find_max_iova_contig=discard
+fun:malloc_elem_free=discard
+fun:malloc_elem_free_list_index=discard
+fun:malloc_elem_free_list_insert=discard
+fun:malloc_elem_free_list_remove=discard
+fun:malloc_elem_hide_region=discard
+fun:malloc_elem_init=discard
+fun:malloc_elem_insert=discard
+fun:malloc_elem_join_adjacent_free=discard
+fun:malloc_elem_resize=discard
+fun:alloc_pages_on_heap=discard
+fun:malloc_heap_add_external_memory=discard
+fun:malloc_heap_alloc=discard
+fun:malloc_heap_alloc_biggest=discard
+fun:malloc_heap_create=discard
+fun:malloc_heap_destroy=discard
+fun:malloc_heap_dump=discard
+fun:malloc_heap_free=discard
+fun:malloc_heap_free_pages=discard
+fun:malloc_heap_get_stats=discard
+fun:malloc_heap_remove_external_memory=discard
+fun:malloc_heap_resize=discard
+fun:malloc_socket_to_heap_id=discard
+fun:rollback_expand_heap=discard
+fun:rte_eal_malloc_heap_init=discard
+fun:register_mp_requests=discard
+fun:request_sync=discard
+fun:request_to_primary=discard
+fun:rte_cpu_get_flag_enabled=discard
+fun:rte_cpu_get_flag_name=discard
+fun:get_tsc_freq_arch=discard
+fun:rte_hypervisor_get=discard
+fun:rte_keepalive_create=discard
+fun:rte_keepalive_dispatch_pings=discard
+fun:rte_keepalive_mark_alive=discard
+fun:rte_keepalive_mark_sleep=discard
+fun:rte_keepalive_register_core=discard
+fun:rte_keepalive_register_relay_callback=discard
+fun:rte_calloc=discard
+fun:rte_calloc_socket=discard
+fun:rte_free=discard
+fun:rte_malloc=discard
+fun:rte_malloc_dump_heaps=discard
+fun:rte_malloc_dump_stats=discard
+fun:rte_malloc_get_socket_stats=discard
+fun:rte_malloc_heap_create=discard
+fun:rte_malloc_heap_destroy=discard
+fun:rte_malloc_heap_get_socket=discard
+fun:rte_malloc_heap_memory_add=discard
+fun:rte_malloc_heap_memory_attach=discard
+fun:rte_malloc_heap_memory_detach=discard
+fun:rte_malloc_heap_memory_remove=discard
+fun:rte_malloc_heap_socket_is_external=discard
+fun:rte_malloc_set_limit=discard
+fun:rte_malloc_socket=discard
+fun:rte_malloc_validate=discard
+fun:rte_malloc_virt2iova=discard
+fun:rte_realloc=discard
+fun:rte_zmalloc=discard
+fun:rte_zmalloc_socket=discard
+fun:rte_option_init=discard
+fun:rte_option_parse=discard
+fun:rte_option_register=discard
+fun:rte_reciprocal_value=discard
+fun:rte_reciprocal_value_u64=discard
+fun:rte_service_attr_get=discard
+fun:rte_service_attr_reset_all=discard
+fun:rte_service_component_register=discard
+fun:rte_service_component_runstate_set=discard
+fun:rte_service_component_unregister=discard
+fun:rte_service_dump=discard
+fun:rte_service_finalize=discard
+fun:rte_service_get_by_name=discard
+fun:rte_service_get_count=discard
+fun:rte_service_get_name=discard
+fun:rte_service_init=discard
+fun:rte_service_lcore_add=discard
+fun:rte_service_lcore_attr_get=discard
+fun:rte_service_lcore_attr_reset_all=discard
+fun:rte_service_lcore_count=discard
+fun:rte_service_lcore_count_services=discard
+fun:rte_service_lcore_del=discard
+fun:rte_service_lcore_list=discard
+fun:rte_service_lcore_reset_all=discard
+fun:rte_service_lcore_start=discard
+fun:rte_service_lcore_stop=discard
+fun:rte_service_map_lcore_get=discard
+fun:rte_service_map_lcore_set=discard
+fun:rte_service_may_be_active=discard
+fun:rte_service_probe_capability=discard
+fun:rte_service_run_iter_on_app_lcore=discard
+fun:rte_service_runstate_get=discard
+fun:rte_service_runstate_set=discard
+fun:rte_service_set_runstate_mapped_check=discard
+fun:rte_service_set_stats_enable=discard
+fun:rte_service_start_with_defaults=discard
+fun:eth_dev_to_id=discard
+fun:eth_find_device=discard
+fun:rte_eth_devargs_parse_list=discard
+fun:rte_eth_devargs_parse_representor_ports=discard
+fun:__rte_eth_dev_profile_init=discard
+fun:_rte_eth_dev_callback_process=discard
+fun:_rte_eth_dev_reset=discard
+fun:rte_eth_add_first_rx_callback=discard
+fun:rte_eth_add_rx_callback=discard
+fun:rte_eth_add_tx_callback=discard
+fun:rte_eth_allmulticast_disable=discard
+fun:rte_eth_allmulticast_enable=discard
+fun:rte_eth_allmulticast_get=discard
+fun:rte_eth_dev_adjust_nb_rx_tx_desc=discard
+fun:rte_eth_dev_allocate=discard
+fun:rte_eth_dev_allocated=discard
+fun:rte_eth_dev_attach_secondary=discard
+fun:rte_eth_dev_callback_register=discard
+fun:rte_eth_dev_callback_unregister=discard
+fun:rte_eth_dev_close=discard
+fun:rte_eth_dev_configure=discard
+fun:rte_eth_dev_count=discard
+fun:rte_eth_dev_count_avail=discard
+fun:rte_eth_dev_count_total=discard
+fun:rte_eth_dev_create=discard
+fun:rte_eth_dev_default_mac_addr_set=discard
+fun:rte_eth_dev_destroy=discard
+fun:rte_eth_dev_filter_ctrl=discard
+fun:rte_eth_dev_filter_supported=discard
+fun:rte_eth_dev_flow_ctrl_get=discard
+fun:rte_eth_dev_flow_ctrl_set=discard
+fun:rte_eth_dev_fw_version_get=discard
+fun:rte_eth_dev_get_dcb_info=discard
+fun:rte_eth_dev_get_eeprom=discard
+fun:rte_eth_dev_get_eeprom_length=discard
+fun:rte_eth_dev_get_module_eeprom=discard
+fun:rte_eth_dev_get_module_info=discard
+fun:rte_eth_dev_get_mtu=discard
+fun:rte_eth_dev_get_name_by_port=discard
+fun:rte_eth_dev_get_port_by_name=discard
+fun:rte_eth_dev_get_reg_info=discard
+fun:rte_eth_dev_get_sec_ctx=discard
+fun:rte_eth_dev_get_supported_ptypes=discard
+fun:rte_eth_dev_get_vlan_offload=discard
+fun:rte_eth_dev_info_get=discard
+fun:rte_eth_dev_is_removed=discard
+fun:rte_eth_dev_is_valid_port=discard
+fun:rte_eth_dev_l2_tunnel_eth_type_conf=discard
+fun:rte_eth_dev_l2_tunnel_offload_set=discard
+fun:rte_eth_dev_mac_addr_add=discard
+fun:rte_eth_dev_mac_addr_remove=discard
+fun:rte_eth_dev_owner_delete=discard
+fun:rte_eth_dev_owner_get=discard
+fun:rte_eth_dev_owner_new=discard
+fun:rte_eth_dev_owner_set=discard
+fun:rte_eth_dev_owner_unset=discard
+fun:rte_eth_dev_pool_ops_supported=discard
+fun:rte_eth_dev_priority_flow_ctrl_set=discard
+fun:rte_eth_dev_probing_finish=discard
+fun:rte_eth_dev_release_port=discard
+fun:rte_eth_dev_reset=discard
+fun:rte_eth_dev_rss_hash_conf_get=discard
+fun:rte_eth_dev_rss_hash_update=discard
+fun:rte_eth_dev_rss_reta_query=discard
+fun:rte_eth_dev_rss_reta_update=discard
+fun:rte_eth_dev_rx_intr_ctl=discard
+fun:rte_eth_dev_rx_intr_ctl_q=discard
+fun:rte_eth_dev_rx_intr_ctl_q_get_fd=discard
+fun:rte_eth_dev_rx_intr_disable=discard
+fun:rte_eth_dev_rx_intr_enable=discard
+fun:rte_eth_dev_rx_offload_name=discard
+fun:rte_eth_dev_rx_queue_start=discard
+fun:rte_eth_dev_rx_queue_stop=discard
+fun:rte_eth_dev_set_eeprom=discard
+fun:rte_eth_dev_set_link_down=discard
+fun:rte_eth_dev_set_link_up=discard
+fun:rte_eth_dev_set_mc_addr_list=discard
+fun:rte_eth_dev_set_mtu=discard
+fun:rte_eth_dev_set_rx_queue_stats_mapping=discard
+fun:rte_eth_dev_set_tx_queue_stats_mapping=discard
+fun:rte_eth_dev_set_vlan_ether_type=discard
+fun:rte_eth_dev_set_vlan_offload=discard
+fun:rte_eth_dev_set_vlan_pvid=discard
+fun:rte_eth_dev_set_vlan_strip_on_queue=discard
+fun:rte_eth_dev_socket_id=discard
+fun:rte_eth_dev_start=discard
+fun:rte_eth_dev_stop=discard
+fun:rte_eth_dev_tx_offload_name=discard
+fun:rte_eth_dev_tx_queue_start=discard
+fun:rte_eth_dev_tx_queue_stop=discard
+fun:rte_eth_dev_uc_all_hash_table_set=discard
+fun:rte_eth_dev_uc_hash_table_set=discard
+fun:rte_eth_dev_udp_tunnel_port_add=discard
+fun:rte_eth_dev_udp_tunnel_port_delete=discard
+fun:rte_eth_dev_vlan_filter=discard
+fun:rte_eth_devargs_parse=discard
+fun:rte_eth_dma_zone_reserve=discard
+fun:rte_eth_find_next=discard
+fun:rte_eth_find_next_owned_by=discard
+fun:rte_eth_iterator_cleanup=discard
+fun:rte_eth_iterator_init=discard
+fun:rte_eth_iterator_next=discard
+fun:rte_eth_led_off=discard
+fun:rte_eth_led_on=discard
+fun:rte_eth_link_get=discard
+fun:rte_eth_link_get_nowait=discard
+fun:rte_eth_macaddr_get=discard
+fun:rte_eth_mirror_rule_reset=discard
+fun:rte_eth_mirror_rule_set=discard
+fun:rte_eth_promiscuous_disable=discard
+fun:rte_eth_promiscuous_enable=discard
+fun:rte_eth_promiscuous_get=discard
+fun:rte_eth_remove_rx_callback=discard
+fun:rte_eth_remove_tx_callback=discard
+fun:rte_eth_rx_queue_info_get=discard
+fun:rte_eth_rx_queue_setup=discard
+fun:rte_eth_set_queue_rate_limit=discard
+fun:rte_eth_speed_bitflag=discard
+fun:rte_eth_stats_get=discard
+fun:rte_eth_stats_reset=discard
+fun:rte_eth_switch_domain_alloc=discard
+fun:rte_eth_switch_domain_free=discard
+fun:rte_eth_timesync_adjust_time=discard
+fun:rte_eth_timesync_disable=discard
+fun:rte_eth_timesync_enable=discard
+fun:rte_eth_timesync_read_rx_timestamp=discard
+fun:rte_eth_timesync_read_time=discard
+fun:rte_eth_timesync_read_tx_timestamp=discard
+fun:rte_eth_timesync_write_time=discard
+fun:rte_eth_tx_buffer_count_callback=discard
+fun:rte_eth_tx_buffer_drop_callback=discard
+fun:rte_eth_tx_buffer_init=discard
+fun:rte_eth_tx_buffer_set_err_callback=discard
+fun:rte_eth_tx_done_cleanup=discard
+fun:rte_eth_tx_queue_info_get=discard
+fun:rte_eth_tx_queue_setup=discard
+fun:rte_eth_xstats_get=discard
+fun:rte_eth_xstats_get_by_id=discard
+fun:rte_eth_xstats_get_id_by_name=discard
+fun:rte_eth_xstats_get_names=discard
+fun:rte_eth_xstats_get_names_by_id=discard
+fun:rte_eth_xstats_reset=discard
+fun:rte_flow_conv=discard
+fun:rte_flow_copy=discard
+fun:rte_flow_create=discard
+fun:rte_flow_destroy=discard
+fun:rte_flow_error_set=discard
+fun:rte_flow_expand_rss=discard
+fun:rte_flow_flush=discard
+fun:rte_flow_isolate=discard
+fun:rte_flow_ops_get=discard
+fun:rte_flow_query=discard
+fun:rte_flow_validate=discard
+fun:rte_mtr_capabilities_get=discard
+fun:rte_mtr_create=discard
+fun:rte_mtr_destroy=discard
+fun:rte_mtr_meter_disable=discard
+fun:rte_mtr_meter_dscp_table_update=discard
+fun:rte_mtr_meter_enable=discard
+fun:rte_mtr_meter_profile_add=discard
+fun:rte_mtr_meter_profile_delete=discard
+fun:rte_mtr_meter_profile_update=discard
+fun:rte_mtr_ops_get=discard
+fun:rte_mtr_policer_actions_update=discard
+fun:rte_mtr_stats_read=discard
+fun:rte_mtr_stats_update=discard
+fun:rte_tm_capabilities_get=discard
+fun:rte_tm_get_number_of_leaf_nodes=discard
+fun:rte_tm_hierarchy_commit=discard
+fun:rte_tm_level_capabilities_get=discard
+fun:rte_tm_mark_ip_dscp=discard
+fun:rte_tm_mark_ip_ecn=discard
+fun:rte_tm_mark_vlan_dei=discard
+fun:rte_tm_node_add=discard
+fun:rte_tm_node_capabilities_get=discard
+fun:rte_tm_node_cman_update=discard
+fun:rte_tm_node_delete=discard
+fun:rte_tm_node_parent_update=discard
+fun:rte_tm_node_resume=discard
+fun:rte_tm_node_shaper_update=discard
+fun:rte_tm_node_shared_shaper_update=discard
+fun:rte_tm_node_shared_wred_context_update=discard
+fun:rte_tm_node_stats_read=discard
+fun:rte_tm_node_stats_update=discard
+fun:rte_tm_node_suspend=discard
+fun:rte_tm_node_type_get=discard
+fun:rte_tm_node_wfq_weight_mode_update=discard
+fun:rte_tm_node_wred_context_update=discard
+fun:rte_tm_ops_get=discard
+fun:rte_tm_shaper_profile_add=discard
+fun:rte_tm_shaper_profile_delete=discard
+fun:rte_tm_shared_shaper_add_update=discard
+fun:rte_tm_shared_shaper_delete=discard
+fun:rte_tm_shared_wred_context_add_update=discard
+fun:rte_tm_shared_wred_context_delete=discard
+fun:rte_tm_wred_profile_add=discard
+fun:rte_tm_wred_profile_delete=discard
+fun:gro_tcp4_reassemble=discard
+fun:gro_tcp4_tbl_create=discard
+fun:gro_tcp4_tbl_destroy=discard
+fun:gro_tcp4_tbl_pkt_count=discard
+fun:gro_tcp4_tbl_timeout_flush=discard
+fun:gro_vxlan_tcp4_reassemble=discard
+fun:gro_vxlan_tcp4_tbl_create=discard
+fun:gro_vxlan_tcp4_tbl_destroy=discard
+fun:gro_vxlan_tcp4_tbl_pkt_count=discard
+fun:gro_vxlan_tcp4_tbl_timeout_flush=discard
+fun:rte_gro_ctx_create=discard
+fun:rte_gro_ctx_destroy=discard
+fun:rte_gro_get_pkt_count=discard
+fun:rte_gro_reassemble=discard
+fun:rte_gro_reassemble_burst=discard
+fun:rte_gro_timeout_flush=discard
+fun:rte_hash_add_key=discard
+fun:rte_hash_add_key_data=discard
+fun:rte_hash_add_key_with_hash=discard
+fun:rte_hash_add_key_with_hash_data=discard
+fun:rte_hash_count=discard
+fun:rte_hash_create=discard
+fun:rte_hash_del_key=discard
+fun:rte_hash_del_key_with_hash=discard
+fun:rte_hash_find_existing=discard
+fun:rte_hash_free=discard
+fun:rte_hash_free_key_with_position=discard
+fun:rte_hash_get_key_with_position=discard
+fun:rte_hash_hash=discard
+fun:rte_hash_iterate=discard
+fun:rte_hash_lookup=discard
+fun:rte_hash_lookup_bulk=discard
+fun:rte_hash_lookup_bulk_data=discard
+fun:rte_hash_lookup_data=discard
+fun:rte_hash_lookup_with_hash=discard
+fun:rte_hash_lookup_with_hash_data=discard
+fun:rte_hash_reset=discard
+fun:rte_hash_set_cmp_func=discard
+fun:rte_fbk_hash_create=discard
+fun:rte_fbk_hash_find_existing=discard
+fun:rte_fbk_hash_free=discard
+fun:ip_frag_find=discard
+fun:ip_frag_lookup=discard
+fun:ip_frag_process=discard
+fun:rte_frag_table_del_expired_entries=discard
+fun:rte_ip_frag_free_death_row=discard
+fun:rte_ip_frag_table_create=discard
+fun:rte_ip_frag_table_destroy=discard
+fun:rte_ip_frag_table_statistics_dump=discard
+fun:rte_ipv4_fragment_packet=discard
+fun:ipv4_frag_reassemble=discard
+fun:rte_ipv4_frag_reassemble_packet=discard
+fun:rte_ipv6_fragment_packet=discard
+fun:ipv6_frag_reassemble=discard
+fun:rte_ipv6_frag_reassemble_packet=discard
+fun:rte_kvargs_count=discard
+fun:rte_kvargs_free=discard
+fun:rte_kvargs_parse=discard
+fun:rte_kvargs_parse_delim=discard
+fun:rte_kvargs_process=discard
+fun:rte_kvargs_strcmp=discard
+fun:__rte_pktmbuf_read=discard
+fun:rte_get_rx_ol_flag_list=discard
+fun:rte_get_rx_ol_flag_name=discard
+fun:rte_get_tx_ol_flag_list=discard
+fun:rte_get_tx_ol_flag_name=discard
+fun:rte_mbuf_sanity_check=discard
+fun:rte_pktmbuf_dump=discard
+fun:rte_pktmbuf_dynamic_pool_create=discard
+fun:rte_pktmbuf_init=discard
+fun:rte_pktmbuf_pool_create=discard
+fun:rte_pktmbuf_pool_create_by_ops=discard
+fun:rte_pktmbuf_pool_init=discard
+fun:rte_mbuf_best_mempool_ops=discard
+fun:rte_mbuf_platform_mempool_ops=discard
+fun:rte_mbuf_set_platform_mempool_ops=discard
+fun:rte_mbuf_set_user_mempool_ops=discard
+fun:rte_mbuf_user_mempool_ops=discard
+fun:rte_get_ptype_inner_l2_name=discard
+fun:rte_get_ptype_inner_l3_name=discard
+fun:rte_get_ptype_inner_l4_name=discard
+fun:rte_get_ptype_l2_name=discard
+fun:rte_get_ptype_l3_name=discard
+fun:rte_get_ptype_l4_name=discard
+fun:rte_get_ptype_name=discard
+fun:rte_get_ptype_tunnel_name=discard
+fun:rte_mempool_audit=discard
+fun:rte_mempool_avail_count=discard
+fun:rte_mempool_cache_create=discard
+fun:rte_mempool_cache_free=discard
+fun:rte_mempool_calc_obj_size=discard
+fun:rte_mempool_check_cookies=discard
+fun:rte_mempool_contig_blocks_check_cookies=discard
+fun:rte_mempool_create=discard
+fun:rte_mempool_create_empty=discard
+fun:rte_mempool_dump=discard
+fun:rte_mempool_free=discard
+fun:rte_mempool_in_use_count=discard
+fun:rte_mempool_list_dump=discard
+fun:rte_mempool_lookup=discard
+fun:rte_mempool_mem_iter=discard
+fun:rte_mempool_obj_iter=discard
+fun:rte_mempool_populate_anon=discard
+fun:rte_mempool_populate_default=discard
+fun:rte_mempool_populate_iova=discard
+fun:rte_mempool_populate_virt=discard
+fun:rte_mempool_walk=discard
+fun:rte_mempool_ops_alloc=discard
+fun:rte_mempool_ops_calc_mem_size=discard
+fun:rte_mempool_ops_free=discard
+fun:rte_mempool_ops_get_count=discard
+fun:rte_mempool_ops_get_info=discard
+fun:rte_mempool_ops_populate=discard
+fun:rte_mempool_register_ops=discard
+fun:rte_mempool_set_ops_byname=discard
+fun:rte_mempool_op_calc_mem_size_default=discard
+fun:rte_mempool_op_populate_default=discard
+fun:rte_metrics_get_names=discard
+fun:rte_metrics_get_values=discard
+fun:rte_metrics_init=discard
+fun:rte_metrics_reg_name=discard
+fun:rte_metrics_reg_names=discard
+fun:rte_metrics_update_value=discard
+fun:rte_metrics_update_values=discard
+fun:rte_net_make_rarp_packet=discard
+fun:rte_net_get_ptype=discard
+fun:rte_net_skip_ip6_ext=discard
+fun:rte_net_crc_calc=discard
+fun:rte_net_crc_set_alg=discard
+fun:eal_parse_pci_BDF=discard
+fun:eal_parse_pci_DomBDF=discard
+fun:pci_map_resource=discard
+fun:pci_unmap_resource=discard
+fun:rte_eal_compare_pci_addr=discard
+fun:rte_pci_addr_cmp=discard
+fun:rte_pci_addr_parse=discard
+fun:rte_pci_device_name=discard
+fun:rte_eth_from_ring=discard
+fun:rte_eth_from_rings=discard
+fun:sock_support_features=discard
+fun:vhost_kernel_open_sock=discard
+fun:vhost_kernel_set_sock=discard
+fun:tap_support_features=discard
+fun:vhost_kernel_open_tap=discard
+fun:eth_virtio_dev_init=discard
+fun:virtio_dev_pause=discard
+fun:virtio_dev_resume=discard
+fun:virtio_inject_pkts=discard
+fun:virtio_interrupt_handler=discard
+fun:vtpci_get_status=discard
+fun:vtpci_init=discard
+fun:vtpci_isr=discard
+fun:vtpci_msix_detect=discard
+fun:vtpci_negotiate_features=discard
+fun:vtpci_read_dev_config=discard
+fun:vtpci_reinit_complete=discard
+fun:vtpci_reset=discard
+fun:vtpci_set_status=discard
+fun:vtpci_write_dev_config=discard
+fun:virtio_dev_cq_start=discard
+fun:virtio_dev_rx_queue_done=discard
+fun:virtio_dev_rx_queue_setup=discard
+fun:virtio_dev_rx_queue_setup_finish=discard
+fun:virtio_dev_tx_queue_setup=discard
+fun:virtio_dev_tx_queue_setup_finish=discard
+fun:virtio_recv_mergeable_pkts=discard
+fun:virtio_recv_mergeable_pkts_inorder=discard
+fun:virtio_recv_pkts=discard
+fun:virtio_xmit_pkts=discard
+fun:virtio_xmit_pkts_inorder=discard
+fun:vq_ring_free_chain=discard
+fun:vq_ring_free_inorder=discard
+fun:virtio_rxq_vec_setup=discard
+fun:virtio_recv_pkts_vec=discard
+fun:is_vhost_user_by_type=discard
+fun:virtio_user_dev_init=discard
+fun:virtio_user_dev_uninit=discard
+fun:virtio_user_handle_cq=discard
+fun:virtio_user_handle_mq=discard
+fun:virtio_user_start_device=discard
+fun:virtio_user_stop_device=discard
+fun:virtqueue_detach_unused=discard
+fun:virtqueue_rxvq_flush=discard
+fun:rte_ring_create=discard
+fun:rte_ring_dump=discard
+fun:rte_ring_free=discard
+fun:rte_ring_get_memsize=discard
+fun:rte_ring_init=discard
+fun:rte_ring_list_dump=discard
+fun:rte_ring_lookup=discard
+fun:rte_timer_dump_stats=discard
+fun:rte_timer_init=discard
+fun:rte_timer_manage=discard
+fun:rte_timer_pending=discard
+fun:rte_timer_reset=discard
+fun:rte_timer_reset_sync=discard
+fun:rte_timer_stop=discard
+fun:rte_timer_stop_sync=discard
+fun:rte_timer_subsystem_init=discard
diff --git a/angora/run.sh b/angora/run.sh
new file mode 100644
index 0000000..be215f1
--- /dev/null
+++ b/angora/run.sh
@@ -0,0 +1 @@
+~/git/Angora/angora_fuzzer -M 2048 -i seeds -o output -t /root/git/uss/angora/tcp_lo.taint -- /root/git/uss/angora/tcp_lo.fast 127.0.0.1 1234 @@
diff --git a/angora/seeds/seed.txt b/angora/seeds/seed.txt
new file mode 100644
index 0000000..f534deb
--- /dev/null
+++ b/angora/seeds/seed.txt
@@ -0,0 +1 @@
+Hello World.
diff --git a/dpdk/Makefile b/dpdk/Makefile
index 15204fa..5d92719 100644
--- a/dpdk/Makefile
+++ b/dpdk/Makefile
@@ -21,10 +21,12 @@ DPDK_PKTMBUF_HEADROOM ?= 128
DPDK_MARCH ?= native
DPDK_TUNE ?= generic
DPDK_DEBUG ?= n
+DPDK_DESTDIR ?= $(CURDIR)/install
+PACKETDRILL ?= n
B := $(DPDK_BUILD_DIR)
I := $(DPDK_INSTALL_DIR)
-DPDK_GIT_REPO ?= http://dpdk.org/git/dpdk
+DPDK_GIT_REPO ?= http://dpdk.org/git/dpdk -b v18.11
DPDK_SOURCE := $(B)/dpdk
ifneq (,$(findstring clang,$(CC)))
@@ -40,8 +42,8 @@ endif
JOBS := $(shell grep processor /proc/cpuinfo | wc -l)
# compiler/linker custom arguments
-DPDK_CPU_CFLAGS := -pie -fPIC
-DPDK_CPU_LDFLAGS :=
+DPDK_CPU_CFLAGS := -fPIC
+DPDK_CPU_LDFLAGS := -r
DPDK_EXTRA_LDFLAGS := -g
ifeq ($(DPDK_DEBUG),n)
@@ -78,6 +80,7 @@ DPDK_MAKE_ARGS := -C $(DPDK_SOURCE) -j $(JOBS) \
EXTRA_LDFLAGS="$(DPDK_EXTRA_LDFLAGS)" \
CPU_CFLAGS="$(DPDK_CPU_CFLAGS)" \
CPU_LDFLAGS="$(DPDK_CPU_LDFLAGS)" \
+ DESTDIR="$(DPDK_DESTDIR)" \
$(DPDK_MAKE_EXTRA_ARGS)
DPDK_SOURCE_FILES := $(shell [ -e $(DPDK_SOURCE) ] && \
@@ -102,7 +105,7 @@ $(B)/custom-config: $(B)/.patch.ok Makefile
$(call set,RTE_MAX_LCORE,256)
$(call set,RTE_PKTMBUF_HEADROOM,$(DPDK_PKTMBUF_HEADROOM))
$(call set,RTE_LIBEAL_USE_HPET,y)
- $(call set,RTE_BUILD_COMBINE_LIBS,y)
+ $(call set,RTE_BUILD_COMBINE_LIBS,n)
$(call set,RTE_LIBRTE_I40E_16BYTE_RX_DESC,y)
$(call set,RTE_LIBRTE_I40E_ITR_INTERVAL,16)
$(call set,RTE_LIBRTE_PMD_PCAP,y)
@@ -115,13 +118,101 @@ $(B)/custom-config: $(B)/.patch.ok Makefile
$(call set,RTE_LIBRTE_PMD_BOND,y)
$(call set,RTE_LIBRTE_IP_FRAG,y)
@# not needed
+ $(call set,RTE_LIBRTE_TIMER,y)
$(call set,RTE_LIBRTE_CFGFILE,n)
+ $(call set,RTE_LIBRTE_LPM,y)
+ $(call set,RTE_LIBRTE_ACL,n)
$(call set,RTE_LIBRTE_POWER,n)
$(call set,RTE_LIBRTE_DISTRIBUTOR,n)
$(call set,RTE_LIBRTE_REORDER,n)
+ $(call set,RTE_LIBRTE_PORT,n)
+ $(call set,RTE_LIBRTE_TABLE,n)
+ $(call set,RTE_LIBRTE_PIPELINE,n)
$(call set,RTE_LIBRTE_FLOW_CLASSIFY,n)
$(call set,RTE_LIBRTE_PMD_CRYPTO_SCHEDULER,n)
$(call set,RTE_KNI_KMOD,n)
+ $(call set,RTE_LIBRTE_ENA_PMD,n)
+ $(call set,RTE_LIBRTE_FM10K_PMD,n)
+ $(call set,RTE_LIBRTE_CXGBE_PMD,n)
+ $(call set,RTE_LIBRTE_ENIC_PMD,n)
+ $(call set,RTE_LIBRTE_BNXT_PMD,n)
+ $(call set,RTE_LIBRTE_SFC_EFX_PMD,n)
+ $(call set,RTE_LIBRTE_PMD_SOFTNIC,n)
+ $(call set,RTE_LIBRTE_THUNDERX_NICVF_PMD,n)
+ $(call set,RTE_LIBRTE_LIO_PMD,n)
+ $(call set,RTE_LIBRTE_OCTEONTX_PMD,n)
+ $(call set,RTE_LIBRTE_VMXNET3_PMD,n)
+ $(call set,RTE_LIBRTE_QEDE_PMD,n)
+ $(call set,RTE_LIBRTE_ARK_PMD,n)
+ $(call set,RTE_LIBRTE_PMD_NULL,n)
+ $(call set,RTE_LIBRTE_CRYPTODEV,n)
+ $(call set,RTE_LIBRTE_PMD_NULL_CRYPTO,n)
+ $(call set,RTE_LIBRTE_SECURITY,n)
+ $(call set,RTE_LIBRTE_EVENTDEV,n)
+ $(call set,RTE_LIBRTE_PMD_SKELETON_EVENTDEV,n)
+ $(call set,RTE_LIBRTE_PMD_OCTEONTX_SSOVF,n)
+ $(call set,RTE_LIBRTE_OCTEONTX_MEMPOOL,n)
+ $(call set,RTE_LIBRTE_EFD,n)
+ $(call set,RTE_LIBRTE_MEMBER,n)
+ $(call set,RTE_LIBRTE_JOBSTATS,n)
+ $(call set,RTE_LIBRTE_METER,n)
+ $(call set,RTE_LIBRTE_SCHED,n)
+ $(call set,RTE_APP_TEST,n)
+ $(call set,RTE_APP_CRYPTO_PERF,n)
+ $(call set,RTE_APP_EVENTDEV,n)
+ $(call set,RTE_LIBRTE_PMD_FAILSAFE,n)
+ $(call set,RTE_LIBRTE_EM_PMD,n)
+ $(call set,RTE_LIBRTE_IGB_PMD,n)
+ $(call set,RTE_LIBRTE_LATENCY_STATS,n)
+ $(call set,RTE_EAL_IGB_UIO,n)
+ $(call set,RTE_LIBRTE_KNI,n)
+ $(call set,RTE_LIBRTE_PMD_KNI,n)
+ $(call set,RTE_KNI_KMOD,n)
+ $(call set,RTE_KNI_KMOD_ETHTOOL,n)
+ $(call set,RTE_LIBRTE_BITRATE,n)
+ $(call set,RTE_LIBRTE_METRICS,y)
+ $(call set,RTE_LIBRTE_AVP_PMD,n)
+ $(call set,RTE_LIBRTE_NFP_PMD,n)
+ $(call set,RTE_LIBRTE_PMD_TAP,n)
+ $(call set,RTE_LIBRTE_VHOST,$(PACKETDRILL))
+ $(call set,RTE_LIBRTE_IFC_PMD,n)
+ $(call set,RTE_LIBRTE_PMD_VHOST,n)
+ $(call set,RTE_PROC_INFO,n)
+ $(call set,RTE_TEST_PMD,n)
+ $(call set,RTE_LIBRTE_FSLMC_BUS,n)
+ $(call set,RTE_LIBRTE_DPAA_BUS,n)
+ $(call set,RTE_LIBRTE_VMBUS,n)
+ $(call set,RTE_LIBRTE_IFPGA_BUS,n)
+ $(call set,RTE_LIBRTE_BPF,n)
+ $(call set,RTE_LIBRTE_COMPRESSDEV,n)
+ $(call set,RTE_LIBRTE_VDEV_NETVSC_PMD,n)
+ $(call set,RTE_LIBRTE_NETVSC_PMD,n)
+ $(call set,RTE_LIBRTE_RAWDEV,n)
+ $(call set,RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT,n)
+ $(call set,RTE_LIBRTE_AXGBE_PMD,n)
+ $(call set,RTE_LIBRTE_AVF_PMD,n)
+ $(call set,RTE_LIBRTE_BBDEV,n)
+ $(call set,RTE_LIBRTE_IP_FRAG_MAX_FRAG,48)
+ $(call set,RTE_MAX_NUMA_NODES,2)
+ $(call set,RTE_MAX_ETHPORTS,4)
+ $(call set,RTE_MAX_QUEUES_PER_PORT,8)
+ $(call set,RTE_LIBRTE_I40E_PMD,n)
+ $(call set,RTE_LIBRTE_IXGBE_PMD,n)
+ $(call set,RTE_LIBRTE_ENETC_PMD,n)
+ $(call set,RTE_LIBRTE_PMD_BOND,n)
+ $(call set,RTE_LIBRTE_ATLANTIC_PMD,n)
+ $(call set,RTE_LIBRTE_GSO,n)
+ $(call set,RTE_MAX_VFIO_GROUPS,4)
+ $(call set,RTE_MAX_VFIO_CONTAINERS,4)
+ $(call set,RTE_LIBRTE_COMMON_DPAAX,n)
+ $(call set,RTE_LIBRTE_PMD_OCTEONTX_CRYPTO,n)
+ $(call set,RTE_EAL_NUMA_AWARE_HUGEPAGES,n)
+ $(call set,RTE_DRIVER_MEMPOOL_STACK,y)
+ $(call set,RTE_DRIVER_MEMPOOL_BUCKET,n)
+ $(call set,RTE_LIBRTE_PMD_QAT,n)
+ $(call set,RTE_LIBRTE_PMD_AF_PACKET,n)
+ $(call set,RTE_MAX_MEM_MB,1024)
+ $(call set,RTE_LIBRTE_PDUMP,n)
@rm -f .config.ok
$(B)/.download.ok:
@@ -165,4 +256,4 @@ build: $(B)/.build.ok
.PHONY: clean
clean:
- @rm -rf $(B) $(I)
+ @rm -rf $(DPDK_BUILD_DIR) $(DPDK_DESTDIR)
diff --git a/dpdk/dpdk-v18.11_patches/0001-eal-don-t-start-the-interrupt-mp-thread.patch b/dpdk/dpdk-v18.11_patches/0001-eal-don-t-start-the-interrupt-mp-thread.patch
new file mode 100644
index 0000000..770bf05
--- /dev/null
+++ b/dpdk/dpdk-v18.11_patches/0001-eal-don-t-start-the-interrupt-mp-thread.patch
@@ -0,0 +1,35 @@
+From f68558b0ccbddb4cc81aca36befa0a7730ee051c Mon Sep 17 00:00:00 2001
+From: Jianfeng Tan <henry.tjf@antfin.com>
+Date: Wed, 29 Aug 2018 14:24:01 +0000
+Subject: [PATCH 7/9] eal: don't start the interrupt mp thread
+
+---
+ lib/librte_eal/common/eal_common_proc.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
+index 9fcb91219..79d9e6bbe 100644
+--- a/lib/librte_eal/common/eal_common_proc.c
++++ b/lib/librte_eal/common/eal_common_proc.c
+@@ -615,6 +615,7 @@ rte_mp_channel_init(void)
+ return -1;
+ }
+
++#if 0
+ if (rte_ctrl_thread_create(&mp_handle_tid, "rte_mp_handle",
+ NULL, mp_handle, NULL) < 0) {
+ RTE_LOG(ERR, EAL, "failed to create mp thead: %s\n",
+@@ -624,6 +625,10 @@ rte_mp_channel_init(void)
+ mp_fd = -1;
+ return -1;
+ }
++#else
++ RTE_SET_USED(mp_handle);
++ RTE_SET_USED(mp_handle_tid);
++#endif
+
+ /* unlock the directory */
+ flock(dir_fd, LOCK_UN);
+--
+2.17.1
+
diff --git a/dpdk/dpdk-v18.11_patches/0002-eal-prioritize-constructor.patch b/dpdk/dpdk-v18.11_patches/0002-eal-prioritize-constructor.patch
new file mode 100644
index 0000000..9d2959f
--- /dev/null
+++ b/dpdk/dpdk-v18.11_patches/0002-eal-prioritize-constructor.patch
@@ -0,0 +1,25 @@
+From 7fe32567994a8ce782fa8406613bade1d2100dca Mon Sep 17 00:00:00 2001
+From: Jianfeng Tan <henry.tjf@antfin.com>
+Date: Wed, 29 Aug 2018 14:14:09 +0000
+Subject: [PATCH 2/9] eal: prioritize constructor
+
+---
+ lib/librte_eal/common/include/rte_common.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/lib/librte_eal/common/include/rte_common.h b/lib/librte_eal/common/include/rte_common.h
+index 069c13ec7..a635f5be4 100644
+--- a/lib/librte_eal/common/include/rte_common.h
++++ b/lib/librte_eal/common/include/rte_common.h
+@@ -84,7 +84,7 @@ typedef uint16_t unaligned_uint16_t;
+ #define RTE_PRIORITY_LOG 101
+ #define RTE_PRIORITY_BUS 110
+ #define RTE_PRIORITY_CLASS 120
+-#define RTE_PRIORITY_LAST 65535
++#define RTE_PRIORITY_LAST 130
+
+ #define RTE_PRIO(prio) \
+ RTE_PRIORITY_ ## prio
+--
+2.17.1
+
diff --git a/dpdk/dpdk-v18.11_patches/0003-mbuf-add-single-linked-list.patch b/dpdk/dpdk-v18.11_patches/0003-mbuf-add-single-linked-list.patch
new file mode 100644
index 0000000..7430d1e
--- /dev/null
+++ b/dpdk/dpdk-v18.11_patches/0003-mbuf-add-single-linked-list.patch
@@ -0,0 +1,33 @@
+From 1416ff5de58922dc32eb2fb9ce2b9b970282136c Mon Sep 17 00:00:00 2001
+From: Jianfeng Tan <henry.tjf@antfin.com>
+Date: Wed, 29 Aug 2018 14:18:13 +0000
+Subject: [PATCH 3/9] mbuf: add single linked list
+
+---
+ lib/librte_mbuf/rte_mbuf.h | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
+index 9ce5d76d7..0081bd6d7 100644
+--- a/lib/librte_mbuf/rte_mbuf.h
++++ b/lib/librte_mbuf/rte_mbuf.h
+@@ -593,6 +593,8 @@ struct rte_mbuf {
+ */
+ struct rte_mbuf_ext_shared_info *shinfo;
+
++ struct rte_mbuf *next_pkt;
++
+ } __rte_cache_aligned;
+
+ /**
+@@ -1237,6 +1239,7 @@ static inline void rte_pktmbuf_reset_headroom(struct rte_mbuf *m)
+ static inline void rte_pktmbuf_reset(struct rte_mbuf *m)
+ {
+ m->next = NULL;
++ m->next_pkt = NULL;
+ m->pkt_len = 0;
+ m->tx_offload = 0;
+ m->vlan_tci = 0;
+--
+2.17.1
+
diff --git a/dpdk/dpdk-v18.11_patches/0004-net-virtio-user-add-rss-update-for-virtio-user.patch b/dpdk/dpdk-v18.11_patches/0004-net-virtio-user-add-rss-update-for-virtio-user.patch
new file mode 100644
index 0000000..e4eb8e7
--- /dev/null
+++ b/dpdk/dpdk-v18.11_patches/0004-net-virtio-user-add-rss-update-for-virtio-user.patch
@@ -0,0 +1,43 @@
+From 9bbe20eda858fd7fcbd8f137e5f96f51d571a556 Mon Sep 17 00:00:00 2001
+From: Jianfeng Tan <henry.tjf@antfin.com>
+Date: Wed, 29 Aug 2018 14:20:51 +0000
+Subject: [PATCH 4/9] net/virtio-user: add rss update for virtio-user
+
+---
+ drivers/net/virtio/virtio_ethdev.c | 13 +++++++++++++
+ 1 file changed, 13 insertions(+)
+
+diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c
+index 614357da7..e7336cde9 100644
+--- a/drivers/net/virtio/virtio_ethdev.c
++++ b/drivers/net/virtio/virtio_ethdev.c
+@@ -738,6 +738,18 @@ virtio_dev_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id)
+ return 0;
+ }
+
++static int
++virtio_rss_hash_update(struct rte_eth_dev *dev,
++ struct rte_eth_rss_conf *rss_conf __rte_unused)
++{
++ struct virtio_hw *hw = dev->data->dev_private;
++
++ if (hw->virtio_user_dev)
++ return 0;
++
++ return -1;
++}
++
+ /*
+ * dev_ops for virtio, bare necessities for basic operation
+ */
+@@ -772,6 +784,7 @@ static const struct eth_dev_ops virtio_eth_dev_ops = {
+ .mac_addr_add = virtio_mac_addr_add,
+ .mac_addr_remove = virtio_mac_addr_remove,
+ .mac_addr_set = virtio_mac_addr_set,
++ .rss_hash_update = virtio_rss_hash_update,
+ };
+
+ static void
+--
+2.17.1
+
diff --git a/dpdk/dpdk-v18.11_patches/0005-net-virtio-user-support-raw-socket-as-backend.patch b/dpdk/dpdk-v18.11_patches/0005-net-virtio-user-support-raw-socket-as-backend.patch
new file mode 100644
index 0000000..1d950c5
--- /dev/null
+++ b/dpdk/dpdk-v18.11_patches/0005-net-virtio-user-support-raw-socket-as-backend.patch
@@ -0,0 +1,645 @@
+From 307f7debe0f2143e70659b7a082537077b20d185 Mon Sep 17 00:00:00 2001
+From: Jianfeng Tan <henry.tjf@antfin.com>
+Date: Thu, 19 Jul 2018 11:25:22 +0000
+Subject: [PATCH] net/virtio-user: support raw socket as backend
+
+We will support tapfd or raw socket fd opened by application and
+passed into virtio-user for initialization.
+
+Note if there are multiple queue pairs, users are still supposed
+to pass down the iface name with the first queue pair fd passed
+through this parameter.
+
+Signed-off-by: Jianfeng Tan <henry.tjf@antfin.com>
+---
+ drivers/net/virtio/Makefile | 1 +
+ drivers/net/virtio/virtio_user/vhost_kernel.c | 78 ++++++---
+ drivers/net/virtio/virtio_user/vhost_kernel.h | 15 ++
+ .../virtio/virtio_user/vhost_kernel_sock.c | 156 ++++++++++++++++++
+ .../net/virtio/virtio_user/vhost_kernel_tap.c | 64 ++++++-
+ .../net/virtio/virtio_user/vhost_kernel_tap.h | 39 -----
+ .../net/virtio/virtio_user/virtio_user_dev.c | 16 +-
+ .../net/virtio/virtio_user/virtio_user_dev.h | 3 +-
+ drivers/net/virtio/virtio_user_ethdev.c | 20 ++-
+ 9 files changed, 318 insertions(+), 74 deletions(-)
+ create mode 100644 drivers/net/virtio/virtio_user/vhost_kernel.h
+ create mode 100644 drivers/net/virtio/virtio_user/vhost_kernel_sock.c
+ delete mode 100644 drivers/net/virtio/virtio_user/vhost_kernel_tap.h
+
+diff --git a/drivers/net/virtio/Makefile b/drivers/net/virtio/Makefile
+index 6c2c9967b..2e1fc9b5e 100644
+--- a/drivers/net/virtio/Makefile
++++ b/drivers/net/virtio/Makefile
+@@ -41,6 +41,7 @@ ifeq ($(CONFIG_RTE_VIRTIO_USER),y)
+ SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_user.c
+ SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_kernel.c
+ SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_kernel_tap.c
++SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_kernel_sock.c
+ SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/virtio_user_dev.c
+ SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user_ethdev.c
+ endif
+diff --git a/drivers/net/virtio/virtio_user/vhost_kernel.c b/drivers/net/virtio/virtio_user/vhost_kernel.c
+index 6b19180d7..fa84287f5 100644
+--- a/drivers/net/virtio/virtio_user/vhost_kernel.c
++++ b/drivers/net/virtio/virtio_user/vhost_kernel.c
+@@ -6,13 +6,14 @@
+ #include <sys/stat.h>
+ #include <fcntl.h>
+ #include <unistd.h>
++#include <sys/ioctl.h>
+
+ #include <rte_memory.h>
+ #include <rte_eal_memconfig.h>
+
+ #include "vhost.h"
+ #include "virtio_user_dev.h"
+-#include "vhost_kernel_tap.h"
++#include "vhost_kernel.h"
+
+ struct vhost_memory_kernel {
+ uint32_t nregions;
+@@ -152,27 +153,25 @@ prepare_vhost_memory_kernel(void)
+ (1ULL << VIRTIO_NET_F_HOST_TSO6) | \
+ (1ULL << VIRTIO_NET_F_CSUM))
+
+-static unsigned int
+-tap_support_features(void)
++#define PATH_SYS_CLASS_NET "/sys/class/net"
++
++static int
++vhost_kernel_is_tap(struct virtio_user_dev *dev)
+ {
+- int tapfd;
+- unsigned int tap_features;
++ char path[128];
+
+- tapfd = open(PATH_NET_TUN, O_RDWR);
+- if (tapfd < 0) {
+- PMD_DRV_LOG(ERR, "fail to open %s: %s",
+- PATH_NET_TUN, strerror(errno));
+- return -1;
+- }
++ if (dev->ifname == NULL)
++ return 0;
+
+- if (ioctl(tapfd, TUNGETFEATURES, &tap_features) == -1) {
+- PMD_DRV_LOG(ERR, "TUNGETFEATURES failed: %s", strerror(errno));
+- close(tapfd);
+- return -1;
+- }
++ snprintf(path, 128, PATH_SYS_CLASS_NET"/%s", dev->ifname);
++ if(access(path, F_OK) == -1)
++ return 1;
+
+- close(tapfd);
+- return tap_features;
++ snprintf(path, 128, PATH_SYS_CLASS_NET"/%s/tun_flags", dev->ifname);
++ if(access(path, F_OK) != -1)
++ return 1;
++
++ return 0;
+ }
+
+ static int
+@@ -186,7 +185,6 @@ vhost_kernel_ioctl(struct virtio_user_dev *dev,
+ struct vhost_memory_kernel *vm = NULL;
+ int vhostfd;
+ unsigned int queue_sel;
+- unsigned int features;
+
+ PMD_DRV_LOG(INFO, "%s", vhost_msg_strings[req]);
+
+@@ -240,21 +238,36 @@ vhost_kernel_ioctl(struct virtio_user_dev *dev,
+ }
+
+ if (!ret && req_kernel == VHOST_GET_FEATURES) {
+- features = tap_support_features();
+- /* with tap as the backend, all these features are supported
++ int vnet_hdr, mq;
++
++ if (vhost_kernel_is_tap(dev))
++ tap_support_features(&vnet_hdr, &mq);
++ else
++ sock_support_features(dev->be_fd, &vnet_hdr, &mq);
++
++ /* with kernel vhost, all these features are supported
+ * but not claimed by vhost-net, so we add them back when
+ * reporting to upper layer.
+ */
+- if (features & IFF_VNET_HDR) {
++ if (vnet_hdr) {
+ *((uint64_t *)arg) |= VHOST_KERNEL_GUEST_OFFLOADS_MASK;
+ *((uint64_t *)arg) |= VHOST_KERNEL_HOST_OFFLOADS_MASK;
+ }
+
+- /* vhost_kernel will not declare this feature, but it does
++ /* kernel vhost will not declare this feature, but it does
+ * support multi-queue.
+ */
+- if (features & IFF_MULTI_QUEUE)
++ if (mq)
+ *(uint64_t *)arg |= (1ull << VIRTIO_NET_F_MQ);
++
++ /* raw socket only supports vnet header size of 10, so we must
++ * eliminate below features.
++ */
++ if (!vhost_kernel_is_tap(dev) &&
++ vnet_hdr == sizeof(struct virtio_net_hdr)) {
++ *((uint64_t *)arg) &= ~(1ull << VIRTIO_NET_F_MRG_RXBUF);
++ *((uint64_t *)arg) &= ~(1ull << VIRTIO_F_VERSION_1);
++ }
+ }
+
+ if (vm)
+@@ -333,7 +346,8 @@ vhost_kernel_enable_queue_pair(struct virtio_user_dev *dev,
+
+ if (!enable) {
+ if (dev->tapfds[pair_idx] >= 0) {
+- close(dev->tapfds[pair_idx]);
++ if (dev->be_fd < 0)
++ close(dev->tapfds[pair_idx]);
+ dev->tapfds[pair_idx] = -1;
+ }
+ return vhost_kernel_set_backend(vhostfd, -1);
+@@ -347,8 +361,18 @@ vhost_kernel_enable_queue_pair(struct virtio_user_dev *dev,
+ else
+ hdr_size = sizeof(struct virtio_net_hdr);
+
+- tapfd = vhost_kernel_open_tap(&dev->ifname, hdr_size, req_mq,
+- (char *)dev->mac_addr, dev->features);
++ if (vhost_kernel_is_tap(dev)) {
++ tapfd = vhost_kernel_open_tap(&dev->ifname, hdr_size,
++ req_mq, (char *)dev->mac_addr, dev->features);
++ } else {
++ if (pair_idx == 0 && dev->be_fd >= 0)
++ tapfd = vhost_kernel_set_sock(dev->be_fd,
++ hdr_size, req_mq);
++ else
++ tapfd = vhost_kernel_open_sock(dev->ifname,
++ hdr_size, dev->mac_addr, req_mq);
++ }
++
+ if (tapfd < 0) {
+ PMD_DRV_LOG(ERR, "fail to open tap for vhost kernel");
+ return -1;
+diff --git a/drivers/net/virtio/virtio_user/vhost_kernel.h b/drivers/net/virtio/virtio_user/vhost_kernel.h
+new file mode 100644
+index 000000000..75d6c5bf6
+--- /dev/null
++++ b/drivers/net/virtio/virtio_user/vhost_kernel.h
+@@ -0,0 +1,15 @@
++/* SPDX-License-Identifier: BSD-3-Clause
++ * Copyright(c) 2016 Intel Corporation
++ */
++
++int vhost_kernel_open_tap(char **p_ifname, int hdr_size, int req_mq,
++ const char *mac, uint64_t features);
++
++void tap_support_features(int *vnet_hdr, int *mq);
++
++int vhost_kernel_open_sock(char *ifname, int hdr_size,
++ uint8_t *mac, int req_mq);
++
++int vhost_kernel_set_sock(int sockfd, int hdr_size, int req_mq);
++
++void sock_support_features(int fd, int *vnet_hdr, int *mq);
+diff --git a/drivers/net/virtio/virtio_user/vhost_kernel_sock.c b/drivers/net/virtio/virtio_user/vhost_kernel_sock.c
+new file mode 100644
+index 000000000..7c2ace294
+--- /dev/null
++++ b/drivers/net/virtio/virtio_user/vhost_kernel_sock.c
+@@ -0,0 +1,156 @@
++/* SPDX-License-Identifier: BSD-3-Clause
++ * Copyright(c) 2018 Alibaba Group
++ * Copyright(c) 2018 Ant Financial Services Group
++ */
++
++#include <unistd.h>
++#include <sys/types.h>
++#include <sys/stat.h>
++#include <fcntl.h>
++#include <net/if.h>
++#include <net/if_arp.h>
++#include <errno.h>
++#include <string.h>
++#include <limits.h>
++#include <sys/socket.h>
++#include <arpa/inet.h>
++#include <netpacket/packet.h>
++#include <linux/if_ether.h>
++#include <sys/ioctl.h>
++
++#include <rte_ether.h>
++
++#include "../virtqueue.h"
++#include "../virtio_logs.h"
++#include "vhost_kernel.h"
++
++#ifndef PACKET_VNET_HDR
++#define PACKET_VNET_HDR 15
++#endif
++
++#ifndef PACKET_FANOUT
++#define PACKET_FANOUT 18
++#endif
++
++#ifndef PACKET_VNET_HDR_SZ
++#define PACKET_VNET_HDR_SZ 128
++#endif
++
++void
++sock_support_features(int fd, int *vnet_hdr, int *mq)
++{
++ int hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
++ int local_fd = 0;
++
++ if (fd < 0) {
++ fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
++ if (fd < 0) {
++ *mq = 0;
++ *vnet_hdr = 0;
++ return;
++ }
++ local_fd = 1;
++ }
++
++ *mq = 1;
++
++ if (setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR_SZ,
++ (void *)&hdr_size, sizeof(hdr_size))) {
++ *vnet_hdr = sizeof(struct virtio_net_hdr);
++ } else
++ *vnet_hdr = hdr_size;
++
++ if (local_fd)
++ close(fd);
++}
++
++int
++vhost_kernel_set_sock(int sockfd, int hdr_size, int req_mq)
++{
++ int ret;
++ int fanout_type = 0; /* PACKET_FANOUT_HASH */
++
++ if (hdr_size == sizeof(struct virtio_net_hdr))
++ ret = setsockopt(sockfd, SOL_PACKET, PACKET_VNET_HDR,
++ (void *)&hdr_size, sizeof(hdr_size));
++ else
++ ret = setsockopt(sockfd, SOL_PACKET, PACKET_VNET_HDR_SZ,
++ (void *)&hdr_size, sizeof(hdr_size));
++ if (ret) {
++ PMD_DRV_LOG(ERR, "failed to set vnet hdr (%d): %s",
++ hdr_size, strerror(errno));
++ close(sockfd);
++ return -1;
++ }
++
++ if (fcntl(sockfd, F_SETFL, fcntl(sockfd, F_GETFL) | O_NONBLOCK))
++ {
++ PMD_DRV_LOG(ERR, "fcntl O_NONBLOCK failed! %s",
++ strerror(errno));
++ close(sockfd);
++ return -1;
++ }
++
++ if (req_mq) {
++ if (setsockopt(sockfd, SOL_PACKET, PACKET_FANOUT,
++ (void *)&fanout_type, sizeof(fanout_type))) {
++ PMD_DRV_LOG(ERR, "PACKET_FANOUT failed! %s",
++ strerror(errno));
++ close(sockfd);
++ return -1;
++ }
++ }
++
++ return sockfd;
++}
++
++int
++vhost_kernel_open_sock(char *ifname, int hdr_size,
++ uint8_t *mac, int req_mq)
++{
++ int sockfd;
++ struct ifreq ifr;
++ struct sockaddr_ll addr_ll;
++
++ sockfd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
++ if (sockfd < 0) {
++ PMD_DRV_LOG(ERR, "socket failed: %s", strerror(errno));
++ return -1;
++ }
++
++ memset(&ifr, 0, sizeof(ifr));
++ strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
++
++ if (ioctl(sockfd, SIOCGIFINDEX, (void*)&ifr)) {
++ PMD_DRV_LOG(ERR, "SIOCGIFINDEX failed: %s", strerror(errno));
++ close(sockfd);
++ return -1;
++ }
++
++ memset(&addr_ll, 0, sizeof(addr_ll));
++ addr_ll.sll_ifindex = ifr.ifr_ifindex;
++ addr_ll.sll_family = AF_PACKET;
++ addr_ll.sll_protocol = htons(ETH_P_ALL);
++ addr_ll.sll_hatype = 0;
++ //addr_ll.sll_pkttype = PACKET_HOST;
++ //addr_ll.sll_halen = ETH_ALEN;
++ if (bind(sockfd, (struct sockaddr*)&addr_ll, sizeof(addr_ll))) {
++ PMD_DRV_LOG(ERR, "bind failed: %s", strerror(errno));
++ close(sockfd);
++ return -1;
++ }
++
++ ifr.ifr_flags |= IFF_PROMISC | IFF_UP;
++
++ if (ioctl(sockfd, SIOCSIFFLAGS, (char*)&ifr)) {
++ PMD_DRV_LOG(ERR, "SIOCSIFFLAGS failed: %s", strerror(errno));
++ close(sockfd);
++ return -1;
++ }
++
++ ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
++ if (ioctl(sockfd, SIOCGIFHWADDR, &ifr) == 0)
++ memcpy(mac, ifr.ifr_hwaddr.sa_data, ETHER_ADDR_LEN);
++
++ return vhost_kernel_set_sock(sockfd, hdr_size, req_mq);
++}
+diff --git a/drivers/net/virtio/virtio_user/vhost_kernel_tap.c b/drivers/net/virtio/virtio_user/vhost_kernel_tap.c
+index a3faf1d0c..85dd24dd6 100644
+--- a/drivers/net/virtio/virtio_user/vhost_kernel_tap.c
++++ b/drivers/net/virtio/virtio_user/vhost_kernel_tap.c
+@@ -11,13 +11,75 @@
+ #include <errno.h>
+ #include <string.h>
+ #include <limits.h>
++#include <sys/ioctl.h>
+
+ #include <rte_ether.h>
+
+-#include "vhost_kernel_tap.h"
++#include "vhost_kernel.h"
+ #include "../virtio_logs.h"
+ #include "../virtio_pci.h"
+
++/* TUN ioctls */
++#define TUNSETIFF _IOW('T', 202, int)
++#define TUNGETFEATURES _IOR('T', 207, unsigned int)
++#define TUNSETOFFLOAD _IOW('T', 208, unsigned int)
++#define TUNGETIFF _IOR('T', 210, unsigned int)
++#define TUNSETSNDBUF _IOW('T', 212, int)
++#define TUNGETVNETHDRSZ _IOR('T', 215, int)
++#define TUNSETVNETHDRSZ _IOW('T', 216, int)
++#define TUNSETQUEUE _IOW('T', 217, int)
++#define TUNSETVNETLE _IOW('T', 220, int)
++#define TUNSETVNETBE _IOW('T', 222, int)
++
++/* TUNSETIFF ifr flags */
++#define IFF_TAP 0x0002
++#define IFF_NO_PI 0x1000
++#define IFF_ONE_QUEUE 0x2000
++#define IFF_VNET_HDR 0x4000
++#define IFF_MULTI_QUEUE 0x0100
++#define IFF_ATTACH_QUEUE 0x0200
++#define IFF_DETACH_QUEUE 0x0400
++
++/* Features for GSO (TUNSETOFFLOAD). */
++#define TUN_F_CSUM 0x01 /* You can hand me unchecksummed packets. */
++#define TUN_F_TSO4 0x02 /* I can handle TSO for IPv4 packets */
++#define TUN_F_TSO6 0x04 /* I can handle TSO for IPv6 packets */
++#define TUN_F_TSO_ECN 0x08 /* I can handle TSO with ECN bits. */
++#define TUN_F_UFO 0x10 /* I can handle UFO packets */
++
++/* Constants */
++#define PATH_NET_TUN "/dev/net/tun"
++
++void
++tap_support_features(int *vnet_hdr, int *mq)
++{
++ int tapfd;
++ unsigned int tap_features;
++
++ *vnet_hdr = 0;
++ *mq = 0;
++
++ tapfd = open(PATH_NET_TUN, O_RDWR);
++ if (tapfd < 0) {
++ PMD_DRV_LOG(ERR, "fail to open %s: %s",
++ PATH_NET_TUN, strerror(errno));
++ return;
++ }
++
++ if (ioctl(tapfd, TUNGETFEATURES, &tap_features) == -1) {
++ PMD_DRV_LOG(ERR, "TUNGETFEATURES failed: %s", strerror(errno));
++ close(tapfd);
++ return;
++ }
++
++ close(tapfd);
++
++ if (tap_features & IFF_VNET_HDR)
++ *vnet_hdr = 1;
++ if (tap_features & IFF_MULTI_QUEUE)
++ *mq = 1;
++}
++
+ static int
+ vhost_kernel_tap_set_offload(int fd, uint64_t features)
+ {
+diff --git a/drivers/net/virtio/virtio_user/vhost_kernel_tap.h b/drivers/net/virtio/virtio_user/vhost_kernel_tap.h
+deleted file mode 100644
+index e0e95b4f5..000000000
+--- a/drivers/net/virtio/virtio_user/vhost_kernel_tap.h
++++ /dev/null
+@@ -1,39 +0,0 @@
+-/* SPDX-License-Identifier: BSD-3-Clause
+- * Copyright(c) 2016 Intel Corporation
+- */
+-
+-#include <sys/ioctl.h>
+-
+-/* TUN ioctls */
+-#define TUNSETIFF _IOW('T', 202, int)
+-#define TUNGETFEATURES _IOR('T', 207, unsigned int)
+-#define TUNSETOFFLOAD _IOW('T', 208, unsigned int)
+-#define TUNGETIFF _IOR('T', 210, unsigned int)
+-#define TUNSETSNDBUF _IOW('T', 212, int)
+-#define TUNGETVNETHDRSZ _IOR('T', 215, int)
+-#define TUNSETVNETHDRSZ _IOW('T', 216, int)
+-#define TUNSETQUEUE _IOW('T', 217, int)
+-#define TUNSETVNETLE _IOW('T', 220, int)
+-#define TUNSETVNETBE _IOW('T', 222, int)
+-
+-/* TUNSETIFF ifr flags */
+-#define IFF_TAP 0x0002
+-#define IFF_NO_PI 0x1000
+-#define IFF_ONE_QUEUE 0x2000
+-#define IFF_VNET_HDR 0x4000
+-#define IFF_MULTI_QUEUE 0x0100
+-#define IFF_ATTACH_QUEUE 0x0200
+-#define IFF_DETACH_QUEUE 0x0400
+-
+-/* Features for GSO (TUNSETOFFLOAD). */
+-#define TUN_F_CSUM 0x01 /* You can hand me unchecksummed packets. */
+-#define TUN_F_TSO4 0x02 /* I can handle TSO for IPv4 packets */
+-#define TUN_F_TSO6 0x04 /* I can handle TSO for IPv6 packets */
+-#define TUN_F_TSO_ECN 0x08 /* I can handle TSO with ECN bits. */
+-#define TUN_F_UFO 0x10 /* I can handle UFO packets */
+-
+-/* Constants */
+-#define PATH_NET_TUN "/dev/net/tun"
+-
+-int vhost_kernel_open_tap(char **p_ifname, int hdr_size, int req_mq,
+- const char *mac, uint64_t features);
+diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.c b/drivers/net/virtio/virtio_user/virtio_user_dev.c
+index 20816c936..7e655a0d5 100644
+--- a/drivers/net/virtio/virtio_user/virtio_user_dev.c
++++ b/drivers/net/virtio/virtio_user/virtio_user_dev.c
+@@ -294,7 +294,7 @@ virtio_user_fill_intr_handle(struct virtio_user_dev *dev)
+ eth_dev->intr_handle->max_intr = dev->max_queue_pairs + 1;
+ eth_dev->intr_handle->type = RTE_INTR_HANDLE_VDEV;
+ /* For virtio vdev, no need to read counter for clean */
+- eth_dev->intr_handle->efd_counter_size = 0;
++ eth_dev->intr_handle->efd_counter_size = 8;
+ eth_dev->intr_handle->fd = -1;
+ if (dev->vhostfd >= 0)
+ eth_dev->intr_handle->fd = dev->vhostfd;
+@@ -312,7 +312,9 @@ virtio_user_mem_event_cb(enum rte_mem_event type __rte_unused,
+ {
+ struct virtio_user_dev *dev = arg;
+ struct rte_memseg_list *msl;
++#if 0
+ uint16_t i;
++#endif
+
+ /* ignore externally allocated memory */
+ msl = rte_mem_virt2memseg_list(addr);
+@@ -325,15 +327,19 @@ virtio_user_mem_event_cb(enum rte_mem_event type __rte_unused,
+ goto exit;
+
+ /* Step 1: pause the active queues */
++#if 0
+ for (i = 0; i < dev->queue_pairs; i++)
+ dev->ops->enable_qp(dev, i, 0);
++#endif
+
+ /* Step 2: update memory regions */
+ dev->ops->send_request(dev, VHOST_USER_SET_MEM_TABLE, NULL);
+
+ /* Step 3: resume the active queues */
++#if 0
+ for (i = 0; i < dev->queue_pairs; i++)
+ dev->ops->enable_qp(dev, i, 1);
++#endif
+
+ exit:
+ pthread_mutex_unlock(&dev->mutex);
+@@ -412,7 +418,7 @@ virtio_user_dev_setup(struct virtio_user_dev *dev)
+ int
+ virtio_user_dev_init(struct virtio_user_dev *dev, char *path, int queues,
+ int cq, int queue_size, const char *mac, char **ifname,
+- int mrg_rxbuf, int in_order)
++ int mrg_rxbuf, int in_order, int fd)
+ {
+ pthread_mutex_init(&dev->mutex, NULL);
+ snprintf(dev->path, PATH_MAX, "%s", path);
+@@ -435,6 +441,12 @@ virtio_user_dev_init(struct virtio_user_dev *dev, char *path, int queues,
+ return -1;
+ }
+
++ if (fd >= 0) {
++ dev->be_fd = fd;
++ } else {
++ dev->be_fd = -1;
++ }
++
+ if (!dev->is_server) {
+ if (dev->ops->send_request(dev, VHOST_USER_SET_OWNER,
+ NULL) < 0) {
+diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.h b/drivers/net/virtio/virtio_user/virtio_user_dev.h
+index c42ce5d4b..575c21e3b 100644
+--- a/drivers/net/virtio/virtio_user/virtio_user_dev.h
++++ b/drivers/net/virtio/virtio_user/virtio_user_dev.h
+@@ -21,6 +21,7 @@ struct virtio_user_dev {
+ char *ifname;
+ int *vhostfds;
+ int *tapfds;
++ int be_fd;
+
+ /* for both vhost_user and vhost_kernel */
+ int callfds[VIRTIO_MAX_VIRTQUEUES];
+@@ -50,7 +51,7 @@ int virtio_user_start_device(struct virtio_user_dev *dev);
+ int virtio_user_stop_device(struct virtio_user_dev *dev);
+ int virtio_user_dev_init(struct virtio_user_dev *dev, char *path, int queues,
+ int cq, int queue_size, const char *mac, char **ifname,
+- int mrg_rxbuf, int in_order);
++ int mrg_rxbuf, int in_order, int fd);
+ void virtio_user_dev_uninit(struct virtio_user_dev *dev);
+ void virtio_user_handle_cq(struct virtio_user_dev *dev, uint16_t queue_idx);
+ uint8_t virtio_user_handle_mq(struct virtio_user_dev *dev, uint16_t q_pairs);
+diff --git a/drivers/net/virtio/virtio_user_ethdev.c b/drivers/net/virtio/virtio_user_ethdev.c
+index f8791391a..d5e87b24c 100644
+--- a/drivers/net/virtio/virtio_user_ethdev.c
++++ b/drivers/net/virtio/virtio_user_ethdev.c
+@@ -221,8 +221,7 @@ virtio_user_get_features(struct virtio_hw *hw)
+ {
+ struct virtio_user_dev *dev = virtio_user_get_dev(hw);
+
+- /* unmask feature bits defined in vhost user protocol */
+- return dev->device_features & VIRTIO_PMD_SUPPORTED_GUEST_FEATURES;
++ return dev->device_features;
+ }
+
+ static void
+@@ -361,6 +360,8 @@ static const char *valid_args[] = {
+ VIRTIO_USER_ARG_MRG_RXBUF,
+ #define VIRTIO_USER_ARG_IN_ORDER "in_order"
+ VIRTIO_USER_ARG_IN_ORDER,
++#define VIRTIO_USER_ARG_FD "fd"
++ VIRTIO_USER_ARG_FD,
+ NULL
+ };
+
+@@ -464,6 +465,7 @@ virtio_user_pmd_probe(struct rte_vdev_device *dev)
+ uint64_t server_mode = VIRTIO_USER_DEF_SERVER_MODE;
+ uint64_t mrg_rxbuf = 1;
+ uint64_t in_order = 1;
++ uint64_t fd = -1;
+ char *path = NULL;
+ char *ifname = NULL;
+ char *mac_addr = NULL;
+@@ -581,6 +583,15 @@ virtio_user_pmd_probe(struct rte_vdev_device *dev)
+ }
+ }
+
++ if (rte_kvargs_count(kvlist, VIRTIO_USER_ARG_FD) == 1) {
++ if (rte_kvargs_process(kvlist, VIRTIO_USER_ARG_FD,
++ &get_integer_arg, &fd) < 0) {
++ PMD_INIT_LOG(ERR, "error to parse %s",
++ VIRTIO_USER_ARG_FD);
++ goto end;
++ }
++ }
++
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ struct virtio_user_dev *vu_dev;
+
+@@ -598,7 +609,7 @@ virtio_user_pmd_probe(struct rte_vdev_device *dev)
+ vu_dev->is_server = false;
+ if (virtio_user_dev_init(hw->virtio_user_dev, path, queues, cq,
+ queue_size, mac_addr, &ifname, mrg_rxbuf,
+- in_order) < 0) {
++ in_order, fd) < 0) {
+ PMD_INIT_LOG(ERR, "virtio_user_dev_init fails");
+ virtio_user_eth_dev_free(eth_dev);
+ goto end;
+@@ -677,4 +688,5 @@ RTE_PMD_REGISTER_PARAM_STRING(net_virtio_user,
+ "iface=<string> "
+ "server=<0|1> "
+ "mrg_rxbuf=<0|1> "
+- "in_order=<0|1>");
++ "in_order=<0|1>"
++ "fd=<int>");
+--
+2.17.1
+
diff --git a/dpdk/dpdk-v18.11_patches/0006-mempool-add-dynamic-mempool-support.patch b/dpdk/dpdk-v18.11_patches/0006-mempool-add-dynamic-mempool-support.patch
new file mode 100644
index 0000000..bcc9743
--- /dev/null
+++ b/dpdk/dpdk-v18.11_patches/0006-mempool-add-dynamic-mempool-support.patch
@@ -0,0 +1,247 @@
+From 9d2ddfe6012b37297bc84f6ddcce810232162e5b Mon Sep 17 00:00:00 2001
+From: Jianfeng Tan <henry.tjf@antfin.com>
+Date: Wed, 26 Dec 2018 14:39:24 +0000
+Subject: [PATCH 1/2] mempool: add dynamic mempool support
+
+Signed-off-by: Jianfeng Tan <henry.tjf@antfin.com>
+---
+ drivers/mempool/ring/rte_mempool_ring.c | 26 +++++++----
+ lib/librte_mempool/rte_mempool.c | 27 +++++++++--
+ lib/librte_mempool/rte_mempool.h | 62 ++++++++++++++++++++-----
+ 3 files changed, 92 insertions(+), 23 deletions(-)
+
+diff --git a/drivers/mempool/ring/rte_mempool_ring.c b/drivers/mempool/ring/rte_mempool_ring.c
+index bc123fc52..e8fec9119 100644
+--- a/drivers/mempool/ring/rte_mempool_ring.c
++++ b/drivers/mempool/ring/rte_mempool_ring.c
+@@ -49,30 +49,40 @@ common_ring_get_count(const struct rte_mempool *mp)
+ static int
+ common_ring_alloc(struct rte_mempool *mp)
+ {
++ int n;
+ int rg_flags = 0, ret;
+ char rg_name[RTE_RING_NAMESIZE];
+ struct rte_ring *r;
+
+- ret = snprintf(rg_name, sizeof(rg_name),
+- RTE_MEMPOOL_MZ_FORMAT, mp->name);
+- if (ret < 0 || ret >= (int)sizeof(rg_name)) {
+- rte_errno = ENAMETOOLONG;
+- return -rte_errno;
+- }
+-
+ /* ring flags */
+ if (mp->flags & MEMPOOL_F_SP_PUT)
+ rg_flags |= RING_F_SP_ENQ;
+ if (mp->flags & MEMPOOL_F_SC_GET)
+ rg_flags |= RING_F_SC_DEQ;
+
++ if (mp->flags & MEMPOOL_F_DYNAMIC) {
++ n = RTE_MIN(mp->size, mp->populated_size + mp->dynamic_size);
++
++ ret = snprintf(rg_name, sizeof(rg_name),
++ RTE_MEMPOOL_MZ_FORMAT"_%x", mp->name, n);
++ } else {
++ n = mp->size;
++ ret = snprintf(rg_name, sizeof(rg_name),
++ RTE_MEMPOOL_MZ_FORMAT, mp->name);
++ }
++
++ if (ret < 0 || ret >= (int)sizeof(rg_name)) {
++ rte_errno = ENAMETOOLONG;
++ return -rte_errno;
++ }
++
+ /*
+ * Allocate the ring that will be used to store objects.
+ * Ring functions will return appropriate errors if we are
+ * running as a secondary process etc., so no checks made
+ * in this function for that condition.
+ */
+- r = rte_ring_create(rg_name, rte_align32pow2(mp->size + 1),
++ r = rte_ring_create(rg_name, rte_align32pow2(n + 1),
+ mp->socket_id, rg_flags);
+ if (r == NULL)
+ return -rte_errno;
+diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
+index 683b216f9..70039f6c3 100644
+--- a/lib/librte_mempool/rte_mempool.c
++++ b/lib/librte_mempool/rte_mempool.c
+@@ -152,6 +152,8 @@ mempool_add_elem(struct rte_mempool *mp, __rte_unused void *opaque,
+ hdr->mp = mp;
+ hdr->iova = iova;
+ STAILQ_INSERT_TAIL(&mp->elt_list, hdr, next);
++ if (mp->flags & MEMPOOL_F_DYNAMIC && mp->dyn_obj_cb)
++ mp->dyn_obj_cb(mp, NULL, obj, mp->populated_size);
+ mp->populated_size++;
+
+ #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+@@ -426,9 +428,10 @@ rte_mempool_populate_default(struct rte_mempool *mp)
+ ssize_t mem_size;
+ size_t align, pg_sz, pg_shift;
+ rte_iova_t iova;
+- unsigned mz_id, n;
++ unsigned mz_id, n, avail;
+ int ret;
+ bool no_contig, try_contig, no_pageshift, external;
++ bool dynamic = (mp->flags & MEMPOOL_F_DYNAMIC) ? true : false;
+
+ ret = mempool_ops_alloc_once(mp);
+ if (ret != 0)
+@@ -441,7 +444,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
+ external = ret;
+
+ /* mempool must not be populated */
+- if (mp->nb_mem_chunks != 0)
++ if (mp->nb_mem_chunks != 0 && !dynamic)
+ return -EEXIST;
+
+ no_contig = mp->flags & MEMPOOL_F_NO_IOVA_CONTIG;
+@@ -512,7 +515,16 @@ rte_mempool_populate_default(struct rte_mempool *mp)
+ pg_shift = rte_bsf32(pg_sz);
+ }
+
+- for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
++ n = mp->size;
++ if (dynamic) {
++ n = RTE_MIN(mp->size - mp->populated_size, mp->dynamic_size);
++ if (mp->nb_mem_chunks != 0 && rte_mempool_ops_alloc(mp) != 0)
++ return -ENOMEM;
++ }
++
++ avail = 0;
++ mz_id = mp->nb_mem_chunks;
++ for (; n > 0; mz_id++, n -= ret, avail += ret) {
+ size_t min_chunk_size;
+ unsigned int flags;
+
+@@ -607,9 +619,16 @@ rte_mempool_populate_default(struct rte_mempool *mp)
+ }
+ }
+
+- return mp->size;
++ return avail;
+
+ fail:
++ if (dynamic) {
++ if (avail)
++ return avail;
++
++ return ret;
++ }
++
+ rte_mempool_free_memchunks(mp);
+ return ret;
+ }
+diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
+index 7c9cd9a2f..0886b19f2 100644
+--- a/lib/librte_mempool/rte_mempool.h
++++ b/lib/librte_mempool/rte_mempool.h
+@@ -207,6 +207,16 @@ struct rte_mempool_info {
+ unsigned int contig_block_size;
+ } __rte_cache_aligned;
+
++struct rte_mempool;
++/**
++ * An object callback function for mempool.
++ *
++ * Used by rte_mempool_create() and rte_mempool_obj_iter().
++ */
++typedef void (rte_mempool_obj_cb_t)(struct rte_mempool *mp,
++ void *opaque, void *obj, unsigned obj_idx);
++typedef rte_mempool_obj_cb_t rte_mempool_obj_ctor_t; /* compat */
++
+ /**
+ * The RTE mempool structure.
+ */
+@@ -247,6 +257,8 @@ struct rte_mempool {
+ struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */
+
+ uint32_t populated_size; /**< Number of populated objects. */
++ uint32_t dynamic_size; /**< Number of dynamic populated objects. */
++ rte_mempool_obj_cb_t *dyn_obj_cb; /**< elem cb for dynamic populated objects. */
+ struct rte_mempool_objhdr_list elt_list; /**< List of objects in pool */
+ uint32_t nb_mem_chunks; /**< Number of memory chunks */
+ struct rte_mempool_memhdr_list mem_list; /**< List of memory chunks */
+@@ -264,6 +276,8 @@ struct rte_mempool {
+ #define MEMPOOL_F_POOL_CREATED 0x0010 /**< Internal: pool is created. */
+ #define MEMPOOL_F_NO_IOVA_CONTIG 0x0020 /**< Don't need IOVA contiguous objs. */
+ #define MEMPOOL_F_NO_PHYS_CONTIG MEMPOOL_F_NO_IOVA_CONTIG /* deprecated */
++#define MEMPOOL_F_DYNAMIC 0x0040 /**< Don't populate element once for all */
++#define MEMPOOL_F_DYNAMIC_NOW 0x0080 /**< It is being dynamically populated now */
+
+ /**
+ * @internal When debug is enabled, store some statistics.
+@@ -839,15 +853,6 @@ int rte_mempool_register_ops(const struct rte_mempool_ops *ops);
+ rte_mempool_register_ops(&ops); \
+ }
+
+-/**
+- * An object callback function for mempool.
+- *
+- * Used by rte_mempool_create() and rte_mempool_obj_iter().
+- */
+-typedef void (rte_mempool_obj_cb_t)(struct rte_mempool *mp,
+- void *opaque, void *obj, unsigned obj_idx);
+-typedef rte_mempool_obj_cb_t rte_mempool_obj_ctor_t; /* compat */
+-
+ /**
+ * A memory callback function for mempool.
+ *
+@@ -989,6 +994,22 @@ struct rte_mempool *
+ rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
+ unsigned cache_size, unsigned private_data_size,
+ int socket_id, unsigned flags);
++
++static inline void
++rte_mempool_set_dynamic_size(struct rte_mempool *mp, int dynamic_size)
++{
++ mp->flags |= MEMPOOL_F_DYNAMIC;
++ mp->dynamic_size = dynamic_size;
++}
++
++static inline void
++rte_mempool_set_dynamic_cb(struct rte_mempool *mp,
++ rte_mempool_obj_cb_t *dyn_obj_cb)
++{
++ mp->flags |= MEMPOOL_F_DYNAMIC;
++ mp->dyn_obj_cb = dyn_obj_cb;
++}
++
+ /**
+ * Free a mempool
+ *
+@@ -1390,9 +1411,28 @@ __mempool_generic_get(struct rte_mempool *mp, void **obj_table,
+ /* get remaining objects from ring */
+ ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, n);
+
+- if (ret < 0)
++ if (ret < 0) {
++ if (mp->flags & MEMPOOL_F_DYNAMIC &&
++ mp->populated_size < mp->size) {
++ int work;
++
++ work = rte_atomic32_cmpset(&mp->flags,
++ mp->flags & ~MEMPOOL_F_DYNAMIC_NOW,
++ mp->flags | MEMPOOL_F_DYNAMIC_NOW);
++ if (work) {
++ int more;
++
++ more = rte_mempool_populate_default(mp);
++ mp->flags &= ~MEMPOOL_F_DYNAMIC_NOW;
++ if (more > 0)
++ goto ring_dequeue;
++ } else {
++ /* mempool is populating, try again */
++ goto ring_dequeue;
++ }
++ }
+ __MEMPOOL_STAT_ADD(mp, get_fail, n);
+- else
++ } else
+ __MEMPOOL_STAT_ADD(mp, get_success, n);
+
+ return ret;
+--
+2.17.1
+
diff --git a/dpdk/dpdk-v18.11_patches/0007-mbuf-add-dynamic-mbuf-mempool-support.patch b/dpdk/dpdk-v18.11_patches/0007-mbuf-add-dynamic-mbuf-mempool-support.patch
new file mode 100644
index 0000000..8618928
--- /dev/null
+++ b/dpdk/dpdk-v18.11_patches/0007-mbuf-add-dynamic-mbuf-mempool-support.patch
@@ -0,0 +1,305 @@
+From c2a2b8eec349156b31f2faab61cc6063ef3f0c61 Mon Sep 17 00:00:00 2001
+From: Jianfeng Tan <henry.tjf@antfin.com>
+Date: Wed, 26 Dec 2018 14:40:07 +0000
+Subject: [PATCH 2/2] mbuf: add dynamic mbuf mempool support
+
+Signed-off-by: Jianfeng Tan <henry.tjf@antfin.com>
+---
+ examples/Makefile | 1 +
+ examples/dynamic_mbuf_pool/Makefile | 56 ++++++++++++++++
+ examples/dynamic_mbuf_pool/main.c | 92 ++++++++++++++++++++++++++
+ examples/dynamic_mbuf_pool/meson.build | 11 +++
+ lib/librte_mbuf/rte_mbuf.c | 51 ++++++++++++++
+ lib/librte_mbuf/rte_mbuf.h | 5 ++
+ lib/librte_mbuf/rte_mbuf_version.map | 8 ++-
+ 7 files changed, 223 insertions(+), 1 deletion(-)
+ create mode 100644 examples/dynamic_mbuf_pool/Makefile
+ create mode 100644 examples/dynamic_mbuf_pool/main.c
+ create mode 100644 examples/dynamic_mbuf_pool/meson.build
+
+diff --git a/examples/Makefile b/examples/Makefile
+index 33fe0e586..3df9cb7ad 100644
+--- a/examples/Makefile
++++ b/examples/Makefile
+@@ -21,6 +21,7 @@ DIRS-$(CONFIG_RTE_LIBRTE_CRYPTODEV) += fips_validation
+ DIRS-$(CONFIG_RTE_LIBRTE_FLOW_CLASSIFY) += flow_classify
+ DIRS-y += flow_filtering
+ DIRS-y += helloworld
++DIRS-y += dynamic_mbuf_pool
+ DIRS-$(CONFIG_RTE_LIBRTE_PIPELINE) += ip_pipeline
+ ifeq ($(CONFIG_RTE_LIBRTE_LPM),y)
+ DIRS-$(CONFIG_RTE_IP_FRAG) += ip_reassembly
+diff --git a/examples/dynamic_mbuf_pool/Makefile b/examples/dynamic_mbuf_pool/Makefile
+new file mode 100644
+index 000000000..f2761f661
+--- /dev/null
++++ b/examples/dynamic_mbuf_pool/Makefile
+@@ -0,0 +1,56 @@
++# SPDX-License-Identifier: BSD-3-Clause
++# Copyright(c) 2010-2014 Intel Corporation
++
++# binary name
++APP = dynamic_mbuf_pool
++
++# all source are stored in SRCS-y
++SRCS-y := main.c
++
++# Build using pkg-config variables if possible
++$(shell pkg-config --exists libdpdk)
++ifeq ($(.SHELLSTATUS),0)
++
++all: shared
++.PHONY: shared static
++shared: build/$(APP)-shared
++ ln -sf $(APP)-shared build/$(APP)
++static: build/$(APP)-static
++ ln -sf $(APP)-static build/$(APP)
++
++PC_FILE := $(shell pkg-config --path libdpdk)
++CFLAGS += -O3 $(shell pkg-config --cflags libdpdk)
++LDFLAGS_SHARED = $(shell pkg-config --libs libdpdk)
++LDFLAGS_STATIC = -Wl,-Bstatic $(shell pkg-config --static --libs libdpdk)
++
++build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
++ $(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)
++
++build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build
++ $(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)
++
++build:
++ @mkdir -p $@
++
++.PHONY: clean
++clean:
++ rm -f build/$(APP) build/$(APP)-static build/$(APP)-shared
++ rmdir --ignore-fail-on-non-empty build
++
++else
++
++ifeq ($(RTE_SDK),)
++$(error "Please define RTE_SDK environment variable")
++endif
++
++# Default target, can be overridden by command line or environment
++RTE_TARGET ?= x86_64-native-linuxapp-gcc
++
++include $(RTE_SDK)/mk/rte.vars.mk
++
++CFLAGS += -O3
++CFLAGS += $(WERROR_FLAGS)
++
++include $(RTE_SDK)/mk/rte.extapp.mk
++
++endif
+diff --git a/examples/dynamic_mbuf_pool/main.c b/examples/dynamic_mbuf_pool/main.c
+new file mode 100644
+index 000000000..a568d7cec
+--- /dev/null
++++ b/examples/dynamic_mbuf_pool/main.c
+@@ -0,0 +1,92 @@
++/* SPDX-License-Identifier: BSD-3-Clause
++ * Copyright(c) 2010-2014 Intel Corporation
++ */
++
++#include <stdio.h>
++#include <string.h>
++#include <stdint.h>
++#include <errno.h>
++#include <sys/queue.h>
++#include <sys/types.h>
++#include <sys/stat.h>
++#include <fcntl.h>
++#include <unistd.h>
++
++#include <rte_memory.h>
++#include <rte_launch.h>
++#include <rte_eal.h>
++#include <rte_per_lcore.h>
++#include <rte_lcore.h>
++#include <rte_debug.h>
++#include <rte_memory.h>
++#include <rte_mbuf.h>
++#include <rte_memzone.h>
++
++#define HUGE_2M "/sys/kernel/mm/hugepages/hugepages-2048kB/free_hugepages"
++#define HUGE_1G "/sys/kernel/mm/hugepages/hugepages-1048576kB/free_hugepages"
++
++static long int
++get_value(const char *path)
++{
++ int fd, len;
++ long int value;
++ char buf[1024];
++
++ fd = open(path, O_RDONLY);
++ if (fd < 0)
++ return ULONG_MAX;
++
++ len = read(fd, buf, sizeof(buf));
++
++ close(fd);
++
++ if (len <= 0) {
++ return ULONG_MAX;
++ }
++
++ value = strtol(buf, NULL, 10);
++ return value;
++}
++
++static void
++print_free_hugepages(void)
++{
++ printf("2M: %ld\t\t1G: %ld\n", get_value(HUGE_2M), get_value(HUGE_1G));
++}
++
++int
++main(int argc, char **argv)
++{
++ int i;
++ int ret;
++ int n = 512 * 1024;
++ int dynamic_size = 8 * 1024;
++ struct rte_mbuf *m;
++ struct rte_mempool *mp;
++
++ ret = rte_eal_init(argc, argv);
++ if (ret < 0)
++ rte_panic("Cannot init EAL\n");
++
++ mp = rte_pktmbuf_dynamic_pool_create("mbuf_pool", n,
++ 64, 0, RTE_MBUF_DEFAULT_BUF_SIZE,
++ 0, dynamic_size);
++ if (mp == NULL)
++ rte_panic("Failed to create mbuf mempool");
++
++ for (i = 0; i < n; i++) {
++ m = rte_pktmbuf_alloc(mp);
++ if (m == NULL)
++ break;
++
++ if ((i % dynamic_size) == 1) {
++ print_free_hugepages();
++ usleep(100 * 1000);
++ }
++ }
++
++ printf("have allocated %d mbufs", i);
++ rte_memzone_dump(stdout);
++
++ return 0;
++}
+diff --git a/examples/dynamic_mbuf_pool/meson.build b/examples/dynamic_mbuf_pool/meson.build
+new file mode 100644
+index 000000000..c34e11e36
+--- /dev/null
++++ b/examples/dynamic_mbuf_pool/meson.build
+@@ -0,0 +1,11 @@
++# SPDX-License-Identifier: BSD-3-Clause
++# Copyright(c) 2017 Intel Corporation
++
++# meson file, for building this example as part of a main DPDK build.
++#
++# To build this example as a standalone application with an already-installed
++# DPDK instance, use 'make'
++
++sources = files(
++ 'main.c'
++)
+diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c
+index 9790b4fb1..b70abd88c 100644
+--- a/lib/librte_mbuf/rte_mbuf.c
++++ b/lib/librte_mbuf/rte_mbuf.c
+@@ -167,6 +167,57 @@ rte_pktmbuf_pool_create(const char *name, unsigned int n,
+ data_room_size, socket_id, NULL);
+ }
+
++struct rte_mempool *
++rte_pktmbuf_dynamic_pool_create(const char *name, unsigned int n,
++ unsigned int cache_size, uint16_t priv_size,
++ uint16_t data_room_size, int socket_id, int dynamic_size)
++{
++ struct rte_mempool *mp;
++ struct rte_pktmbuf_pool_private mbp_priv;
++ const char *mp_ops_name;
++ unsigned elt_size;
++ int ret;
++
++ if (RTE_ALIGN(priv_size, RTE_MBUF_PRIV_ALIGN) != priv_size) {
++ RTE_LOG(ERR, MBUF, "mbuf priv_size=%u is not aligned\n",
++ priv_size);
++ rte_errno = EINVAL;
++ return NULL;
++ }
++ elt_size = sizeof(struct rte_mbuf) + (unsigned)priv_size +
++ (unsigned)data_room_size;
++ mbp_priv.mbuf_data_room_size = data_room_size;
++ mbp_priv.mbuf_priv_size = priv_size;
++
++ mp = rte_mempool_create_empty(name, n, elt_size, cache_size,
++ sizeof(struct rte_pktmbuf_pool_private),
++ socket_id, MEMPOOL_F_DYNAMIC);
++ if (mp == NULL)
++ return NULL;
++
++ mp_ops_name = rte_mbuf_best_mempool_ops();
++ ret = rte_mempool_set_ops_byname(mp, mp_ops_name, NULL);
++ if (ret != 0) {
++ RTE_LOG(ERR, MBUF, "error setting mempool handler\n");
++ rte_mempool_free(mp);
++ rte_errno = -ret;
++ return NULL;
++ }
++ rte_pktmbuf_pool_init(mp, &mbp_priv);
++
++ rte_mempool_set_dynamic_size(mp, dynamic_size);
++ rte_mempool_set_dynamic_cb(mp, rte_pktmbuf_init);
++
++ ret = rte_mempool_populate_default(mp);
++ if (ret < 0) {
++ rte_mempool_free(mp);
++ rte_errno = -ret;
++ return NULL;
++ }
++
++ return mp;
++}
++
+ /* do some sanity checks on a mbuf: panic if it fails */
+ void
+ rte_mbuf_sanity_check(const struct rte_mbuf *m, int is_header)
+diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
+index 3dbc6695e..5a2d81605 100644
+--- a/lib/librte_mbuf/rte_mbuf.h
++++ b/lib/librte_mbuf/rte_mbuf.h
+@@ -1183,6 +1183,11 @@ rte_pktmbuf_pool_create(const char *name, unsigned n,
+ unsigned cache_size, uint16_t priv_size, uint16_t data_room_size,
+ int socket_id);
+
++struct rte_mempool *
++rte_pktmbuf_dynamic_pool_create(const char *name, unsigned int n,
++ unsigned int cache_size, uint16_t priv_size,
++ uint16_t data_room_size, int socket_id, int dynamic_size);
++
+ /**
+ * Create a mbuf pool with a given mempool ops name
+ *
+diff --git a/lib/librte_mbuf/rte_mbuf_version.map b/lib/librte_mbuf/rte_mbuf_version.map
+index cae68db8d..d6d25af95 100644
+--- a/lib/librte_mbuf/rte_mbuf_version.map
++++ b/lib/librte_mbuf/rte_mbuf_version.map
+@@ -44,4 +44,10 @@ DPDK_18.08 {
+ rte_mbuf_set_user_mempool_ops;
+ rte_mbuf_user_mempool_ops;
+ rte_pktmbuf_pool_create_by_ops;
+-} DPDK_16.11;
++} DPDK_18.11;
++
++DPDK_18.11 {
++ global:
++
++ rte_pktmbuf_dynamic_pool_create;
++} DPDK_18.12;
+--
+2.17.1
+
diff --git a/dpdk/dpdk-v18.11_patches/0008-mempool-prioritize-constructor.patch b/dpdk/dpdk-v18.11_patches/0008-mempool-prioritize-constructor.patch
new file mode 100644
index 0000000..c941443
--- /dev/null
+++ b/dpdk/dpdk-v18.11_patches/0008-mempool-prioritize-constructor.patch
@@ -0,0 +1,30 @@
+From cd36895a4a7bfc342915b42e3856bd233452f0bd Mon Sep 17 00:00:00 2001
+From: Jianfeng Tan <henry.tjf@antfin.com>
+Date: Fri, 13 Jul 2018 15:25:22 +0800
+Subject: [PATCH 1/9] mempool: prioritize constructor
+
+---
+ lib/librte_mempool/rte_mempool.h | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
+index 7c9cd9a2f..bdc32d583 100644
+--- a/lib/librte_mempool/rte_mempool.h
++++ b/lib/librte_mempool/rte_mempool.h
+@@ -833,10 +833,10 @@ int rte_mempool_register_ops(const struct rte_mempool_ops *ops);
+ * more than RTE_MEMPOOL_MAX_OPS_IDX is registered.
+ */
+ #define MEMPOOL_REGISTER_OPS(ops) \
+- void mp_hdlr_init_##ops(void); \
+- void __attribute__((constructor, used)) mp_hdlr_init_##ops(void)\
++ static void __attribute__((constructor(101), used)) \
++ mp_hdlr_init_##ops(void) \
+ { \
+- rte_mempool_register_ops(&ops); \
++ rte_mempool_register_ops(&ops); \
+ }
+
+ /**
+--
+2.17.1
+
diff --git a/dpdk/dpdk-v18.11_patches/0009-net-virtio-fill-desc-limit.patch b/dpdk/dpdk-v18.11_patches/0009-net-virtio-fill-desc-limit.patch
new file mode 100644
index 0000000..146ea88
--- /dev/null
+++ b/dpdk/dpdk-v18.11_patches/0009-net-virtio-fill-desc-limit.patch
@@ -0,0 +1,42 @@
+commit 470acd1b108f20ae12b1216c9f6157b78655bcc7
+Author: Jianfeng Tan <henry.tjf@antfin.com>
+Date: Wed Dec 12 02:14:03 2018 +0000
+
+ net/virtio: fill desc limit
+
+ We shall fill desc limit accordingly, or APIs, such as
+ rte_eth_dev_adjust_nb_rx_tx_desc, will not give correct desc
+ information.
+
+ Signed-off-by: Jianfeng Tan <henry.tjf@antfin.com>
+
+diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c
+index dbfa6865c..d369d5ce8 100644
+--- a/drivers/net/virtio/virtio_ethdev.c
++++ b/drivers/net/virtio/virtio_ethdev.c
+@@ -2172,6 +2172,7 @@ virtio_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
+ {
+ uint64_t tso_mask, host_features;
+ struct virtio_hw *hw = dev->data->dev_private;
++ struct virtqueue *vq;
+
+ dev_info->speed_capa = ETH_LINK_SPEED_10G; /* fake value */
+
+@@ -2209,6 +2210,17 @@ virtio_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
+ (1ULL << VIRTIO_NET_F_HOST_TSO6);
+ if ((host_features & tso_mask) == tso_mask)
+ dev_info->tx_offload_capa |= DEV_TX_OFFLOAD_TCP_TSO;
++
++
++ if (hw->vqs) {
++ vq = hw->vqs[VTNET_SQ_RQ_QUEUE_IDX];
++ dev_info->rx_desc_lim.nb_max = vq->vq_nentries;
++ dev_info->rx_desc_lim.nb_min = 256;
++
++ vq = hw->vqs[VTNET_SQ_TQ_QUEUE_IDX];
++ dev_info->tx_desc_lim.nb_max = vq->vq_nentries;
++ dev_info->tx_desc_lim.nb_min = 256;
++ }
+ }
+
+ /*
diff --git a/examples/Makefile b/examples/Makefile
index cf13574..9ef8d85 100644
--- a/examples/Makefile
+++ b/examples/Makefile
@@ -21,6 +21,6 @@ endif
include $(RTE_SDK)/mk/rte.vars.mk
-DIRS-y += l4fwd
+#DIRS-y += l4fwd
include $(TLDK_ROOT)/mk/tle.subdir.mk
diff --git a/examples/l4fwd/main.c b/examples/l4fwd/main.c
index 9396403..2e16479 100644
--- a/examples/l4fwd/main.c
+++ b/examples/l4fwd/main.c
@@ -68,7 +68,6 @@ static char proto_name[3][10] = {"udp", "tcp", ""};
static const struct rte_eth_conf port_conf_default = {
.rxmode = {
- .offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
},
};
diff --git a/examples/l4fwd/port.h b/examples/l4fwd/port.h
index a154844..67ca19a 100644
--- a/examples/l4fwd/port.h
+++ b/examples/l4fwd/port.h
@@ -177,21 +177,10 @@ port_init(struct netbe_port *uprt, uint32_t proto)
}
port_conf = port_conf_default;
- if ((uprt->rx_offload & RX_CSUM_OFFLOAD) != 0) {
- RTE_LOG(ERR, USER1, "%s(%u): enabling RX csum offload;\n",
- __func__, uprt->id);
- port_conf.rxmode.offloads |= uprt->rx_offload & RX_CSUM_OFFLOAD;
- }
- port_conf.rxmode.max_rx_pkt_len = uprt->mtu + ETHER_CRC_LEN;
- if (port_conf.rxmode.max_rx_pkt_len > ETHER_MAX_LEN)
- port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_JUMBO_FRAME;
-
rc = update_rss_conf(uprt, &dev_info, &port_conf, proto);
if (rc != 0)
return rc;
- port_conf.txmode.offloads = uprt->tx_offload;
-
rc = rte_eth_dev_configure(uprt->id, uprt->nb_lcore, uprt->nb_lcore,
&port_conf);
RTE_LOG(NOTICE, USER1,
diff --git a/lib/Makefile b/lib/Makefile
index 6317af9..9bbe159 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -25,5 +25,6 @@ DIRS-y += libtle_misc
DIRS-y += libtle_dring
DIRS-y += libtle_timer
DIRS-y += libtle_l4p
+DIRS-y += libtle_glue
include $(TLDK_ROOT)/mk/tle.subdir.mk
diff --git a/lib/libtle_glue/Makefile b/lib/libtle_glue/Makefile
new file mode 100644
index 0000000..13ceb82
--- /dev/null
+++ b/lib/libtle_glue/Makefile
@@ -0,0 +1,62 @@
+# Copyright (c) 2018 Ant Financial Services Group.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+ifeq ($(RTE_SDK),)
+$(error "Please define RTE_SDK environment variable")
+endif
+
+# Default target, can be overwritten by command line or environment
+RTE_TARGET ?= x86_64-native-linuxapp-gcc
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# library name
+LIB = libtle_glue.a
+
+CFLAGS += -O3
+CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR)
+
+EXPORT_MAP := tle_glue_version.map
+
+LIBABIVER := 1
+
+# source files
+SRCS-y += fd.c
+SRCS-y += ctx.c
+SRCS-y += arp.c
+SRCS-y += icmp.c
+SRCS-y += rxcb.c
+SRCS-y += port.c
+SRCS-y += sym.c
+SRCS-y += init.c
+SRCS-y += be.c
+SRCS-y += epoll.c
+SRCS-y += socket.c
+SRCS-y += rxtx.c
+SRCS-y += poll.c
+SRCS-y += util.c
+SRCS-y += tcp.c
+SRCS-y += udp.c
+SRCS-y += select.c
+
+ifeq ($(PACKETDRILL),y)
+SRCS-y += packetdrill.c
+endif
+
+# install this header file
+SYMLINK-y-include += tle_glue.h
+
+# this lib dependencies
+DEPDIRS-y += lib/libtle_l4p
+
+include $(TLDK_ROOT)/mk/tle.lib.mk
diff --git a/lib/libtle_glue/arp.c b/lib/libtle_glue/arp.c
new file mode 100644
index 0000000..9b13d9e
--- /dev/null
+++ b/lib/libtle_glue/arp.c
@@ -0,0 +1,935 @@
+/*
+ * Copyright (c) 2019 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/icmp6.h>
+
+#include <rte_ethdev.h>
+#include <rte_arp.h>
+#include <rte_ip.h>
+#include <rte_hash.h>
+#include <rte_byteorder.h>
+
+#include "log.h"
+#include "ctx.h"
+#include "internal.h"
+#include "tle_timer.h"
+#include "util.h"
+#include "ndp.h"
+#include "gateway.h"
+
+/* number of leading bytes of ipv6_multi_mask copied by set_multicast_ipv6() */
+#define IPV6_MULTI_MASK_LEN 13
+
+/* ff02::1 -- the all-nodes link-local multicast address */
+const struct in6_addr ipv6_all_multi = {{{
+	0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01
+}}};
+
+/* ff02:: -- link-local multicast prefix used to build NS destinations */
+const struct in6_addr ipv6_multi_mask = {{{
+	0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+}}};
+
+/*
+ * Build the IPv6 multicast MAC (33:33:xx:xx:xx:xx) for @ip6_addr:
+ * the fixed 33:33 prefix followed by the last 4 bytes of the address
+ * (RFC 2464, section 7).
+ *
+ * Fix: use the portable s6_addr accessor instead of glibc's internal
+ * __in6_u.__u6_addr16 union member, which does not exist on other
+ * libcs (e.g. musl); the produced bytes are identical.
+ */
+static inline void
+set_multicast_mac_v6(struct ether_addr *addr, const struct in6_addr *ip6_addr)
+{
+	uint8_t *ea = (uint8_t *)addr;
+
+	ea[0] = 0x33;
+	ea[1] = 0x33;
+	rte_memcpy(ea + 2, &ip6_addr->s6_addr[12], 4);
+}
+
+/* Overwrite the first IPV6_MULTI_MASK_LEN (13) bytes of @ipv6 with the
+ * ff02:: multicast prefix, keeping only the last 3 bytes of the original
+ * address. NOTE(review): the result is ff02::xx:xxxx, not the RFC 4861
+ * solicited-node form ff02::1:ffxx:xxxx -- confirm this is intended. */
+static inline void
+set_multicast_ipv6(uint8_t ipv6[16])
+{
+	rte_memcpy(ipv6, &ipv6_multi_mask, IPV6_MULTI_MASK_LEN);
+}
+
+/* Fill @addr with the Ethernet broadcast address ff:ff:ff:ff:ff:ff. */
+static inline void
+set_broadcast_addr(struct ether_addr *addr)
+{
+	uint8_t *bytes = (uint8_t *)addr;
+	unsigned int i;
+
+	for (i = 0; i < 6; i++)
+		bytes[i] = 0xFF;
+}
+
+/*
+ * Return true if the (post-routing) IPv4 next hop of @pkt equals @addr.
+ * Used to pick out packets parked on the arp_wait list that were waiting
+ * for @addr to be resolved.
+ */
+static inline bool
+match_addr(struct glue_ctx *ctx, struct rte_mbuf *pkt, const struct in_addr *addr)
+{
+	struct ipv4_hdr *ip4h;
+	const struct in_addr *gw;
+
+	ip4h = rte_pktmbuf_mtod_offset(pkt, struct ipv4_hdr *, pkt->l2_len);
+	/* arp_wait holds both v4 and v6 packets; skip non-IPv4 ones */
+	if ((ip4h->version_ihl >> 4) != 4)
+		return false;
+
+	/* off-link destinations resolve to their gateway address */
+	gw = ipv4_gateway_lookup(ctx, (struct in_addr *)&ip4h->dst_addr);
+	if (gw->s_addr != addr->s_addr)
+		return false;
+
+	return true;
+}
+
+/*
+ * Return true if the (post-routing) IPv6 next hop of @pkt equals @addr.
+ * Counterpart of match_addr() for the shared arp_wait list.
+ *
+ * Fix: the original version test ((vtc_flow & 0xffffff00) >> 4) != 6 can
+ * never compare equal to 6 (the masked low byte is zero), so every queued
+ * IPv6 packet was treated as non-matching and never flushed after NDP
+ * resolution. The IP version lives in the top 4 bits of the big-endian
+ * vtc_flow word.
+ */
+static inline bool
+match_addr6(struct glue_ctx *ctx, struct rte_mbuf *pkt,
+	    const struct in6_addr *addr)
+{
+	struct ipv6_hdr *ip6h;
+	const struct in6_addr *gw;
+
+	ip6h = rte_pktmbuf_mtod_offset(pkt, struct ipv6_hdr *, pkt->l2_len);
+	if ((rte_be_to_cpu_32(ip6h->vtc_flow) >> 28) != 6)
+		return false;
+
+	gw = ipv6_gateway_lookup(ctx, (struct in6_addr *)&ip6h->dst_addr);
+	if (memcmp(gw, addr, sizeof(struct in6_addr)) != 0)
+		return false;
+
+	return true;
+}
+
+/*
+ * Transmit @nb packets on the context's TX queue in a single burst and
+ * free whatever the driver did not accept (no retry here). @prefix is
+ * only used for trace output.
+ */
+static inline void
+send_pkts(struct glue_ctx *ctx, struct rte_mbuf **pkts, uint16_t nb,
+	  const char *prefix)
+{
+	uint16_t i, sent;
+
+	sent = rte_eth_tx_burst(ctx->port_id, ctx->queue_id, pkts, nb);
+	for (i = sent; i < nb; i++)
+		rte_pktmbuf_free(pkts[i]);
+
+	RTE_SET_USED(prefix);
+	TRACE("%s, send %u/%u pkts", prefix, sent, nb);
+}
+
+/*
+ * Flush packets queued on ctx->arp_wait whose next hop @addr has just
+ * been resolved to @e_addr: unlink them from the singly-linked list,
+ * fill in the destination MAC and transmit them in bursts.
+ * @af selects IPv4/IPv6 matching; the list holds both families.
+ */
+static void
+flush_arp_wait(int af, struct glue_ctx *ctx, const void *addr,
+	       struct ether_addr *e_addr)
+{
+	struct rte_mbuf *pkt, *pre, *pkts[MAX_PKTS_BURST];
+	struct ether_hdr *eth;
+	uint32_t nb_pkts;
+
+	pre = NULL;
+	nb_pkts = 0;
+	for (pkt = ctx->arp_wait; pkt; pkt = pkt->next_pkt) {
+		if ((af == AF_INET &&
+		     !match_addr(ctx, pkt, (const struct in_addr *)addr)) ||
+		    (af == AF_INET6 &&
+		     !match_addr6(ctx, pkt, (const struct in6_addr *)addr))) {
+			pre = pkt;
+			continue;
+		}
+
+		/* unlink the matched packet from the wait list */
+		if (pre == NULL)
+			ctx->arp_wait = pkt->next_pkt;
+		else
+			pre->next_pkt = pkt->next_pkt;
+		eth = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+		ether_addr_copy(e_addr, &eth->d_addr);
+		pkts[nb_pkts++] = pkt;
+		if (nb_pkts == MAX_PKTS_BURST) {
+			send_pkts(ctx, pkts, nb_pkts, "ARP learned");
+			nb_pkts = 0;
+		}
+	}
+	if (nb_pkts)
+		send_pkts(ctx, pkts, nb_pkts, "ARP learned");
+}
+
+/*
+ * Pre-build the Ethernet + IPv4 header template of @dst for @addr.
+ * A NULL @e_addr installs the broadcast MAC, which the rest of the code
+ * treats as the "not resolved yet" marker (see mac_fill()).
+ */
+static inline void
+ipv4_dst_set(struct glue_ctx *ctx, struct tle_dest *dst,
+	     const struct in_addr *addr, struct ether_addr *e_addr)
+{
+	struct ether_hdr *eth;
+	struct ipv4_hdr *ip4h;
+
+	if (is_ipv4_loopback_addr(addr->s_addr, ctx))
+		dst->mtu = MTU_LOOPBACK;
+	else
+		dst->mtu = MTU_NORMAL;
+	dst->l2_len = sizeof(*eth);
+	dst->head_mp = get_mempool_by_socket(0); /* fix me */
+
+	eth = (struct ether_hdr *)dst->hdr;
+	ether_addr_copy(&ctx->mac, &eth->s_addr);
+	if (e_addr == NULL)
+		set_broadcast_addr(&eth->d_addr);
+	else
+		ether_addr_copy(e_addr, &eth->d_addr);
+	eth->ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4);
+
+	dst->l3_len = sizeof(*ip4h);
+	ip4h = (struct ipv4_hdr *)(eth + 1);
+	ip4h->dst_addr = addr->s_addr;
+	ip4h->version_ihl = 4 << 4 | sizeof(*ip4h) / IPV4_IHL_MULTIPLIER;
+	ip4h->time_to_live = 64;
+	/* default L4 proto; overwritten in arp_ipv4_dst_lookup() */
+	ip4h->next_proto_id = IPPROTO_TCP;
+}
+
+/*
+ * Pre-build the Ethernet + IPv6 header template of @dst for @addr.
+ * A NULL @e_addr installs the broadcast MAC, which the rest of the code
+ * treats as the "not resolved yet" marker (see mac_fill()).
+ *
+ * Fix: write the version nibble through rte_cpu_to_be_32() instead of
+ * the host-endian "6 << 4" hack, which only landed 6 in the top nibble
+ * of the big-endian vtc_flow word on little-endian CPUs.
+ */
+static inline void
+ipv6_dst_set(struct glue_ctx *ctx, struct tle_dest *dst,
+	     const struct in6_addr *addr, struct ether_addr *e_addr)
+{
+	struct ether_hdr *eth;
+	struct ipv6_hdr *ip6h;
+
+	if (is_ipv6_loopback_addr(addr, ctx))
+		dst->mtu = MTU_LOOPBACK;
+	else
+		dst->mtu = MTU_NORMAL;
+	dst->l2_len = sizeof(*eth);
+	dst->head_mp = get_mempool_by_socket(0); /* fix me */
+
+	eth = (struct ether_hdr *)dst->hdr;
+	ether_addr_copy(&ctx->mac, &eth->s_addr);
+	if (e_addr == NULL)
+		set_broadcast_addr(&eth->d_addr);
+	else
+		ether_addr_copy(e_addr, &eth->d_addr);
+	eth->ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv6);
+
+	dst->l3_len = sizeof(*ip6h);
+	ip6h = (struct ipv6_hdr *)(eth + 1);
+	rte_memcpy(ip6h->dst_addr, addr, sizeof(struct in6_addr));
+	ip6h->vtc_flow = rte_cpu_to_be_32(6u << 28);
+	ip6h->hop_limits = 255;
+	/* default L4 proto; overwritten in arp_ipv6_dst_lookup() */
+	ip6h->proto = IPPROTO_TCP;
+}
+
+/* (Re)arm an ARP/NDP entry timer on the context timer wheel (ms) */
+#define arp_timer(ctx, entry, interval) \
+	tle_timer_start(ctx->arp_tmw, entry, interval)
+
+/*
+ * Insert or refresh the ARP entry for IPv4 @addr.
+ *
+ * Existing entry: MAC and expiry timer are refreshed; if the entry was
+ * still unresolved (broadcast MAC), packets parked on arp_wait for this
+ * next hop are flushed now. NOTE(review): this path dereferences @e_addr,
+ * so it assumes e_addr != NULL whenever the entry already exists -- TODO
+ * confirm all callers guarantee that.
+ *
+ * New entry: appended at index arp4_num. With a known MAC it starts as a
+ * resolved entry (ARP_ENTRY_EXPIRE); with e_addr == NULL it becomes an
+ * in-progress request slot (ARP_REQUEST_EXPIRE, req_time = 1).
+ *
+ * NOTE(review): arp4_num is not checked against MAX_ARP_ENTRY before
+ * indexing arp4[] -- confirm the table cannot overflow.
+ */
+void
+ipv4_dst_add(struct glue_ctx *ctx, const struct in_addr *addr,
+	     struct ether_addr *e_addr)
+{
+	struct arp_entry *entry;
+	struct tle_dest *dst;
+	struct ether_hdr *eth;
+	uint64_t idx;
+	bool check_wait;
+	int rc;
+
+	rc = rte_hash_lookup_data(ctx->arp_hash, addr, (void**)&idx);
+	if (rc >= 0) {
+		entry = &ctx->arp4[idx];
+		dst = &entry->dst;
+		eth = (struct ether_hdr *)dst->hdr;
+		/* broadcast MAC means "unresolved": flush waiters below */
+		check_wait = is_broadcast_ether_addr(&eth->d_addr);
+
+		/* update arp entry, reset timer */
+		ether_addr_copy(e_addr, &eth->d_addr);
+		print_arp(AF_INET, addr, &eth->d_addr, "UPDATE");
+		if(entry->timer != NULL)
+			tle_timer_stop(ctx->arp_tmw, entry->timer);
+		entry->timer = arp_timer(ctx, entry, ARP_ENTRY_EXPIRE);
+		entry->inuse = 0;
+		entry->req_time = 0;
+
+		if(check_wait)
+			flush_arp_wait(AF_INET, ctx, addr, e_addr);
+
+		return;
+	}
+
+	idx = ctx->arp4_num;
+	entry = &ctx->arp4[idx];
+	dst = &entry->dst;
+
+	ipv4_dst_set(ctx, dst, addr, e_addr);
+	if (e_addr == NULL) {
+		entry->timer = arp_timer(ctx, entry, ARP_REQUEST_EXPIRE);
+		entry->req_time = 1;
+	} else {
+		entry->timer = arp_timer(ctx, entry, ARP_ENTRY_EXPIRE);
+		entry->inuse = 0;
+	}
+
+	rc = rte_hash_add_key_data(ctx->arp_hash, addr, (void *)idx);
+	if (rc < 0)
+		rte_panic("Failed to add ARP entry");
+
+	ctx->arp4_num++;
+	eth = (struct ether_hdr *)dst->hdr;
+	print_arp(AF_INET, addr, &eth->d_addr, "ADD");
+}
+
+/*
+ * Insert or refresh the neighbor entry for IPv6 @addr.
+ * Mirrors ipv4_dst_add(): refresh + optional arp_wait flush for existing
+ * entries, append at arp6_num for new ones (e_addr == NULL creates an
+ * in-progress request slot).
+ *
+ * NOTE(review): as with the IPv4 variant, the update path assumes
+ * e_addr != NULL, and arp6_num is not bounded by MAX_ARP_ENTRY here --
+ * confirm both invariants hold at the call sites.
+ */
+void
+ipv6_dst_add(struct glue_ctx *ctx, const struct in6_addr *addr,
+	     struct ether_addr *e_addr)
+{
+	struct arp_entry* entry;
+	struct tle_dest *dst;
+	struct ether_hdr *eth;
+	uint64_t idx;
+	bool check_wait;
+	int rc;
+
+	rc = rte_hash_lookup_data(ctx->arp6_hash, addr, (void**)&idx);
+	if (rc >= 0) {
+		entry = &ctx->arp6[idx];
+		dst = &entry->dst;
+		eth = (struct ether_hdr *)dst->hdr;
+		/* broadcast MAC means "unresolved": flush waiters below */
+		check_wait = is_broadcast_ether_addr(&eth->d_addr);
+
+		/* update arp entry, reset timer */
+		ether_addr_copy(e_addr, &eth->d_addr);
+		print_arp(AF_INET6, addr, &eth->d_addr, "UPDATE");
+		if(entry->timer != NULL)
+			tle_timer_stop(ctx->arp_tmw, entry->timer);
+		entry->timer = arp_timer(ctx, entry, ARP_ENTRY_EXPIRE);
+		entry->inuse = 0;
+		entry->req_time = 0;
+
+		if(check_wait)
+			flush_arp_wait(AF_INET6, ctx, addr, e_addr);
+
+		return;
+	}
+
+	idx = ctx->arp6_num;
+	entry = &ctx->arp6[idx];
+	dst = &entry->dst;
+
+	ipv6_dst_set(ctx, dst, addr, e_addr);
+	if (e_addr == NULL) {
+		entry->timer = arp_timer(ctx, entry, ARP_REQUEST_EXPIRE);
+		entry->req_time = 1;
+	} else {
+		entry->timer = arp_timer(ctx, entry, ARP_ENTRY_EXPIRE);
+		entry->inuse = 0;
+	}
+
+	rc = rte_hash_add_key_data(ctx->arp6_hash, addr, (void *)idx);
+	if (rc < 0)
+		rte_panic("Failed to add ARP6 entry");
+
+	eth = (struct ether_hdr *)dst->hdr;
+	print_arp(AF_INET6, addr, &eth->d_addr, "ADD");
+	ctx->arp6_num++;
+}
+
+/* True when @ip already has an entry in hash table @h. */
+static inline int
+arp_ip_exist(const struct rte_hash *h, const void *ip)
+{
+	int32_t pos;
+
+	pos = rte_hash_lookup(h, ip);
+	return pos >= 0 ? 1 : 0;
+}
+
+/*
+ * Handle an incoming NDP packet (NS or NA) located at @l2len/@l3len.
+ * Learns the neighbor MAC and answers NS messages targeted at our own
+ * address. Always consumes @m and returns NULL.
+ *
+ * Fix: payload_len is a big-endian wire field and must be converted
+ * before comparing it with a host-order size, otherwise the length
+ * check is meaningless on little-endian CPUs. The NA reply padding is
+ * also zeroed so stale mbuf bytes are not leaked onto the wire.
+ */
+struct rte_mbuf *
+ndp_recv(struct glue_ctx *ctx, struct rte_mbuf *m,
+	 uint32_t l2len, uint32_t l3len)
+{
+	struct ether_hdr *eth_h;
+	struct ipv6_hdr *ipv6_h;
+	struct nd_neighbor_solicit *ns_h;
+	struct nd_opt_hdr *opth;
+
+	eth_h = rte_pktmbuf_mtod(m, struct ether_hdr *);
+	ipv6_h = rte_pktmbuf_mtod_offset(m, struct ipv6_hdr *, l2len);
+	ns_h = rte_pktmbuf_mtod_offset(m, struct nd_neighbor_solicit *,
+				       l2len + l3len);
+
+	if (rte_be_to_cpu_16(ipv6_h->payload_len) <
+	    sizeof(struct nd_neighbor_solicit))
+		goto drop;
+
+	/* We only learn mac when:
+	 * 1. Normal NS for my ip, whose TargetAddr is me
+	 * 2. Normal NA to my ip, whose DstIpv6 is me
+	 * 3. Unsolicited NA, and we already have an entry for that IP
+	 */
+
+	/* NS message */
+	if (ns_h->nd_ns_hdr.icmp6_type == ND_NEIGHBOR_SOLICIT) {
+		/* not support Duplicate Address Detect NS yet */
+		if (IN6_IS_ADDR_UNSPECIFIED(ipv6_h->src_addr))
+			goto drop;
+
+		if (memcmp(&ns_h->nd_ns_target, &ctx->ipv6, sizeof(ctx->ipv6)))
+			goto drop;
+
+		/* NS for my address: learn the sender's link-layer address.
+		 * NOTE(review): assumes a source link-layer option directly
+		 * follows the NS header -- confirm option validation. */
+		opth = (struct nd_opt_hdr*)(ns_h + 1);
+		ipv6_dst_add(ctx, (struct in6_addr *)ipv6_h->src_addr,
+			     (struct ether_addr *)(opth + 1));
+
+		/* turn the NS into an NA reply in place */
+		ether_addr_copy(&ctx->mac, &eth_h->s_addr);
+		ether_addr_copy((struct ether_addr*)(opth + 1),
+				&eth_h->d_addr);
+
+		rte_memcpy(ipv6_h->dst_addr, ipv6_h->src_addr,
+			   sizeof(struct in6_addr));
+		rte_memcpy(ipv6_h->src_addr, &ctx->ipv6,
+			   sizeof(struct in6_addr));
+
+		ns_h->nd_ns_hdr.icmp6_type = ND_NEIGHBOR_ADVERT;
+		/* solicited + override flags */
+		ns_h->nd_ns_hdr.icmp6_dataun.icmp6_un_data8[0] = 0x60;
+		ns_h->nd_ns_hdr.icmp6_cksum = 0;
+
+		opth->nd_opt_type = ND_OPT_TARGET_LINKLAYER_ADDR;
+		ether_addr_copy(&ctx->mac, (struct ether_addr*)(opth + 1));
+
+		ns_h->nd_ns_hdr.icmp6_cksum = rte_ipv6_udptcp_cksum(ipv6_h, ns_h);
+
+		if (m->pkt_len < ETHER_MIN_LEN) {
+			uint16_t i, pad_len = ETHER_MIN_LEN - m->pkt_len;
+			char *pad = rte_pktmbuf_append(m, pad_len);
+
+			/* zero the padding: appended bytes are uninitialized */
+			if (pad != NULL)
+				for (i = 0; i < pad_len; i++)
+					pad[i] = 0;
+		}
+
+		send_pkts(ctx, &m, 1, "NDP NA reply");
+		return NULL;
+	}
+
+	/* NA message */
+	if (memcmp(ipv6_h->dst_addr, &ctx->ipv6, sizeof(ctx->ipv6)) == 0 ||
+	    (memcmp(ipv6_h->dst_addr, &ipv6_all_multi, sizeof(ctx->ipv6)) == 0 &&
+	     arp_ip_exist(ctx->arp6_hash, &ns_h->nd_ns_target))) {
+		opth = (struct nd_opt_hdr *)(ns_h + 1);
+		ipv6_dst_add(ctx, &ns_h->nd_ns_target,
+			     (struct ether_addr *)(opth + 1));
+	}
+
+drop:
+	rte_pktmbuf_free(m);
+	return NULL;
+}
+
+/*
+ * Handle an incoming ARP packet at offset @l2len. Learns the sender
+ * mapping when we are the target (or for announcements of hosts we
+ * already know) and answers requests for our own address.
+ * Always consumes @m and returns NULL.
+ *
+ * Fixes: constants compared with big-endian header fields now use
+ * rte_cpu_to_be_16() -- the old rte_be_to_cpu_16() produced the same
+ * bytes only because a 16-bit swap is its own inverse; reply padding is
+ * zeroed (and NULL-checked) instead of sending uninitialized mbuf bytes.
+ */
+struct rte_mbuf *
+arp_recv(struct glue_ctx *ctx, struct rte_mbuf *m, uint32_t l2len)
+{
+	struct ether_hdr *eth;
+	struct arp_hdr *ahdr;
+	struct arp_ipv4 *adata;
+	uint32_t tip;
+
+	eth = rte_pktmbuf_mtod(m, struct ether_hdr *);
+	ahdr = rte_pktmbuf_mtod_offset(m, struct arp_hdr *, l2len);
+
+	if (ahdr->arp_hrd != rte_cpu_to_be_16(ARP_HRD_ETHER) ||
+	    ahdr->arp_pro != rte_cpu_to_be_16(ETHER_TYPE_IPv4))
+		goto drop;
+
+	adata = &ahdr->arp_data;
+	tip = adata->arp_tip;
+
+	/* We only learn mac when:
+	 * 1. tip is me, or
+	 * 2. this is a RARP, and we already have an entry for that IP
+	 */
+	if (tip == ctx->ipv4 ||
+	    (tip == INADDR_ANY && arp_ip_exist(ctx->arp_hash, &adata->arp_sip)))
+		ipv4_dst_add(ctx, (struct in_addr *)&adata->arp_sip,
+			     &adata->arp_sha);
+
+	/* We only do ARP reply when:
+	 * 1. tip is me.
+	 */
+	if (ahdr->arp_op == rte_cpu_to_be_16(ARP_OP_REQUEST) &&
+	    tip == ctx->ipv4) {
+		eth->d_addr = eth->s_addr;
+		eth->s_addr = ctx->mac;
+		ahdr->arp_op = rte_cpu_to_be_16(ARP_OP_REPLY);
+
+		adata->arp_tip = adata->arp_sip;
+		adata->arp_sip = tip;
+
+		adata->arp_tha = adata->arp_sha;
+		adata->arp_sha = ctx->mac;
+		if (m->pkt_len < ETHER_MIN_LEN) {
+			uint16_t i, pad_len = ETHER_MIN_LEN - m->pkt_len;
+			char *pad = rte_pktmbuf_append(m, pad_len);
+
+			/* zero the padding: appended bytes are uninitialized */
+			if (pad != NULL)
+				for (i = 0; i < pad_len; i++)
+					pad[i] = 0;
+		}
+		send_pkts(ctx, &m, 1, "ARP reply");
+		return NULL;
+	}
+drop:
+	rte_pktmbuf_free(m);
+	return NULL;
+}
+
+/*
+ * Build and transmit an NDP Neighbor Solicitation probing @addr.
+ * The Ethernet/IPv6 destinations use the 33:33 multicast mapping and
+ * the ff02:: multicast prefix derived from @addr.
+ *
+ * Fixes: payload_len is a big-endian wire field and is also consumed by
+ * rte_ipv6_udptcp_cksum() for the pseudo-header, so it must be stored
+ * with rte_cpu_to_be_16(); the version nibble of vtc_flow is written
+ * portably with rte_cpu_to_be_32() instead of the host-order "6 << 4".
+ */
+static void
+arp6_send_request(struct glue_ctx *ctx, const struct in6_addr *addr)
+{
+	struct rte_mempool *mp = get_mempool_by_socket(0); /* fix me */
+	struct ether_hdr *eth;
+	struct ipv6_hdr *ip6h;
+	struct nd_neighbor_solicit *nsh;
+	struct nd_opt_hdr *opth;
+	struct ether_addr *sll_addr;
+	struct rte_mbuf *m;
+#ifdef ENABLE_TRACE
+	char str_ip[64];
+#endif
+
+	m = rte_pktmbuf_alloc(mp);
+	if (m == NULL)
+		rte_panic("Failed to alloc mbuf for ndp ns request");
+
+	eth = (struct ether_hdr *)rte_pktmbuf_append(m, sizeof(*eth));
+	ether_addr_copy(&ctx->mac, &eth->s_addr);
+	set_multicast_mac_v6(&eth->d_addr, addr);
+	eth->ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv6);
+
+	ip6h = (struct ipv6_hdr*)rte_pktmbuf_append(m, sizeof(struct ipv6_hdr));
+	ip6h->vtc_flow = rte_cpu_to_be_32(6u << 28);
+	ip6h->payload_len = rte_cpu_to_be_16(sizeof(struct nd_neighbor_solicit) +
+					     sizeof(struct nd_opt_hdr) +
+					     sizeof(struct ether_addr));
+	ip6h->proto = IPPROTO_ICMPV6;
+	ip6h->hop_limits = 255;
+	rte_memcpy(ip6h->src_addr, &ctx->ipv6, sizeof(struct in6_addr));
+	rte_memcpy(ip6h->dst_addr, addr, sizeof(struct in6_addr));
+	set_multicast_ipv6(ip6h->dst_addr);
+
+	nsh = (struct nd_neighbor_solicit *)rte_pktmbuf_append(m, sizeof(*nsh));
+	nsh->nd_ns_hdr.icmp6_type = ND_NEIGHBOR_SOLICIT;
+	nsh->nd_ns_hdr.icmp6_code = 0;
+	nsh->nd_ns_hdr.icmp6_cksum = 0;
+	nsh->nd_ns_hdr.icmp6_dataun.icmp6_un_data32[0] = 0;
+	rte_memcpy(&nsh->nd_ns_target, addr, sizeof(struct in6_addr));
+
+	/* source link-layer address option */
+	opth = (struct nd_opt_hdr *)rte_pktmbuf_append(m, sizeof(*opth));
+	opth->nd_opt_type = ND_OPT_SOURCE_LINKLAYER_ADDR;
+	opth->nd_opt_len = 1;
+
+	sll_addr = (struct ether_addr *)rte_pktmbuf_append(m, sizeof(*sll_addr));
+	ether_addr_copy(&ctx->mac, sll_addr);
+
+	nsh->nd_ns_hdr.icmp6_cksum = rte_ipv6_udptcp_cksum(ip6h, nsh);
+
+	send_pkts(ctx, &m, 1, "ARP6 request");
+}
+
+/*
+ * Build and transmit a broadcast ARP request probing @addr, padded with
+ * zeros up to the Ethernet minimum frame length.
+ *
+ * Fixes: constants stored into big-endian header fields now use
+ * rte_cpu_to_be_16() (the old rte_be_to_cpu_16() yielded identical bytes
+ * only because a 16-bit swap is its own inverse); the padding pointer is
+ * NULL-checked before the bytes are written.
+ */
+static void
+arp_send_request(struct glue_ctx *ctx, const struct in_addr *addr)
+{
+	struct rte_mempool *mp = get_mempool_by_socket(0); /* fix me */
+	struct ether_hdr *eth;
+	struct arp_hdr *ahdr;
+	struct arp_ipv4 *adata;
+	struct rte_mbuf *m;
+	uint16_t pad_len, i;
+	char *pad;
+
+	m = rte_pktmbuf_alloc(mp);
+	if (m == NULL)
+		rte_panic("Failed to alloc mbuf for arp request");
+
+	eth = (struct ether_hdr *)rte_pktmbuf_append(m, sizeof(*eth));
+	ether_addr_copy(&ctx->mac, &eth->s_addr);
+	set_broadcast_addr(&eth->d_addr);
+	eth->ether_type = rte_cpu_to_be_16(ETHER_TYPE_ARP);
+
+	ahdr = (struct arp_hdr *)rte_pktmbuf_append(m, sizeof(*ahdr));
+	ahdr->arp_hrd = rte_cpu_to_be_16(ARP_HRD_ETHER);
+	ahdr->arp_pro = rte_cpu_to_be_16(ETHER_TYPE_IPv4);
+	ahdr->arp_hln = sizeof(struct ether_addr);
+	ahdr->arp_pln = sizeof(*addr);
+	ahdr->arp_op = rte_cpu_to_be_16(ARP_OP_REQUEST);
+	adata = &ahdr->arp_data;
+	ether_addr_copy(&ctx->mac, &adata->arp_sha);
+	adata->arp_sip = ctx->ipv4;
+	set_broadcast_addr(&adata->arp_tha);
+	adata->arp_tip = addr->s_addr;
+
+	/* pad to the 60-byte Ethernet minimum, zeroing the padding */
+	pad_len = ETHER_MIN_LEN - sizeof(*eth) - sizeof(*ahdr);
+	pad = rte_pktmbuf_append(m, pad_len);
+	if (pad != NULL)
+		for (i = 0; i < pad_len; ++i)
+			pad[i] = 0;
+
+	send_pkts(ctx, &m, 1, "ARP request");
+}
+
+/* extract the v4/v6 address from a generic sockaddr (no family check) */
+#define addr2ipv4(addr) (&((const struct sockaddr_in *)addr)->sin_addr)
+#define addr2ipv6(addr) (&((const struct sockaddr_in6 *)addr)->sin6_addr)
+/*
+ * Make sure the next hop of @addr is (or is becoming) resolvable:
+ * if its gateway has no ARP/NDP entry yet, kick off a request.
+ */
+void
+mac_check(struct glue_ctx *ctx, const struct sockaddr *addr)
+{
+	if (addr->sa_family == AF_INET) {
+		const struct in_addr *nh4;
+
+		nh4 = ipv4_gateway_lookup(ctx, addr2ipv4(addr));
+		if (rte_hash_lookup(ctx->arp_hash, nh4) < 0)
+			arp_send_request(ctx, nh4);
+	} else {
+		const struct in6_addr *nh6;
+
+		nh6 = ipv6_gateway_lookup(ctx, addr2ipv6(addr));
+		if (rte_hash_lookup(ctx->arp6_hash, nh6) < 0)
+			arp6_send_request(ctx, nh6);
+	}
+}
+
+/*
+ * Try to learn the MAC for IPv4 @addr from another context's ARP table;
+ * install it locally and return 0 on success, -1 when no context knows it.
+ *
+ * Fix: the scan incremented the index twice per iteration ("i++" both in
+ * the for-statement and in the body), silently skipping every other
+ * context. The dead "next == NULL" test (address of an array element) is
+ * dropped as well.
+ */
+static int
+arp_inherit(struct glue_ctx *ctx, const struct in_addr *addr)
+{
+	struct glue_ctx *next;
+	struct tle_dest *dst;
+	struct ether_hdr *eth;
+	uint64_t idx;
+	uint16_t i;
+	int rc;
+
+	for (i = 0; i < nb_ctx; i++) {
+		next = &ctx_array[i];
+		if (next == ctx)
+			continue;
+
+		rc = rte_hash_lookup_data(next->arp_hash, addr, (void **)&idx);
+		if (rc < 0)
+			continue;
+
+		dst = &next->arp4[idx].dst;
+		eth = (struct ether_hdr *)dst->hdr;
+		ipv4_dst_add(ctx, addr, &eth->d_addr);
+		return 0;
+	}
+
+	return -1;
+}
+
+/*
+ * Try to learn the MAC for IPv6 @addr from another context's neighbor
+ * table; install it locally and return 0 on success, -1 otherwise.
+ *
+ * Fix: same double-increment bug as arp_inherit() -- "i++" appeared in
+ * both the for-statement and the body, skipping every other context.
+ */
+static int
+arp6_inherit(struct glue_ctx *ctx, const struct in6_addr *addr)
+{
+	struct glue_ctx *next;
+	struct ether_hdr *eth;
+	struct tle_dest *dst;
+	uint64_t idx;
+	uint16_t i;
+	int rc;
+
+	for (i = 0; i < nb_ctx; i++) {
+		next = &ctx_array[i];
+		if (next == ctx)
+			continue;
+
+		rc = rte_hash_lookup_data(next->arp6_hash, addr, (void **)&idx);
+		if (rc < 0)
+			continue;
+
+		dst = &next->arp6[idx].dst;
+		eth = (struct ether_hdr *)dst->hdr;
+		ipv6_dst_add(ctx, addr, &eth->d_addr);
+		return 0;
+	}
+
+	return -1;
+}
+
+/* meaningful bytes of a tle_dest: metadata + pre-built L2/L3 template */
+#define len_dest(dst) \
+	(offsetof(struct tle_dest, hdr) + dst->l2_len + dst->l3_len)
+
+/*
+ * Fill @res with a ready-to-use destination (header template + device)
+ * for IPv6 @addr. Loopback addresses use the context loopback template;
+ * cached neighbors are copied from the table (marking the entry in use);
+ * unknown neighbors get an unresolved broadcast-MAC template that will
+ * make mac_fill() trigger NDP resolution. The L4 proto byte of the
+ * template is patched to @proto in every path.
+ */
+int
+arp_ipv6_dst_lookup(void *data, const struct in6_addr *addr,
+		    struct tle_dest *res, int proto)
+{
+	int32_t rc;
+	uint64_t idx;
+	struct tle_dest *dst;
+	struct ipv6_hdr *ip6h;
+	struct glue_ctx *ctx = data;
+
+	if (is_ipv6_loopback_addr(addr, ctx)) {
+		dst = &ctx->lb_dst_v6;
+		rte_memcpy(res, dst, len_dest(dst));
+		if (proto == IPPROTO_TCP)
+			res->dev = ctx->lb_tcp_dev;
+		else
+			res->dev = ctx->lb_udp_dev;
+		rc = 0;
+		goto set_proto;
+	}
+
+	rc = rte_hash_lookup_data(ctx->arp6_hash, addr, (void **)&idx);
+	if (rc >= 0) {
+		/* mark the entry in use so mac_timeout() re-probes it */
+		if (!ctx->arp6[idx].inuse)
+			ctx->arp6[idx].inuse = 1;
+		dst = &ctx->arp6[idx].dst;
+		rte_memcpy(res, dst, len_dest(dst));
+	} else {
+		/* not cached: hand back an unresolved template */
+		memset(res, 0, sizeof(*res));
+		ipv6_dst_set(ctx, res, addr, NULL);
+		rc = 0;
+	}
+
+	if (proto == IPPROTO_TCP)
+		res->dev = ctx->tcp_dev;
+	else
+		res->dev = ctx->udp_dev;
+
+set_proto:
+	ip6h = (struct ipv6_hdr *)&res->hdr[res->l2_len];
+	ip6h->proto = proto;
+	return rc;
+}
+
+/*
+ * Fill @res with a ready-to-use destination (header template + device)
+ * for IPv4 @addr. Mirrors arp_ipv6_dst_lookup(): loopback template for
+ * loopback addresses, cached template for known neighbors (entry marked
+ * in use), unresolved broadcast-MAC template otherwise. The L4 proto
+ * byte of the template is patched to @proto in every path.
+ */
+int
+arp_ipv4_dst_lookup(void *data, const struct in_addr *addr,
+		    struct tle_dest *res, int proto)
+{
+	int32_t rc;
+	uint64_t idx;
+	struct tle_dest *dst;
+	struct ipv4_hdr *ip4h;
+	struct glue_ctx *ctx = data;
+
+	if (is_ipv4_loopback_addr(addr->s_addr, ctx)) {
+		dst = &ctx->lb_dst;
+		rte_memcpy(res, dst, len_dest(dst));
+		if (proto == IPPROTO_TCP)
+			res->dev = ctx->lb_tcp_dev;
+		else
+			res->dev = ctx->lb_udp_dev;
+		rc = 0;
+		goto set_proto;
+	}
+
+	rc = rte_hash_lookup_data(ctx->arp_hash, addr, (void **)&idx);
+	if (rc >= 0) {
+		/* mark the entry in use so mac_timeout() re-probes it */
+		if (!ctx->arp4[idx].inuse)
+			ctx->arp4[idx].inuse = 1;
+		dst = &ctx->arp4[idx].dst;
+		rte_memcpy(res, dst, len_dest(dst));
+	} else {
+		/* not cached: hand back an unresolved template */
+		memset(res, 0, sizeof(*res));
+		ipv4_dst_set(ctx, res, addr, NULL);
+		rc = 0;
+	}
+
+	if (proto == IPPROTO_TCP)
+		res->dev = ctx->tcp_dev;
+	else
+		res->dev = ctx->udp_dev;
+
+set_proto:
+	ip4h = (struct ipv4_hdr *)&res->hdr[res->l2_len];
+	ip4h->next_proto_id = proto;
+	return rc;
+}
+
+/*
+ * Resolve the destination MAC of outgoing packet @m.
+ *
+ * Returns 0 when the Ethernet destination is (already or now) filled in;
+ * returns -1 while resolution is in flight, in which case a request was
+ * (re)sent and the caller is expected to park @m on ctx->arp_wait.
+ */
+int
+mac_fill(struct glue_ctx *ctx, struct rte_mbuf *m)
+{
+	int32_t rc;
+	uint64_t idx;
+	uint8_t ipver;
+	struct arp_entry* entry;
+	struct ether_addr *dst, *dst1;
+	struct ipv4_hdr *ipv4_hdr;
+	struct ipv6_hdr *ipv6_hdr;
+	const struct in_addr *addr4 = NULL;
+	const struct in6_addr *addr6 = NULL;
+
+	/* a non-broadcast MAC means the template was already resolved */
+	dst = rte_pktmbuf_mtod(m, struct ether_addr *);
+	if (!is_broadcast_ether_addr(dst))
+		return 0;
+
+	ipv4_hdr = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *, m->l2_len);
+	ipv6_hdr = (struct ipv6_hdr*)ipv4_hdr;
+	ipver = ipv4_hdr->version_ihl >> 4;
+
+retry:
+	/* look up the (post-routing) next hop in the neighbor table;
+	 * 'entry' is only read below when rc >= 0, i.e. when it was set */
+	if (ipver == 4) {
+		addr4 = (struct in_addr *)&ipv4_hdr->dst_addr;
+		addr4 = ipv4_gateway_lookup(ctx, addr4);
+		rc = rte_hash_lookup_data(ctx->arp_hash, addr4, (void **)&idx);
+		if (rc >= 0)
+			entry = &ctx->arp4[idx];
+	} else {
+		addr6 = (struct in6_addr *)ipv6_hdr->dst_addr;
+		addr6 = ipv6_gateway_lookup(ctx, addr6);
+		rc = rte_hash_lookup_data(ctx->arp6_hash, addr6, (void **)&idx);
+		if (rc >= 0)
+			entry = &ctx->arp6[idx];
+	}
+
+	if (rc >= 0) {
+		dst1 = (struct ether_addr *)entry->dst.hdr;
+		if (!is_broadcast_ether_addr(dst1)) {
+			ether_addr_copy(dst1 , dst);
+			return 0;
+		}
+
+		/* entry exists but is still unresolved: re-probe */
+		if (ipver == 4)
+			arp_send_request(ctx, addr4);
+		else
+			arp6_send_request(ctx, addr6);
+		entry->req_time++;
+		if (entry->timer != NULL)
+			tle_timer_stop(ctx->arp_tmw, entry->timer);
+		entry->timer = arp_timer(ctx, entry, ARP_REQUEST_EXPIRE);
+	} else {
+		/* unknown: first try to copy from a sibling context,
+		 * otherwise create a request slot and probe */
+		if (ipver == 4) {
+			if (arp_inherit(ctx, addr4) == 0)
+				goto retry;
+			ipv4_dst_add(ctx, addr4, NULL);
+			arp_send_request(ctx, addr4);
+		} else {
+			if (arp6_inherit(ctx, addr6) == 0)
+				goto retry;
+			ipv6_dst_add(ctx, addr6, NULL);
+			arp6_send_request(ctx, addr6);
+		}
+	}
+
+	return -1;
+}
+
+/* Recover the entry's IPv4 address from its cached header template
+ * (the dst_addr field of the pre-built IPv4 header). */
+static inline const struct in_addr *
+get_addr_from_entry(struct arp_entry *e)
+{
+	const struct ipv4_hdr *ipv4;
+	const struct in_addr *addr;
+
+	ipv4 = (struct ipv4_hdr *)(e->dst.hdr + e->dst.l2_len);
+	addr = (const struct in_addr *)&ipv4->dst_addr;
+	return addr;
+}
+
+/* Recover the entry's IPv6 address from its cached header template
+ * (the dst_addr field of the pre-built IPv6 header). */
+static inline const struct in6_addr *
+get_addr6_from_entry(struct arp_entry *e)
+{
+	const struct ipv6_hdr *ipv6;
+	const struct in6_addr *addr;
+
+	ipv6 = (struct ipv6_hdr *)(e->dst.hdr + e->dst.l2_len);
+	addr = (const struct in6_addr *)ipv6->dst_addr;
+	return addr;
+}
+
+/*
+ * Drop all packets parked on ctx->arp_wait whose next hop is @addr.
+ * Called when resolution is finally given up (req_time exceeded
+ * ARP_MAX_REQ_TIMES). @af selects IPv4/IPv6 matching.
+ */
+static void
+drop_arp_wait(int af, struct glue_ctx *ctx, const void *addr)
+{
+	struct rte_mbuf *pkt, *pre;
+
+	for (pre = NULL, pkt = ctx->arp_wait; pkt; pkt = pkt->next_pkt) {
+		if ((af == AF_INET &&
+		     !match_addr(ctx, pkt, (const struct in_addr *)addr)) ||
+		    (af == AF_INET6 &&
+		     !match_addr6(ctx, pkt, (const struct in6_addr *)addr))) {
+			pre = pkt;
+			continue;
+		}
+
+		/* unlink the matched packet, then free it */
+		if (pre == NULL)
+			ctx->arp_wait = pkt->next_pkt;
+		else
+			pre->next_pkt = pkt->next_pkt;
+
+		rte_pktmbuf_free(pkt);
+	}
+}
+
+/*
+ * Remove entry @e from the table of family @af, keeping the array
+ * contiguous by moving the last entry into the freed slot and updating
+ * its hash mapping.
+ *
+ * Subtlety: @addr points INTO e->dst.hdr, so after the last entry is
+ * copied over @e, @addr denotes the MOVED entry's IP -- which is exactly
+ * the key that must be re-inserted with the new index.
+ */
+static void
+arp_entry_del(struct glue_ctx *ctx, int af, struct arp_entry *e)
+{
+	const void *addr;
+	struct arp_entry *t;
+	uint32_t idx, last_idx;
+	const struct rte_hash *h;
+
+	if (af == AF_INET) {
+		addr = get_addr_from_entry(e);
+		t = ctx->arp4;
+		h = ctx->arp_hash;
+		last_idx = ctx->arp4_num - 1;
+	} else {
+		addr = get_addr6_from_entry(e);
+		t = ctx->arp6;
+		h = ctx->arp6_hash;
+		last_idx = ctx->arp6_num - 1;
+	}
+
+	idx = e - t;
+	if (idx > last_idx) /* entry has been moved */
+		return;
+
+	print_arp(af, addr, (struct ether_addr *)e->dst.hdr, "DELETE");
+
+	/* resolution was abandoned: discard packets still waiting on it */
+	if (e->req_time > ARP_MAX_REQ_TIMES)
+		drop_arp_wait(af, ctx, addr);
+
+	rte_hash_del_key(h, addr);
+
+	if (idx < last_idx) {
+		/* replace current entry with last entry */
+		rte_memcpy(e, t + last_idx, sizeof(*e));
+		rte_hash_add_key_data(h, addr, (void *)(uintptr_t)idx);
+		tle_timer_stop(ctx->arp_tmw, t[last_idx].timer);
+		if (e->req_time > 0)
+			e->timer = arp_timer(ctx, e, ARP_REQUEST_EXPIRE);
+		else {
+			e->timer = arp_timer(ctx, e, ARP_ENTRY_EXPIRE);
+			e->inuse = 0;
+		}
+	}
+
+	/* we always delete the last entry to keep it contiguous */
+	t[last_idx].timer = NULL;
+	t[last_idx].inuse = 0;
+	t[last_idx].req_time = 0;
+	if (af == AF_INET)
+		ctx->arp4_num--;
+	else
+		ctx->arp6_num--;
+}
+
+/*
+ * Service expired ARP/NDP timers: entries that are in use, or still
+ * within their request retry budget, are re-probed; all others are
+ * deleted. Handles at most ARP_PROCESS_MAX entries per call.
+ */
+void
+mac_timeout(struct glue_ctx *ctx)
+{
+#define ARP_PROCESS_MAX	32
+	struct arp_entry *entry[ARP_PROCESS_MAX], *e;
+	struct tle_timer_wheel *tw;
+	const struct in_addr *addr4;
+	const struct in6_addr *addr6;
+	uint32_t i, cnt;
+	uint8_t *l3h;
+
+	tw = ctx->arp_tmw;
+	tle_timer_expire(tw, rte_get_tsc_cycles() >> ctx->cycles_ms_shift);
+	cnt = tle_timer_get_expired_bulk(tw, (void**)entry, ARP_PROCESS_MAX);
+	if (cnt == 0)
+		return;
+
+	for(i = 0; i < cnt; i++) {
+		e = entry[i];
+		e->timer = NULL;
+		/* family is derived from the version nibble of the cached
+		 * header template */
+		l3h = e->dst.hdr + e->dst.l2_len;
+		if (e->inuse ||
+		    (e->req_time > 0 && e->req_time <= ARP_MAX_REQ_TIMES)) {
+			if (((struct ipv4_hdr *)l3h)->version_ihl >> 4 == 4) {
+				addr4 = get_addr_from_entry(e);
+				arp_send_request(ctx, addr4);
+			} else {
+				addr6 = get_addr6_from_entry(e);
+				arp6_send_request(ctx, addr6);
+			}
+
+			e->timer = arp_timer(ctx, e, ARP_REQUEST_EXPIRE);
+			e->inuse = 0;
+			e->req_time++;
+		} else {
+			if (((struct ipv4_hdr *)l3h)->version_ihl >> 4 == 4)
+				arp_entry_del(ctx, AF_INET, e);
+			else
+				arp_entry_del(ctx, AF_INET6, e);
+		}
+	}
+}
diff --git a/lib/libtle_glue/be.c b/lib/libtle_glue/be.c
new file mode 100644
index 0000000..7e2227e
--- /dev/null
+++ b/lib/libtle_glue/be.c
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <rte_ethdev.h>
+#include <rte_ip.h>
+
+#include <tle_tcp.h>
+#include <tle_udp.h>
+
+#include "config.h"
+#include "log.h"
+#include "util.h"
+#include "internal.h"
+
+/* Deep-copy one mbuf segment: clone all metadata from data_off onward,
+ * reset the refcount and indirect flag, then copy the payload bytes. */
+static inline void
+rte_pktmbuf_copy_seg(struct rte_mbuf *dst, struct rte_mbuf* src)
+{
+	size_t offset = offsetof(struct rte_mbuf, data_off);
+	rte_memcpy((char*)dst + offset, (char*)src + offset,
+		   sizeof(struct rte_mbuf) - offset);
+	rte_mbuf_refcnt_set(dst, 1);
+	dst->ol_flags &= ~IND_ATTACHED_MBUF;
+	rte_memcpy(rte_pktmbuf_mtod(dst, void*), rte_pktmbuf_mtod(src, void*),
+		   src->data_len);
+}
+
+/*
+ * Deep-copy packet @md (all segments) into mbufs allocated from @mp.
+ * Returns the copy, or NULL (freeing any partial chain) when an
+ * allocation fails.
+ */
+static inline struct rte_mbuf*
+rte_pktmbuf_copy(struct rte_mbuf *md, struct rte_mempool* mp)
+{
+	struct rte_mbuf *mc, *mi, **prev;
+	uint32_t pktlen;
+	uint16_t nseg;
+
+	if (unlikely ((mc = rte_pktmbuf_alloc(mp)) == NULL))
+		return NULL;
+
+	mi = mc;
+	prev = &mi->next;
+	pktlen = md->pkt_len;
+	nseg = 0;
+
+	/* copy segment by segment; stops early when an alloc fails */
+	do {
+		nseg++;
+		rte_pktmbuf_copy_seg(mi, md);
+		*prev = mi;
+		prev = &mi->next;
+	} while ((md = md->next) != NULL &&
+		 (mi = rte_pktmbuf_alloc(mp)) != NULL);
+
+	*prev = NULL;
+	mc->nb_segs = nseg;
+	mc->pkt_len = pktlen;
+
+	/* Allocation of new indirect segment failed */
+	if (unlikely(mi == NULL)) {
+		rte_pktmbuf_free(mc);
+		return NULL;
+	}
+
+	__rte_mbuf_sanity_check(mc, 1);
+	return mc;
+}
+
+/*
+ * Classify a received burst into TCP/UDP and feed it to the stack;
+ * everything else (ICMP, fragments: TODO) is dropped. Packets arriving
+ * over the loopback device are first deep-copied (so the originals can
+ * return to the sender's pool) and marked with good checksums.
+ * Returns the number of packets accepted by the stack.
+ *
+ * NOTE(review): the TRACE() calls below reference port_id/queue_id,
+ * which are not in scope in this function -- this cannot compile with
+ * ENABLE_TRACE defined; verify.
+ * NOTE(review): the return value jt + ju - jd also subtracts packets
+ * dropped before classification -- confirm callers only treat it as an
+ * activity hint.
+ */
+static inline int
+process_rx_pkts(struct glue_ctx *ctx, struct rte_mbuf *pkts[],
+		uint32_t n, uint8_t from_loopback)
+{
+	uint32_t i, j, k, jt, ju, jd;
+	struct rte_mbuf *tcp[MAX_PKTS_BURST];
+	struct rte_mbuf *udp[MAX_PKTS_BURST];
+	struct rte_mbuf *drop[MAX_PKTS_BURST];
+	int32_t rc[MAX_PKTS_BURST];
+	struct tle_dev *tcp_dev, *udp_dev;
+	struct rte_mempool *mp;
+	struct rte_mbuf *tmp;
+	uint64_t ts;
+
+	if (n == 0)
+		return 0;
+
+	if (unlikely(from_loopback)) {
+		tcp_dev = ctx->lb_tcp_dev;
+		udp_dev = ctx->lb_udp_dev;
+		mp = pkts[0]->pool;
+		for (i = 0; i < n; i++) {
+			tmp = rte_pktmbuf_copy(pkts[i], mp);
+			if (tmp != NULL) {
+				rte_pktmbuf_free(pkts[i]);
+				pkts[i] = tmp;
+				pkts[i]->ol_flags |= PKT_RX_IP_CKSUM_GOOD;
+				pkts[i]->ol_flags |= PKT_RX_L4_CKSUM_GOOD;
+			} else {
+				/* copy failed: free the rest and truncate */
+				k = i;
+				for (; i < n; i++) {
+					rte_pktmbuf_free(pkts[i]);
+				}
+				n = k;
+			}
+		}
+	} else {
+		tcp_dev = ctx->tcp_dev;
+		udp_dev = ctx->udp_dev;
+	}
+
+	ts = rte_get_tsc_cycles() >> (ctx->cycles_ms_shift - 10);
+
+	/* split the burst by L4 type */
+	for (j = 0, jt = 0, ju = 0, jd = 0; j < n; j++) {
+		pkts[j]->timestamp = ts;
+		switch (pkts[j]->packet_type & RTE_PTYPE_L4_MASK) {
+		case RTE_PTYPE_L4_TCP:
+			tcp[jt++] = pkts[j];
+			break;
+		case RTE_PTYPE_L4_UDP:
+			udp[ju++] = pkts[j];
+			break;
+		case RTE_PTYPE_L4_ICMP:
+			/* TODO */
+		case RTE_PTYPE_L4_FRAG:
+			/* TODO */
+		default:
+			drop[jd++] = pkts[j];
+		}
+	}
+
+	if (jt > 0) {
+		k = tle_tcp_rx_bulk(tcp_dev, tcp, drop + jd, rc, jt);
+		jd += jt - k;
+
+		TRACE("(port=%u, queue=%u), %u/%u (TCP) pkts are received",
+		      port_id, queue_id, k, n);
+	}
+
+	if (ju > 0) {
+		k = tle_udp_rx_bulk(udp_dev, udp, drop + jd, rc, ju);
+		jd += ju - k;
+
+		TRACE("(port=%u, queue=%u), %u/%u (UDP) pkts are received",
+		      port_id, queue_id, k, n);
+	}
+
+	for (j = 0; j < jd; j++)
+		rte_pktmbuf_free(drop[j]);
+
+	return jt + ju - jd;
+}
+
+/* Poll the NIC RX queue once and feed received packets into the stack.
+ * Returns the number of packets accepted by the TCP/UDP layers. */
+static inline int
+be_rx(struct glue_ctx *ctx)
+{
+	struct rte_mbuf *pkts[MAX_PKTS_BURST];
+	uint32_t nb_rx;
+
+	nb_rx = rte_eth_rx_burst(ctx->port_id, ctx->queue_id,
+				 pkts, RTE_DIM(pkts));
+	return process_rx_pkts(ctx, pkts, nb_rx, 0);
+}
+
+/*
+ * Drain TX: first the loopback TCP/UDP devices (looped packets are
+ * received back immediately), then the NIC devices. Packets whose
+ * destination MAC is still unresolved are parked on ctx->arp_wait.
+ * Returns the number of packets pulled out of the stack.
+ */
+int
+be_tx(struct glue_ctx *ctx)
+{
+	uint32_t n, j, k, s, ret;
+	const uint16_t max_pkts = MAX_PKTS_BURST;
+	struct rte_mbuf *pkts[max_pkts];
+	struct rte_mbuf *_pkts[max_pkts];
+	uint16_t port_id = ctx->port_id;
+	uint16_t queue_id = ctx->queue_id;
+
+	ret = 0;
+	tle_tcp_process(ctx->tcp_ctx, TCP_MAX_PROCESS);
+
+	n = tle_tcp_tx_bulk(ctx->lb_tcp_dev, pkts, max_pkts);
+	n += tle_udp_tx_bulk(ctx->lb_udp_dev, pkts + n, max_pkts - n);
+	if (n > 0) {
+		ret += n;
+		rte_eth_tx_burst(ctx->lb_port_id, 0, pkts, n);
+		/* loopback device could receive after transmit immediately */
+		n = rte_eth_rx_burst(ctx->lb_port_id, 0, pkts, RTE_DIM(pkts));
+		process_rx_pkts(ctx, pkts, n, 1);
+
+		/* wake up look-aside backend */
+		wake_lookaside_backend(ctx);
+	}
+
+	n = tle_tcp_tx_bulk(ctx->tcp_dev, pkts, max_pkts);
+	n += tle_udp_tx_bulk(ctx->udp_dev, pkts + n, max_pkts - n);
+	if (n == 0)
+		return 0;
+
+	ret += n;
+	s = 0;
+	for (j = 0; j != n; j++) {
+		if (mac_fill(ctx, pkts[j]) == 0) {
+			PKT_DUMP(pkts[j]);
+			_pkts[s++] = pkts[j];
+			continue;
+		}
+
+		/* MAC unresolved: park on arp_wait until ARP/NDP answers */
+		pkts[j]->next_pkt = ctx->arp_wait;
+		ctx->arp_wait = pkts[j];
+	}
+
+	/* For virtio-user/vhost-kernel test case, it's normal that vhost
+	 * kthread cannot catch up with packets generation speed in stack.
+	 * Shall we drop those packets immediately or retry some times to
+	 * keep those packets? We find dropping packets here is not a good
+	 * idea, which leads to lots of retrans and inefficiency of vhost
+	 * kthread. Even below code does not work well:
+	 *
+	 * for (k = 0, retry = 0; k < s && retry < 10000; retry++)
+	 *	k += rte_eth_tx_burst(port_id, queue_id, _pkts + k, s - k);
+	 *
+	 * So we choose to blockingly send out packets.
+	 */
+	k = 0;
+	while (k < s)
+		k += rte_eth_tx_burst(port_id, queue_id, _pkts + k, s - k);
+
+	/* k == s after the loop above, so this free loop is dead code */
+	for (j = k; j != s; j++)
+		rte_pktmbuf_free(_pkts[j]);
+
+	TRACE("(port=%u, queue=%u), %u/%u pkts are sent",
+	      port_id, queue_id, k, s);
+
+	return ret;
+}
+
+/* One backend iteration: RX poll, ARP/NDP timer maintenance, TX drain.
+ * Returns the total packet count moved, or 0 when the stack is stopped. */
+int
+be_process(struct glue_ctx *ctx)
+{
+	int cnt;
+
+	if (unlikely(stopped))
+		return 0;
+
+	cnt = be_rx(ctx);
+	mac_timeout(ctx);
+	cnt += be_tx(ctx);
+
+	return cnt;
+}
diff --git a/lib/libtle_glue/config.h b/lib/libtle_glue/config.h
new file mode 100644
index 0000000..976495e
--- /dev/null
+++ b/lib/libtle_glue/config.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _TLE_GLUE_CONFIG_H_
+#define _TLE_GLUE_CONFIG_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Compile-time tunables for the glue layer. */
+
+/* NOTE(review): consider parenthesizing as (64 * 1024); unparenthesized,
+ * this macro expands surprisingly in expressions such as
+ * `x / MAX_STREAMS_PER_CORE`.
+ */
+#define MAX_STREAMS_PER_CORE 64 * 1024
+#define MIN_STREAMS_PER_CORE 16
+#define DELTA_STREAMS 64
+#define FRAG_BUCKET 8
+#define FRAG_ENTRIES_PER_BUCKET 8
+#define MAX_ARP_ENTRY (1 << 10)
+
+/* RCV buffer & SND buffer
+ * This is not a real rcv/snd buffer implementation. The numbers below are
+ * the slots to store mbufs of sent or received data. Each slot could
+ * contain a single mbuf with size of (1500B or 2048B) or a chained
+ * mbuf with size <= 64KB.
+ *
+ * TODO: add real snd/rcv buffer
+ */
+#define MAX_RECV_BUFS_PER_STREAM 256
+#define MAX_SEND_BUFS_PER_STREAM 256
+
+/* With the look-aside backend a single context is used; otherwise each
+ * epoll instance may allocate its own context, up to this many.
+ */
+#ifdef LOOK_ASIDE_BACKEND
+#define MAX_NB_CTX 1
+#else
+#define MAX_NB_CTX 16
+#endif
+
+#define MAX_MBUFS 0x80000
+/* should be calculated by:
+ * MAX_NB_CTX * MAX_STREAMS_PER_CORE * (MAX_RECV_BUFS_PER_STREAM + MAX_SEND_BUFS_PER_STREAM))
+ */
+
+#define MBUF_DYNAMIC_SIZE 0x800
+
+#define MBUF_PERCORE_CACHE 32
+
+#define MAX_PKTS_BURST 0x20
+
+/* Max streams processed per tle_tcp_process() call (see be_tx). */
+#define TCP_MAX_PROCESS 32
+
+#define ARP_ENTRY_EXPIRE 60000U
+#define ARP_REQUEST_EXPIRE 1000U /* ms */
+#define ARP_MAX_REQ_TIMES 5
+
+/* MTU of the physical port vs. the loopback device. */
+#define MTU_NORMAL 1500
+#define MTU_LOOPBACK 65535
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*_TLE_GLUE_CONFIG_H_ */
diff --git a/lib/libtle_glue/ctx.c b/lib/libtle_glue/ctx.c
new file mode 100644
index 0000000..dc78f39
--- /dev/null
+++ b/lib/libtle_glue/ctx.c
@@ -0,0 +1,535 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <stdlib.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+#include <rte_malloc.h>
+#include <rte_random.h>
+#include <rte_cycles.h>
+#include <rte_ethdev.h>
+#include <rte_hash.h>
+#include <rte_spinlock.h>
+
+#include "config.h"
+#include "ctx.h"
+#include "log.h"
+#include "util.h"
+#include "internal.h"
+#include "gateway.h"
+#include "tle_timer.h"
+
+RTE_DEFINE_PER_LCORE(struct glue_ctx *, glue_ctx);
+
+int nb_ctx;
+struct glue_ctx ctx_array[MAX_NB_CTX];
+struct glue_ctx *default_ctx = &ctx_array[0];
+
+/* Destination-lookup callbacks registered with the TLE context (see
+ * proto_ctx_create): first resolve the next hop through the gateway
+ * table, then consult the ARP-backed destination cache.  @data is the
+ * owning struct glue_ctx, passed back as lookup4_data/lookup6_data.
+ */
+static int
+ipv4_dst_lookup_tcp(void *data, const struct in_addr *addr,
+		    struct tle_dest *res)
+{
+	addr = ipv4_gateway_lookup(data, addr);
+	return arp_ipv4_dst_lookup(data, addr, res, IPPROTO_TCP);
+}
+
+static int
+ipv4_dst_lookup_udp(void *data, const struct in_addr *addr,
+		    struct tle_dest *res)
+{
+	addr = ipv4_gateway_lookup(data, addr);
+	return arp_ipv4_dst_lookup(data, addr, res, IPPROTO_UDP);
+}
+
+static int
+ipv6_dst_lookup_tcp(void *data, const struct in6_addr *addr,
+		    struct tle_dest *res)
+{
+	addr = ipv6_gateway_lookup(data, addr);
+	return arp_ipv6_dst_lookup(data, addr, res, IPPROTO_TCP);
+}
+
+static int
+ipv6_dst_lookup_udp(void *data, const struct in6_addr *addr,
+		    struct tle_dest *res)
+{
+	addr = ipv6_gateway_lookup(data, addr);
+	return arp_ipv6_dst_lookup(data, addr, res, IPPROTO_UDP);
+}
+
+/* Create a TLE l4p context for @proto (TLE_PROTO_TCP or TLE_PROTO_UDP)
+ * on NUMA node @socket_id.  @data (the owning glue_ctx) is handed to the
+ * per-protocol destination-lookup callbacks.  Panics on invalid @proto;
+ * returns NULL if tle_ctx_create() fails.
+ *
+ * NOTE(review): cprm is not zero-initialized; any tle_ctx_param field not
+ * explicitly assigned below carries stack garbage — confirm every field
+ * of the struct is covered.
+ */
+static struct tle_ctx *
+proto_ctx_create(uint32_t socket_id, uint32_t proto, void *data)
+{
+	struct tle_ctx_param cprm;
+
+	if (proto != TLE_PROTO_TCP && proto != TLE_PROTO_UDP)
+		rte_panic("Invalid proto [%u]\n", proto);
+
+	cprm.socket_id = socket_id;
+	cprm.proto = proto;
+	cprm.max_streams = MAX_STREAMS_PER_CORE;
+	cprm.min_streams = MIN_STREAMS_PER_CORE;
+	cprm.delta_streams = DELTA_STREAMS;
+	cprm.max_stream_rbufs = MAX_RECV_BUFS_PER_STREAM;
+	cprm.max_stream_sbufs = MAX_SEND_BUFS_PER_STREAM;
+	if (proto == TLE_PROTO_TCP) {
+		cprm.lookup4 = ipv4_dst_lookup_tcp;
+		cprm.lookup6 = ipv6_dst_lookup_tcp;
+	} else {
+		cprm.lookup4 = ipv4_dst_lookup_udp;
+		cprm.lookup6 = ipv6_dst_lookup_udp;
+	}
+	cprm.lookup4_data = data;
+	cprm.lookup6_data = data;
+#ifdef LOOK_ASIDE_BACKEND
+	cprm.flags = 0;
+#else
+	cprm.flags = TLE_CTX_FLAG_ST; /* ctx will be used by single thread*/
+#endif
+	cprm.send_bulk_size = 0; /* 32 if 0 */
+	/* randomized SipHash key hardens the stream hash table */
+	cprm.hash_alg = TLE_SIPHASH;
+	cprm.secret_key.u64[0] = rte_rand();
+	cprm.secret_key.u64[1] = rte_rand();
+	cprm.icw = 0; /**< congestion window, default is 2*MSS if 0. */
+	cprm.timewait = 1; /* TLE_TCP_TIMEWAIT_DEFAULT */
+
+	return tle_ctx_create(&cprm);
+}
+
+/* Create the three event queues of @ctx (error, rx, tx) on @socket_id.
+ * Events are armed on demand, hence max_events = 0.  Panics on any
+ * allocation failure; always returns 0.
+ */
+static int
+evq_init(struct glue_ctx *ctx, uint32_t socket_id)
+{
+	struct tle_evq_param eprm = {
+		.socket_id = socket_id,
+		.max_events = 0, /* We don't pre-allocate any event */
+	};
+
+	ctx->ereq = tle_evq_create(&eprm);
+	if (ctx->ereq == NULL)
+		rte_panic("Cannot create ereq");
+
+	ctx->rxeq = tle_evq_create(&eprm);
+	if (ctx->rxeq == NULL)
+		rte_panic("Cannot create rxeq");
+
+	ctx->txeq = tle_evq_create(&eprm);
+	if (ctx->txeq == NULL)
+		rte_panic("Cannot create txeq");
+
+	return 0;
+}
+
+/* Build the TLE side of @ctx: one TCP and one UDP l4p context, a
+ * tle_dev per protocol bound to physical port 0 (offloads clamped to
+ * the port's capabilities), and — for the default context only — a pair
+ * of loopback devices bound to 127.0.0.1/::1 with unclamped offloads.
+ * Finishes by creating the event queues.  Panics on any failure.
+ */
+static void
+tle_ctx_init(struct glue_ctx *ctx, uint32_t socket_id)
+{
+	struct tle_dev_param dprm;
+	struct rte_eth_dev_info dev_info;
+	uint16_t port_id = 0; /* currently only use one port */
+
+	ctx->tcp_ctx = proto_ctx_create(socket_id, TLE_PROTO_TCP, ctx);
+	if (ctx->tcp_ctx == NULL)
+		rte_panic("Cannot create tle_ctx for tcp");
+
+	ctx->udp_ctx = proto_ctx_create(socket_id, TLE_PROTO_UDP, ctx);
+	if (ctx->udp_ctx == NULL)
+		rte_panic("Cannot create tle_ctx for udp");
+
+	memset(&dprm, 0, sizeof(dprm));
+
+	/* offloading check and set: only advertise what the HW supports */
+	rte_eth_dev_info_get(port_id, &dev_info);
+	dprm.rx_offload = dev_info.rx_offload_capa & rx_offload;
+	dprm.tx_offload = dev_info.tx_offload_capa & tx_offload;
+
+	dprm.local_addr4.s_addr = ctx->ipv4;
+	rte_memcpy(&dprm.local_addr6, &ctx->ipv6, sizeof(struct in6_addr));
+	/* no blocked port lists */
+	dprm.bl4.nb_port = 0;
+	dprm.bl4.port = NULL;
+	dprm.bl6.nb_port = 0;
+	dprm.bl6.port = NULL;
+
+	ctx->tcp_dev = tle_add_dev(ctx->tcp_ctx, &dprm);
+	if (ctx->tcp_dev == NULL)
+		rte_panic("add tle_dev for tcp failed: %u", rte_errno);
+
+	ctx->udp_dev = tle_add_dev(ctx->udp_ctx, &dprm);
+	if (ctx->udp_dev == NULL)
+		rte_panic("add tle_dev for udp failed: %u", rte_errno);
+
+	/* loopback devices exist only on the default (first) context */
+	if (ctx == default_ctx) {
+		dprm.rx_offload = rx_offload;
+		dprm.tx_offload = tx_offload;
+		dprm.local_addr4.s_addr = htonl(INADDR_LOOPBACK);
+		rte_memcpy(&dprm.local_addr6, &in6addr_loopback,
+			   sizeof(struct in6_addr));
+
+		ctx->lb_tcp_dev = tle_add_dev(ctx->tcp_ctx, &dprm);
+		if (ctx->lb_tcp_dev == NULL)
+			rte_panic("failed to add loopback tcp dev: %u\n",
+				  rte_errno);
+
+		ctx->lb_udp_dev = tle_add_dev(ctx->udp_ctx, &dprm);
+		if (ctx->lb_udp_dev == NULL)
+			rte_panic("failed to add loopback udp dev: %u\n",
+				  rte_errno);
+	}
+
+	evq_init(ctx, socket_id);
+}
+
+/* The helpers below read the stack's network configuration from
+ * environment variables (DPDK_IP, DPDK_IP_MASK, ...; see ctx.h for the
+ * names and defaults), falling back to the *_DEF value when the variable
+ * is unset.  Address parsers panic on malformed input.
+ */
+
+/* IPv4 address from $DPDK_IP, in network byte order. */
+static uint32_t
+get_ip(void)
+{
+	struct in_addr addr;
+	const char *ip_str = getenv(DPDK_IP);
+
+	if (ip_str == NULL) {
+		ip_str = DPDK_IP_DEF;
+		GLUE_LOG(INFO, "will use the default IP %s", DPDK_IP_DEF);
+	} else
+		GLUE_LOG(INFO, "will use the IP %s", ip_str);
+
+	if (inet_aton(ip_str, &addr) == 0)
+		rte_panic("Invalid addr from env DPDK_IP: %s", ip_str);
+
+	return addr.s_addr;
+}
+
+/* IPv4 prefix length from $DPDK_IP_MASK.
+ * NOTE(review): atoi() silently yields 0 for malformed input — no
+ * validation unlike the address parsers above.
+ */
+static uint8_t
+get_ip_mask(void)
+{
+	const char *mask_str = getenv(DPDK_IP_MASK);
+
+	if (mask_str == NULL) {
+		mask_str = DPDK_IP_MASK_DEF;
+		GLUE_LOG(INFO, "will use the default IP Mask %s", DPDK_IP_MASK_DEF);
+	} else
+		GLUE_LOG(INFO, "will use the IP Mask %s", mask_str);
+
+	return (uint8_t)atoi(mask_str);
+}
+
+/* IPv4 gateway from $DPDK_IP_GATEWAY, in network byte order. */
+static uint32_t
+get_ip_gate(void)
+{
+	struct in_addr addr;
+	const char *ip_str = getenv(DPDK_IP_GATEWAY);
+
+	if (ip_str == NULL) {
+		ip_str = DPDK_IP_GATEWAY_DEF;
+		GLUE_LOG(INFO, "will use the default IP gateway %s",
+			 DPDK_IP_GATEWAY_DEF);
+	} else
+		GLUE_LOG(INFO, "will use the IP gateway %s", ip_str);
+
+	if (inet_aton(ip_str, &addr) == 0)
+		rte_panic("Invalid addr from env DPDK_IP_GATEWAY: %s", ip_str);
+
+	return addr.s_addr;
+}
+
+/* IPv6 address from $DPDK_IPV6; returns a pointer to function-local
+ * static storage (callers copy it immediately, see glue_ctx_init).
+ */
+static struct in6_addr*
+get_ipv6(void)
+{
+	static struct in6_addr addr;
+	const char *ip_str = getenv(DPDK_IPV6);
+
+	if (ip_str == NULL) {
+		ip_str = DPDK_IPV6_DEF;
+		GLUE_LOG(INFO, "will use the default IP(V6) %s", DPDK_IPV6_DEF);
+	} else
+		GLUE_LOG(INFO, "will use the IP(V6) %s", ip_str);
+
+	if (inet_pton(AF_INET6, ip_str, &addr) == 0)
+		rte_panic("Invalid addr from env DPDK_IPV6: %s", ip_str);
+
+	return &addr;
+}
+
+/* IPv6 prefix length from $DPDK_IPV6_MASK (same atoi caveat as above). */
+static uint8_t
+get_ipv6_mask(void)
+{
+	const char *mask_str = getenv(DPDK_IPV6_MASK);
+
+	if (mask_str == NULL) {
+		mask_str = DPDK_IPV6_MASK_DEF;
+		GLUE_LOG(INFO, "will use the default IPV6 Mask %s",
+			 DPDK_IPV6_MASK_DEF);
+	} else
+		GLUE_LOG(INFO, "will use the IPV6 Mask %s", mask_str);
+
+	return (uint8_t)atoi(mask_str);
+}
+
+/* IPv6 gateway from $DPDK_IPV6_GATEWAY; static storage as in get_ipv6(). */
+static struct in6_addr*
+get_ipv6_gate(void)
+{
+	static struct in6_addr addr;
+	const char *ip_str = getenv(DPDK_IPV6_GATEWAY);
+
+	if (ip_str == NULL) {
+		ip_str = DPDK_IPV6_GATEWAY_DEF;
+		GLUE_LOG(INFO, "will use the default IP(V6) gateway %s",
+			 DPDK_IPV6_GATEWAY_DEF);
+	} else
+		GLUE_LOG(INFO, "will use the IP(V6) gateway %s", ip_str);
+
+	if (inet_pton(AF_INET6, ip_str, &addr) == 0)
+		rte_panic("Invalid addr from env DPDK_IPV6_GATEWAY: %s", ip_str);
+
+	return &addr;
+}
+
+/* IPv4 loopback: enabled unless $DPDK_LO4_ENABLED is exactly "0". */
+static bool
+lo4_enabled(void)
+{
+	const char *str = getenv("DPDK_LO4_ENABLED");
+	if (str != NULL && strcmp(str, "0") == 0)
+		return false;
+	return true;
+}
+
+/* IPv6 loopback: disabled unless $DPDK_LO6_ENABLED is exactly "1"
+ * (note the opposite default from lo4_enabled).
+ */
+static bool
+lo6_enabled(void)
+{
+	const char *str = getenv("DPDK_LO6_ENABLED");
+	if (str == NULL || strcmp(str, "1") != 0)
+		return false;
+	return true;
+}
+
+/* Pre-build the cached IPv4 and IPv6 tle_dest templates used for
+ * loopback traffic on @ctx: an Ethernet header with zeroed MACs followed
+ * by a partially-filled IP header, with MTU 65535.
+ */
+static void
+loopback_dst_init(struct glue_ctx *ctx)
+{
+	struct tle_dest *dst;
+	struct ether_hdr *eth;
+	struct ipv4_hdr *ip4h;
+	struct ipv6_hdr *ip6h;
+
+	/* init ipv4 dst */
+	dst = &ctx->lb_dst;
+	dst->mtu = 65535;
+
+	dst->l2_len = sizeof(*eth);
+	dst->head_mp = get_mempool_by_socket(0); /* fix me */
+	eth = (struct ether_hdr *)dst->hdr;
+	/* zero both d_addr and s_addr (laid out back to back) */
+	memset(eth, 0, 2 * sizeof(eth->d_addr));
+	eth->ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4);
+
+	dst->l3_len = sizeof(*ip4h);
+	ip4h = (struct ipv4_hdr *)(eth + 1);
+	ip4h->dst_addr = htonl(INADDR_LOOPBACK);
+	ip4h->version_ihl = 4 << 4 | sizeof(*ip4h) / IPV4_IHL_MULTIPLIER;
+	ip4h->time_to_live = 64;
+	ip4h->next_proto_id = IPPROTO_TCP;
+
+	/* init ipv6 dst */
+	dst = &ctx->lb_dst_v6;
+	dst->mtu = 65535;
+
+	dst->l2_len = sizeof(*eth);
+	dst->head_mp = get_mempool_by_socket(0); /* fix me */
+	eth = (struct ether_hdr *)dst->hdr;
+	memset(eth, 0, 2 * sizeof(eth->d_addr));
+	eth->ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv6);
+
+	dst->l3_len = sizeof(*ip6h);
+	ip6h = (struct ipv6_hdr *)(eth + 1);
+	rte_memcpy(ip6h->dst_addr, &in6addr_loopback, sizeof(struct in6_addr));
+	/* NOTE(review): vtc_flow is a 32-bit big-endian field; `6 << 4`
+	 * (0x60) only places the version nibble correctly on little-endian
+	 * hosts because the low-order byte lands first in memory — confirm
+	 * portability if big-endian targets matter.
+	 */
+	ip6h->vtc_flow = 6 << 4;
+	ip6h->hop_limits = 255;
+	ip6h->proto = IPPROTO_TCP;
+}
+
+/* Create the per-context IPv4 and IPv6 ARP hash tables (keyed by
+ * in_addr / in6_addr, sized 2 * MAX_ARP_ENTRY) on @socket_id.
+ * Panics on failure.
+ */
+static void
+arp_hash_init(struct glue_ctx *ctx, unsigned socket_id)
+{
+	char str[RTE_HASH_NAMESIZE];
+	struct rte_hash_parameters hprm;
+
+	/* init ipv4 arp hash */
+	snprintf(str, sizeof(str), "arp_hash_4@ctx%u", ctx->queue_id);
+	memset(&hprm, 0, sizeof(hprm));
+	hprm.name = str;
+	hprm.entries = MAX_ARP_ENTRY * 2;
+	hprm.socket_id = socket_id;
+	hprm.key_len = sizeof(struct in_addr);
+	ctx->arp_hash = rte_hash_create(&hprm);
+	if (ctx->arp_hash == NULL) {
+		rte_panic("Failed to init hashtable for ARP");
+	}
+
+	/* init ipv6 arp hash */
+	snprintf(str, sizeof(str), "arp_hash_6@ctx%u", ctx->queue_id);
+	memset(&hprm, 0, sizeof(hprm));
+	hprm.name = str;
+	hprm.entries = MAX_ARP_ENTRY * 2;
+	hprm.socket_id = socket_id;
+	hprm.key_len = sizeof(struct in6_addr);
+	ctx->arp6_hash = rte_hash_create(&hprm);
+	if (ctx->arp6_hash == NULL) {
+		rte_panic("Failed to init hashtable for ARP6");
+	}
+}
+
+/* get current timestamp in ms, see tcp_get_tms(): TSC right-shifted by
+ * the precomputed cycles-per-ms shift (approximate, not exact ms).
+ */
+static inline uint64_t
+arp_get_tms(uint32_t mshift)
+{
+	uint64_t ts;
+
+	ts = rte_get_tsc_cycles() >> mshift;
+	return ts;
+}
+
+/* Create the ARP timer wheel for @ctx: 1000-unit ticks, one slot per
+ * possible ARP entry plus headroom.  Panics on failure.
+ */
+static void
+arp_timer_init(struct glue_ctx *ctx, unsigned socket_id)
+{
+	struct tle_timer_wheel_args twprm;
+
+	twprm.tick_size = 1000U;
+	twprm.max_timer = MAX_ARP_ENTRY + 8;
+	twprm.socket_id = socket_id;
+	ctx->arp_tmw = tle_timer_create(&twprm,
+					arp_get_tms(ctx->cycles_ms_shift));
+	if (ctx->arp_tmw == NULL)
+		rte_panic("Failed to init timer wheel for ARP");
+}
+
+/* One-time initialization of @ctx on NUMA node @socket_id: allocate the
+ * ARP tables, read IP configuration from the environment, derive the
+ * TSC->ms shift, set up ARP hash/timer state and the IP-reassembly
+ * table, and point the per-CPU MIB at this context.  Panics on any
+ * allocation failure.
+ */
+static void
+glue_ctx_init(struct glue_ctx *ctx, uint32_t socket_id)
+{
+	uint64_t ms;
+
+	ctx->arp4 = rte_zmalloc_socket(NULL,
+			sizeof(struct arp_entry) * MAX_ARP_ENTRY,
+			RTE_CACHE_LINE_SIZE, socket_id);
+	ctx->arp6 = rte_zmalloc_socket(NULL,
+			sizeof(struct arp_entry) * MAX_ARP_ENTRY,
+			RTE_CACHE_LINE_SIZE, socket_id);
+	if (!ctx->arp4 || !ctx->arp6)
+		rte_panic("Failed to allocate arp table");
+
+	ctx->port_id = 0;
+	/* contexts map 1:1 to rx/tx queues of port 0 */
+	ctx->queue_id = nb_ctx - 1;
+	ctx->ipv4 = get_ip();
+	ctx->ipv4_ml = get_ip_mask();
+	ctx->ipv4_gw.s_addr = get_ip_gate();
+	ctx->lo4_enabled = lo4_enabled();
+	rte_memcpy(&ctx->ipv6, get_ipv6(), sizeof(struct in6_addr));
+	ctx->ipv6_ml = get_ipv6_mask();
+	rte_memcpy(&ctx->ipv6_gw, get_ipv6_gate(), sizeof(struct in6_addr));
+	ctx->lo6_enabled = lo6_enabled();
+
+	/* calculate closest shift to convert from cycles to ms (approximate) */
+	ms = (rte_get_tsc_hz() + MS_PER_S - 1) / MS_PER_S;
+	ctx->cycles_ms_shift = sizeof(ms) * CHAR_BIT - __builtin_clzll(ms) - 1;
+
+	arp_hash_init(ctx, socket_id);
+	arp_timer_init(ctx, socket_id);
+	ctx->arp_wait = NULL;
+
+	ctx->frag_tbl = rte_ip_frag_table_create(FRAG_BUCKET,
+				FRAG_ENTRIES_PER_BUCKET,
+				FRAG_BUCKET * FRAG_ENTRIES_PER_BUCKET,
+				rte_get_tsc_hz(),
+				socket_id);
+	if (ctx->frag_tbl == NULL)
+		rte_panic("Failed to create ip defrag table");
+
+	PERCPU_MIB = &ctx->mib;
+}
+
+/* ctx_seq tracks allocation phases: 0 = nothing allocated yet,
+ * 1 = default ctx created by the constructor, 2 = default ctx handed to
+ * the first epoll_create()/poll() caller.
+ */
+static int ctx_seq;
+static rte_spinlock_t ctx_lock = RTE_SPINLOCK_INITIALIZER;
+
+/* Allocate (or hand out) a glue context and return its index into
+ * ctx_array.  The first two calls share the default context (see
+ * ctx_seq above); later calls initialize a fresh context, reconfigure
+ * the physical port for the new queue count, and — for the default
+ * context — create the loopback port.  Panics when MAX_NB_CTX is
+ * exceeded.
+ */
+uint8_t
+glue_ctx_alloc(void)
+{
+	uint32_t socket_id;
+	struct glue_ctx *ctx;
+
+	/* fix me: we need a finer-grained lock */
+	rte_spinlock_lock(&ctx_lock);
+
+	GLUE_LOG(INFO, "allocate ctx: %d", ctx_seq);
+	if (ctx_seq == 0)
+		/* Called from constructor init() */
+		ctx_seq = 1;
+	else if (ctx_seq == 1) {
+		/* Called from first epoll_create() or poll() */
+		ctx_seq = 2;
+		ctx = default_ctx;
+		goto unlock;
+	}
+
+	if (nb_ctx >= MAX_NB_CTX)
+		rte_panic("Exceed the max number of ctx");
+
+	ctx = &ctx_array[nb_ctx++];
+	GLUE_LOG(INFO, "%u ctx allocated, and will init", nb_ctx);
+
+	socket_id = get_socket_id();
+
+	glue_ctx_init(ctx, socket_id);
+
+	/* reconfigure the "physical" port whenever # of ctx changes */
+	port_reconfig();
+
+	if (ctx == default_ctx) {
+		loopback_dst_init(ctx);
+
+		ctx->lb_port_id = create_loopback(socket_id);
+		GLUE_LOG(INFO, "loopback port_id: %u", ctx->lb_port_id);
+	}
+
+	rte_eth_macaddr_get(ctx->port_id, &ctx->mac);
+
+	tle_ctx_init(ctx, socket_id);
+
+unlock:
+	rte_spinlock_unlock(&ctx_lock);
+	/* index of ctx within ctx_array doubles as the context id */
+	return ctx - ctx_array;
+}
+
+/* "Free" a context.  Only the degenerate case — releasing the default
+ * context back to its constructor-allocated state — is supported;
+ * tearing down a running context panics.
+ */
+void
+glue_ctx_free(struct glue_ctx *ctx __rte_unused)
+{
+	if (nb_ctx == 1 && ctx_seq == 2) {
+		GLUE_LOG(INFO, "free ctx");
+		ctx_seq = 1;
+		return;
+	}
+
+	rte_panic("close epoll fd on running is not supported\n");
+}
+
+/* Find the context serving (@port_id, @queue_id), or NULL.
+ * NOTE(review): port id 1 is assumed to be the loopback port created in
+ * glue_ctx_alloc() — confirm create_loopback() always yields port 1.
+ */
+struct glue_ctx *
+glue_ctx_lookup(uint16_t port_id, uint16_t queue_id)
+{
+	int i;
+
+	if (port_id == 1) /* loopback */
+		return default_ctx;
+
+	for (i = 0; i < nb_ctx; i++) {
+		if (ctx_array[i].port_id == port_id &&
+		    ctx_array[i].queue_id == queue_id)
+			return &ctx_array[i];
+	}
+
+	return NULL;
+}
diff --git a/lib/libtle_glue/ctx.h b/lib/libtle_glue/ctx.h
new file mode 100644
index 0000000..e78b68f
--- /dev/null
+++ b/lib/libtle_glue/ctx.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* NOTE(review): guard name says SOCK but the file is ctx.h — consider
+ * renaming to _TLE_GLUE_CTX_H_ if no other header claims it.
+ */
+#ifndef _TLE_GLUE_SOCK_H_
+#define _TLE_GLUE_SOCK_H_
+
+#include <stdbool.h>
+#include <pthread.h>
+
+#include <rte_memzone.h>
+#include <rte_mempool.h>
+#include <rte_ether.h>
+#include <rte_ip_frag.h>
+
+#include <tle_ctx.h>
+#include <tle_event.h>
+#include <tle_stats.h>
+
+#include <sys/queue.h>
+
+#include "config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Environment variable names (and their defaults) consumed by the
+ * get_ip*() helpers in ctx.c.
+ */
+#define DPDK_IP		"DPDK_IP"
+#define DPDK_IP_DEF	"0.0.0.0"
+#define DPDK_IP_MASK	"DPDK_IP_MASK"
+#define DPDK_IP_MASK_DEF "16"
+#define DPDK_IP_GATEWAY "DPDK_IP_GATEWAY"
+#define DPDK_IP_GATEWAY_DEF "0.0.0.0"
+#define DPDK_IPV6	"DPDK_IPV6"
+#define DPDK_IPV6_DEF	"::"
+#define DPDK_IPV6_MASK	"DPDK_IPV6_MASK"
+#define DPDK_IPV6_MASK_DEF "64"
+#define DPDK_IPV6_GATEWAY "DPDK_IPV6_GATEWAY"
+#define DPDK_IPV6_GATEWAY_DEF "::"
+
+/* One cached destination (resolved L2/L3 header template) plus the
+ * retry/expiry state driven by the ARP timer wheel.
+ */
+struct arp_entry {
+	struct tle_dest dst;
+	uint8_t inuse;
+	uint8_t req_time;	/* number of ARP requests sent so far */
+	void* timer;
+};
+
+/* Per-queue stack instance: TLE contexts/devices, event queues, ARP and
+ * fragmentation state, and addressing configuration.  The anonymous
+ * structs only group related fields; they add no semantics.
+ */
+struct glue_ctx {
+	struct tle_ctx *tcp_ctx;
+	struct tle_dev *tcp_dev;
+	struct tle_dev *lb_tcp_dev;	/* loopback; default ctx only */
+	struct tle_ctx *udp_ctx;
+	struct tle_dev *udp_dev;
+	struct tle_dev *lb_udp_dev;	/* loopback; default ctx only */
+
+	struct tle_evq *ereq;	/* error events */
+	struct tle_evq *rxeq;	/* readable events */
+	struct tle_evq *txeq;	/* writable events */
+
+	uint16_t port_id;
+	uint16_t queue_id;
+	uint16_t lb_port_id;
+
+	struct {
+		uint8_t ipv4_ml;	/* IPv4 prefix length */
+		uint8_t ipv6_ml;	/* IPv6 prefix length */
+	};
+
+	struct ether_addr mac;
+	/* mbufs parked while waiting for ARP resolution, chained through
+	 * the mbuf's next_pkt field (see be_tx) — TODO confirm field name
+	 * against the mbuf patch.
+	 */
+	struct rte_mbuf *arp_wait;
+	struct tle_timer_wheel *arp_tmw;
+	uint32_t cycles_ms_shift; /* to convert from cycles to ms */
+
+	struct {
+		uint32_t ipv4;		/* local addr, network byte order */
+		struct in_addr ipv4_gw;
+		bool lo4_enabled;
+
+		uint32_t arp4_num;
+		struct arp_entry *arp4;
+		struct rte_hash *arp_hash;
+	};
+
+	struct {
+		struct in6_addr ipv6;
+		struct in6_addr ipv6_gw;
+		bool lo6_enabled;
+
+		uint32_t arp6_num;
+		struct arp_entry *arp6;
+		struct rte_hash *arp6_hash;
+	};
+
+	struct {
+		rte_spinlock_t frag_lock;
+		struct rte_ip_frag_tbl *frag_tbl;
+		struct rte_ip_frag_death_row frag_dr;
+	};
+
+	/* cached loopback destinations, built by loopback_dst_init() */
+	struct tle_dest lb_dst;
+	struct tle_dest lb_dst_v6;
+
+	struct tle_mib mib;
+} __rte_cache_aligned;
+
+extern int nb_ctx;
+extern struct glue_ctx *default_ctx;
+extern struct glue_ctx ctx_array[MAX_NB_CTX];
+
+RTE_DECLARE_PER_LCORE(struct glue_ctx *, glue_ctx);
+
+/* Context bound to the calling lcore, or the default context if the
+ * thread has not been bound yet (see PRE(epoll_wait)).
+ */
+static inline struct glue_ctx *
+get_ctx(void)
+{
+	if (RTE_PER_LCORE(glue_ctx))
+		return RTE_PER_LCORE(glue_ctx);
+	return default_ctx;
+}
+
+/* Index of the current context within ctx_array. */
+static inline uint8_t
+get_cid(void)
+{
+	return get_ctx() - ctx_array;
+}
+
+uint8_t glue_ctx_alloc(void);
+
+struct glue_ctx * glue_ctx_lookup(uint16_t port_id, uint16_t queue_id);
+
+void glue_ctx_free(struct glue_ctx *ctx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _TLE_GLUE_SOCK_H_ */
diff --git a/lib/libtle_glue/epoll.c b/lib/libtle_glue/epoll.c
new file mode 100644
index 0000000..1c8751b
--- /dev/null
+++ b/lib/libtle_glue/epoll.c
@@ -0,0 +1,577 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <errno.h>
+
+#include <rte_common.h>
+#include <rte_spinlock.h>
+#include <rte_malloc.h>
+#include <rte_ethdev.h>
+#include <rte_atomic.h>
+#include <rte_eal_interrupts.h>
+
+#include "fd.h"
+#include "ctx.h"
+#include "sym.h"
+#include "log.h"
+#include "util.h"
+#include "sock.h"
+#include "internal.h"
+#include "tle_glue.h"
+#include "../libtle_l4p/udp_stream.h"
+#include "../libtle_l4p/tcp_stream.h"
+
+/* Sentinel stored in epoll_event.data.u64 to recognize the rxq
+ * interrupt eventfd among returned events (see epoll_kernel_wait).
+ */
+#define EPOLL_DATA_SPECIAL     0xFFFFFFFFFFFFFF01
+
+/* We don't use rte_eth_dev_rx_intr_ctl_q as it has its
+ * own way to specify event.data
+ */
+/* Register (@op = RTE_INTR_EVENT_ADD) or unregister the interrupt
+ * eventfd of rx queue (@port_id, @queue_id) with kernel epoll fd @efd,
+ * tagging it with EPOLL_DATA_SPECIAL.  If @rx is set, first drain the
+ * eventfd counter so a stale notification is not reported again.
+ * Returns 0 or a negative errno-style code.
+ */
+static int
+dev_rx_intr_ctl_q(uint16_t port_id, uint16_t queue_id, int efd, int op, int rx)
+{
+	int fd, ret;
+	uint32_t vec, efd_idx;
+	struct rte_eth_dev *dev;
+	struct rte_intr_handle *intr_handle;
+	static struct epoll_event ev = {
+		.events = EPOLLIN | EPOLLPRI | EPOLLET,
+		.data = {
+			.u64 = EPOLL_DATA_SPECIAL,
+		},
+	};
+	char buf[32];
+
+	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV);
+
+	dev = &rte_eth_devices[port_id];
+	if (queue_id >= dev->data->nb_rx_queues)
+		return -EINVAL;
+
+	if (!dev->intr_handle)
+		return -ENOTSUP;
+
+	intr_handle = dev->intr_handle;
+	if (!intr_handle->intr_vec)
+		return -EPERM;
+
+	/* map the queue's interrupt vector to its eventfd slot */
+	vec = intr_handle->intr_vec[queue_id];
+
+	efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ?
+		(vec - RTE_INTR_VEC_RXTX_OFFSET) : vec;
+
+	fd = intr_handle->efds[efd_idx];
+
+	if (rx) {
+		/* almost all devices use eventfd, we shall read out */
+		ret = read(fd, buf, sizeof(uint64_t));
+		RTE_SET_USED(ret);
+	}
+
+	return k_epoll_ctl(efd, op, fd, &ev);
+}
+
+/* Interposed epoll_create().  Before the fd table exists, fall through
+ * to the kernel.  Otherwise allocate a userspace fd backed by a sock
+ * object, bind it to a glue context, and create a shadow kernel epoll
+ * fd that watches both real kernel fds added later and this context's
+ * rxq interrupt eventfd.  @size is ignored beyond the kernel fallback,
+ * as in modern epoll.
+ */
+int
+PRE(epoll_create)(int size)
+{
+	int epfd;
+	struct sock *so;
+
+	if (!fd_table_initialized)
+		return k_epoll_create(size);
+
+	epfd = get_unused_fd();
+	if (epfd == -1) {
+		errno = EMFILE;
+		return -1;
+	}
+
+
+	so = fd2sock(epfd);
+	so->cid = glue_ctx_alloc();
+
+	so->shadow_efd = k_epoll_create(1);
+	if (so->shadow_efd < 0)
+		rte_panic("Failed to create shadow efd");
+
+	if (dev_rx_intr_ctl_q(CTX(so)->port_id, CTX(so)->queue_id,
+			      so->shadow_efd, RTE_INTR_EVENT_ADD, 0) < 0)
+		rte_panic("Failed to epoll_ctl rxq interrupt fd");
+
+	so->epoll = 1;
+
+	return epfd;
+}
+
+/* epoll_create1(): flags (e.g. EPOLL_CLOEXEC) are ignored. */
+int
+PRE(epoll_create1)(int flags __rte_unused)
+{
+	return PRE(epoll_create)(1);
+}
+
+/* Interposed epoll_ctl().  Kernel fds are routed to the kernel epoll
+ * (directly, or via the epoll sock's shadow efd when the epoll fd is a
+ * userspace one).  For userspace fds the registered interest set is
+ * simply stored on the sock (so->event); with the look-aside backend the
+ * corresponding TLE rx/tx events are also (de)activated.  A socket must
+ * belong to the same glue context as its epoll fd.
+ * Returns 0, or -1 with errno set (EEXIST/ENOENT/EINVAL) following
+ * epoll_ctl(2) semantics.
+ */
+int
+PRE(epoll_ctl)(int epfd, int op, int fd, struct epoll_event *event)
+{
+	struct sock *so_ep;
+	struct sock *so;
+
+	if (is_kernel_fd(epfd)) {
+		if (!is_kernel_fd(fd))
+			rte_panic("kernel epoll (%d) on an userspace fd: %d",
+				  epfd, fd);
+
+		return k_epoll_ctl(epfd, op, fd, event);
+	}
+
+	so_ep = fd2sock(epfd);
+
+	if (is_kernel_fd(fd)) {
+		/* Use a shadow epoll fd for possible kernel I/O events. */
+		return k_epoll_ctl(so_ep->shadow_efd, op, fd, event);
+	}
+
+	so = fd2sock(fd);
+
+	if (unlikely(so->cid != so_ep->cid))
+		rte_panic("Different ctx %d and %d for epoll fd and socket fd",
+			  so_ep->cid, so->cid);
+
+	GLUE_DEBUG("epoll_ctl: op = %x, fd = %d, event = %x",
+		   op, fd, event->events);
+	switch (op) {
+	case EPOLL_CTL_ADD:
+		/* a non-zero interest set means the fd is already added */
+		if (so->event.events) {
+			errno = EEXIST;
+			return -1;
+		}
+
+#ifdef LOOK_ASIDE_BACKEND
+		if (event->events & EPOLLIN)
+			tle_event_active(&so->rxev, TLE_SEV_DOWN);
+		if (event->events & EPOLLOUT)
+			tle_event_active(&so->txev, TLE_SEV_DOWN);
+#endif
+		so->event = *event;
+
+		break;
+	case EPOLL_CTL_MOD:
+		if (so->event.events == 0) {
+			errno = ENOENT;
+			return -1;
+		}
+
+#ifdef LOOK_ASIDE_BACKEND
+		if (event->events & EPOLLIN)
+			tle_event_active(&so->rxev, TLE_SEV_DOWN);
+		else
+			tle_event_idle(&so->rxev);
+		if (event->events & EPOLLOUT)
+			tle_event_active(&so->txev, TLE_SEV_DOWN);
+		else
+			tle_event_idle(&so->txev);
+#endif
+		so->event = *event;
+		break;
+	case EPOLL_CTL_DEL:
+		if (so->event.events == 0) {
+			errno = ENOENT;
+			return -1;
+		}
+
+#ifdef LOOK_ASIDE_BACKEND
+		if (so->event.events & EPOLLIN)
+			tle_event_idle(&so->rxev);
+		if (so->event.events & EPOLLOUT)
+			tle_event_idle(&so->txev);
+#endif
+		so->event.events = 0;
+		break;
+	default:
+		errno = EINVAL;
+		return -1;
+	}
+
+	return 0;
+}
+
+/* Pull up to @num armed events matching @event (an EPOLL* mask) from
+ * @evq, writing each event's user data (a struct sock *) into @evd.
+ * Matching events are removed from the armed list; their state handling
+ * differs for the error queue (see inline comment).  Returns the number
+ * of entries written.
+ */
+static inline int32_t
+tle_evq_fetch(struct tle_evq *evq, const void *evd[],
+	      uint32_t num, uint32_t event)
+{
+	uint32_t i, k;
+	uint32_t polled;
+	struct tle_event *ev;
+	struct tle_event *next;
+
+	/* cheap unlocked emptiness check before taking the lock */
+	if (evq->nb_armed == 0)
+		return 0;
+
+	rte_compiler_barrier();
+
+	rte_spinlock_lock(&evq->lock);
+	ev = TAILQ_FIRST(&evq->armed);
+	for (i = 0, k = 0; i != evq->nb_armed; i++) {
+		next = TAILQ_NEXT(ev, ql);
+		polled = ((const struct sock *)ev->data)->event.events;
+		/* Always report EPOLLHUP, see man epoll_ctl(2) */
+		if (polled && ((polled | EPOLLHUP) & event)) {
+			evd[k++] = ev->data;
+			TAILQ_REMOVE(&evq->armed, ev, ql);
+			/* don't down erev; and assign NULL to data means this
+			 * ev is already removed from the queue, refer to
+			 * tle_event_idle_err().
+			 */
+			if (event != EPOLLHUP)
+				ev->state = TLE_SEV_DOWN;
+			else
+				ev->data = NULL;
+		}
+		if (k == num)
+			break;
+		ev = next;
+	}
+	evq->nb_armed -= k;
+	rte_spinlock_unlock(&evq->lock);
+	return k;
+}
+
+/* Drain up to @maxevents events of type @event from queue @q into the
+ * user's @events array, coalescing multiple ready conditions of one
+ * socket into a single epoll_event (EPOLLOUT may pick up a pending
+ * EPOLLIN, and both may pick up a pending EPOLLHUP, matching the order
+ * in which poll_common drains the three queues).  Returns the number of
+ * events produced.
+ */
+static int
+evq_drain(struct tle_evq *q, uint32_t event,
+	  struct epoll_event *events, int maxevents)
+{
+	uint32_t i, n;
+	struct sock *socks[maxevents];
+
+	n = tle_evq_fetch(q, (const void **)(uintptr_t)socks, maxevents, event);
+	for (i = 0; i < n; ++i) {
+		events[i].events = event;
+		events[i].data = socks[i]->event.data;
+
+		/* when EPOLLHUP happens, also return EPOLLIN and EPOLLOUT
+		 * if they are registered. So as to emulate behaviour of linux
+		 * kernel.
+		 * Some applications (e.g. redis) need these events to determine
+		 * following works.
+		 */
+		if (event & EPOLLHUP)
+			events[i].events |= (socks[i]->event.events &
+					     (EPOLLIN | EPOLLOUT));
+
+		/* if multiple events of single socket are triggered,
+		 * return single event with multiple event types rather than
+		 * multiple events.
+		 *
+		 * we drain evq in order of EPOLLOUT -> EPOLLIN -> EPOLLHUP,
+		 * so only need to check event in evq that has not been drained.
+		 */
+		switch (event) {
+		case EPOLLOUT:
+			if ((socks[i]->event.events & EPOLLIN) &&
+			    tle_event_state(&socks[i]->rxev) == TLE_SEV_UP) {
+				tle_event_down(&socks[i]->rxev);
+				events[i].events |= EPOLLIN;
+			}
+			/* fallthrough */
+		case EPOLLIN:
+			/* manually dequeue the armed error event so the
+			 * EPOLLHUP pass won't report it again
+			 */
+			if (tle_event_state(&socks[i]->erev) == TLE_SEV_UP) {
+				rte_spinlock_lock(&socks[i]->erev.head->lock);
+				if (socks[i]->erev.data != NULL &&
+				    tle_event_state(&socks[i]->erev) == TLE_SEV_UP) {
+					TAILQ_REMOVE(&socks[i]->erev.head->armed,
+						     &socks[i]->erev, ql);
+					socks[i]->erev.head->nb_armed--;
+					socks[i]->erev.data = NULL;
+				}
+				rte_spinlock_unlock(&socks[i]->erev.head->lock);
+				events[i].events |= EPOLLHUP;
+			}
+		}
+
+		GLUE_DEBUG("event for fd = %d, event = %x",
+			   socks[i]->event.data.fd, event);
+	}
+	return n;
+}
+
+#ifdef LOOK_ASIDE_BACKEND
+rte_atomic32_t flag_sleep;
+
+/* Look-aside variant: enable the rxq interrupt, run the backend once,
+ * and sleep on @efd only if no packets were processed.  Kernel events
+ * are never reported in this mode (always returns 0); @events,
+ * @maxevents and @rx are unused.
+ */
+int
+epoll_kernel_wait(struct glue_ctx *ctx, int efd,
+		  struct epoll_event *events,
+		  int maxevents, int timeout, int *rx)
+{
+	struct epoll_event event;
+	uint16_t port_id = ctx->port_id;
+	uint16_t queue_id = ctx->queue_id;
+
+	RTE_SET_USED(events);
+	RTE_SET_USED(maxevents);
+	RTE_SET_USED(rx);
+
+	rte_eth_dev_rx_intr_enable(port_id, queue_id);
+
+	/* TODO: timeout shall be limited by the latest tcp timer */
+
+	if (be_process(ctx) > 0) /* use this way to avoid concurrency */ {
+		/* Do nothing */
+	} else
+		sleep_with_lock(efd, &event, 1, timeout);
+
+	rte_eth_dev_rx_intr_disable(port_id, queue_id);
+	/* We don't have kernel events for report, so just return zero */
+	return 0;
+}
+#else
+/* Default variant: arm the rxq interrupt on @efd (creating a temporary
+ * kernel epoll fd when @efd == -1), sleep until kernel I/O or the rxq
+ * interrupt fires, then filter the rxq sentinel event out of the
+ * results.  *@rx is set when the wakeup was (or may have been) caused
+ * by packet arrival.  Returns the number of genuine kernel events.
+ */
+int
+epoll_kernel_wait(struct glue_ctx *ctx, int efd,
+		  struct epoll_event *events,
+		  int maxevents, int timeout, int *rx)
+{
+	int i, j, rc;
+	int flag_tmp = 0;
+	uint16_t port_id = ctx->port_id;
+	uint16_t queue_id = ctx->queue_id;
+#define LEAST_EVENTS 8
+	struct epoll_event s_events[LEAST_EVENTS];
+	struct epoll_event *r_events;
+	int r_maxevents;
+	int fastpath = 0;
+
+	*rx = 0;
+
+	if (efd == -1) {
+		flag_tmp = 1;
+		efd = k_epoll_create(1);
+		if (efd < 0)
+			rte_panic("Failed to create tmp efd");
+	}
+
+	if (stopped) {
+		rc = k_epoll_pwait(efd, events, maxevents, timeout, NULL);
+		goto check;
+	}
+
+	/* reserve one extra slot for the rxq sentinel event when the
+	 * caller's buffer is small
+	 */
+	if (maxevents < LEAST_EVENTS) {
+		r_events = s_events;
+		r_maxevents = maxevents + 1;
+	} else {
+		r_events = events;
+		r_maxevents = maxevents;
+	}
+
+	if (flag_tmp &&
+	    dev_rx_intr_ctl_q(port_id, queue_id, efd, RTE_INTR_EVENT_ADD, 0) < 0)
+		/* TODO: fall back to busy polling */
+		rte_panic("Failed to enable rxq interrupt");
+
+	rte_eth_dev_rx_intr_enable(port_id, queue_id);
+
+	/* TODO: timeout shall be limited by the latest tcp timer */
+
+	if (timeout != 0 && be_process(ctx) > 0) {
+		/* use this way to avoid concurrency */
+		rc = 0;
+		fastpath = 1;
+	} else
+		rc = sleep_with_lock(efd, r_events, r_maxevents, timeout);
+
+	rte_eth_dev_rx_intr_disable(port_id, queue_id);
+
+	/* filter out rxq event */
+	for (i = 0, j = 0; i < rc; ++i) {
+		if (r_events[i].data.u64 == EPOLL_DATA_SPECIAL) {
+			*rx = true;
+			if (i + 1 < rc) {
+				memcpy(&r_events[j], &r_events[i+1],
+				       (rc-i-1) * sizeof(*events));
+			}
+			rc -= 1;
+			break;
+		} else {
+			if (i != j)
+				r_events[j] = r_events[i];
+			j++;
+		}
+	}
+
+	if (rc > 0 && maxevents < LEAST_EVENTS)
+		memcpy(events, r_events, rc * sizeof(*events));
+
+	if (flag_tmp)
+		dev_rx_intr_ctl_q(port_id, queue_id, efd,
+				  RTE_INTR_EVENT_DEL, *rx);
+
+	if (fastpath)
+		*rx = true;
+check:
+	if (flag_tmp)
+		close(efd);
+
+	return rc;
+}
+#endif
+
+/* If only there are some packets to process, we don't sleep; we will poll
+ * for some number of iterations to check packets.
+ *
+ * TODO: change to wait for a period of time?
+ */
+#define IDLE_ITERATIONS 5
+
+/* Core wait loop shared by epoll_wait and poll: drain the tx, rx and
+ * error event queues (in that order — serve existing connections first,
+ * then new ones, then failing ones) into @events; when nothing is ready,
+ * busy-poll the backend for IDLE_ITERATIONS before falling back to an
+ * interrupt-driven sleep via epoll_kernel_wait().  Returns the number
+ * of events written (kernel events included), 0 on timeout.
+ */
+int
+poll_common(struct glue_ctx *ctx, struct epoll_event *events,
+	    int maxevents, int timeout, int shadow_efd)
+{
+	int rx;
+	int total = 0;
+	int idle = IDLE_ITERATIONS;
+
+again:
+	/* We will start with send, then recv, and last err queue, as we want
+	 * to serve exiting connections firstly, then new connections, and
+	 * lastly, the wrong connections.
+	 */
+
+	/* 0. send evq */
+	total += evq_drain(ctx->txeq, EPOLLOUT,
+			   events + total, maxevents-total);
+	if (total == maxevents)
+		return total;
+
+	/* 1. recv evq */
+	total += evq_drain(ctx->rxeq, EPOLLIN,
+			   events + total, maxevents-total);
+	if (total == maxevents)
+		return total;
+
+	/* 2. err evq */
+	total += evq_drain(ctx->ereq, EPOLLHUP,
+			   events + total, maxevents-total);
+
+	if (total > 0)
+		return total;
+
+	/* nothing ready: poll the backend a few times before sleeping */
+	if (idle > 0) {
+		if (be_process(ctx) == 0)
+			idle--;
+		else
+			idle = IDLE_ITERATIONS;
+		goto again;
+	}
+
+	if (timeout == 0)
+		return 0;
+
+	/* Setup rxq interrupt mode, and check kernel I/O events */
+	total = epoll_kernel_wait(ctx, shadow_efd, events,
+				  maxevents, timeout, &rx);
+
+	/* Kernel I/O events are available (total > 0) or
+	 * timeout (total < 0) or something bad happens.
+	 */
+	if (total != 0)
+		return total;
+
+	/* Check userspace I/O events */
+	idle = IDLE_ITERATIONS;
+	be_process(ctx);
+	goto again;
+}
+
+/* Interposed epoll_wait().  Kernel epoll fds go straight to the kernel;
+ * userspace ones bind the calling thread to the epoll fd's glue context
+ * (first call wins) and enter poll_common().
+ */
+int
+PRE(epoll_wait)(int epfd, struct epoll_event *events,
+		int maxevents, int timeout)
+{
+	struct sock *so;
+
+	if (is_kernel_fd(epfd))
+		return k_epoll_pwait(epfd, events, maxevents, timeout, NULL);
+
+	so = fd2sock(epfd);
+
+	/* thread <> context binding happens here */
+	if (RTE_PER_LCORE(glue_ctx) == NULL)
+		RTE_PER_LCORE(glue_ctx) = CTX(so);
+
+	return poll_common(CTX(so), events, maxevents, timeout, so->shadow_efd);
+}
+
+/* Interposed epoll_pwait(); signal masks are not supported.
+ * NOTE(review): delegates to the `epoll_wait` symbol rather than
+ * PRE(epoll_wait) directly — confirm the interposition layer resolves
+ * that name back to the wrapper above.
+ */
+int
+PRE(epoll_pwait)(int epfd, struct epoll_event *events,
+		 int maxevents, int timeout, const sigset_t *sigmask)
+{
+	if (sigmask != NULL) {
+		rte_panic("epoll_pwait with signal is not supported");
+	}
+
+	return epoll_wait(epfd, events, maxevents, timeout);
+}
+
+/* Level-check readiness of userspace fd @fd against the interest mask
+ * @events, by inspecting the underlying TCP/UDP stream's rings and
+ * state directly.  Returns the subset of @events that is currently
+ * ready, possibly with EPOLLHUP/EPOLLERR added.
+ */
+int
+fd_ready(int fd, int events)
+{
+	int ret = 0;
+	struct sock *so = fd2sock(fd);
+
+	if (unlikely(!so->s)) {
+		if (tle_event_state(&so->erev) == TLE_SEV_UP)
+			/* socket has been shutdown */
+			return events | EPOLLHUP;
+		else /* socket is not set up yet */
+			return 0;
+	}
+
+	if (unlikely(IS_TCP(so) &&
+		     TCP_STREAM(so->s)->tcb.state == TCP_ST_CLOSED)) {
+		return events | EPOLLHUP | EPOLLERR;
+	}
+
+	if (tle_event_state(&so->erev) == TLE_SEV_UP)
+		ret |= EPOLLHUP;
+
+	/* readable: leftover partial read, or data queued in the rx ring */
+	if (events & EPOLLIN) {
+		if (so->rx_left ||
+		    (IS_TCP(so) && rte_ring_count(TCP_STREAM(so->s)->rx.q) > 0) ||
+		    (IS_UDP(so) && rte_ring_count(UDP_STREAM(so->s)->rx.q) > 0))
+			ret |= EPOLLIN;
+	}
+
+	/* writable: established TCP with tx ring space, or UDP with
+	 * available drbs
+	 */
+	if (events & EPOLLOUT) {
+		if ((IS_TCP(so) &&
+		     TCP_STREAM(so->s)->tcb.state >= TCP_ST_ESTABLISHED &&
+		     rte_ring_free_count(TCP_STREAM(so->s)->tx.q) > 0) ||
+		    (IS_UDP(so) &&
+		     rte_ring_count(UDP_STREAM(so->s)->tx.drb.r) > 0))
+			ret |= EPOLLOUT;
+	}
+
+	return ret;
+}
+
+/* Aggregate SNMP-style MIB counters into @mibs: start from the global
+ * default_mib, then add every context's TCP counters (indices
+ * [0, TCP_MIB_MAX)) and UDP counters (indices starting at TCP_MIB_MAX).
+ * @mibs must hold at least TCP_MIB_MAX + UDP_MIB_MAX entries.
+ */
+void
+v_get_stats_snmp(unsigned long mibs[])
+{
+	int i, j, k;
+
+	memcpy(mibs, &default_mib, sizeof(default_mib));
+
+	for (i = 0; i < nb_ctx; ++i) {
+		for (j = 0; j < TCP_MIB_MAX; ++j)
+			mibs[j] += ctx_array[i].mib.tcp.mibs[j];
+
+		/* j == TCP_MIB_MAX here: UDP counters follow the TCP ones */
+		for (k = 0; k < UDP_MIB_MAX; ++k)
+			mibs[j+k] += ctx_array[i].mib.udp.mibs[k];
+	}
+}
diff --git a/lib/libtle_glue/fd.c b/lib/libtle_glue/fd.c
new file mode 100644
index 0000000..cc855f9
--- /dev/null
+++ b/lib/libtle_glue/fd.c
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <string.h>
+
+#include "fd.h"
+#include "log.h"
+#include "util.h"
+#include "config.h"
+
+bool fd_table_initialized;
+
+struct fd_table fd_table = { .fd_base = INT_MAX, };
+
/*
 * Return the RLIMIT_NOFILE soft limit as an int.
 *
 * Falls back to GLUE_BASE_FD when the limit cannot be queried.  rlim_cur
 * is a (possibly 64-bit, unsigned) rlim_t, so RLIM_INFINITY or any value
 * above INT_MAX would truncate to a negative/garbage int and corrupt the
 * fd range arithmetic in fd_num_set(); clamp such values to INT_MAX.
 */
static int
get_ulimit_nofile(void)
{
	struct rlimit rlim;

#define GLUE_BASE_FD 1024
	if (getrlimit(RLIMIT_NOFILE, &rlim) < 0)
		return GLUE_BASE_FD;

	/* soft limit; rlim_max is the hard limit */
	if (rlim.rlim_cur == RLIM_INFINITY || rlim.rlim_cur > INT_MAX)
		return INT_MAX;

	return rlim.rlim_cur;
}
+
/*
 * Decide the fd range reserved for user-space sockets.
 *
 * Takes (at most) the top half of the RLIMIT_NOFILE soft limit, capped by
 * the per-core stream budget, so kernel fds keep the lower range
 * [0, fd_base) and glue fds occupy [fd_base, fd_base + fd_num).
 */
static void
fd_num_set(int *fd_base, int *fd_num)
{
	int limit = get_ulimit_nofile();

	/* fix me: alignment of power of two */
	/* fix me: use dup2 to occupy these fds */
	*fd_num = limit / 2;
	*fd_num = RTE_MIN(MAX_STREAMS_PER_CORE * 2 * MAX_NB_CTX, *fd_num);

	*fd_base = limit - *fd_num;
	GLUE_LOG(INFO, "fd_base = %d, fd_num = %d", *fd_base, *fd_num);
}
+
/*
 * Mempool populate callback: assign each sock object its fixed fd
 * (fd_base + object index) and record it in the fd lookup table.
 */
static void
add_fd(struct rte_mempool *mp __rte_unused, void *opaque __rte_unused,
	void *obj, unsigned obj_idx)
{
	((struct sock *)obj)->fd = obj_idx + fd_table.fd_base;
	fd_table.socks[obj_idx] = obj;
}
+
/*
 * One-time setup of the fd <-> sock mapping.
 *
 * Reserves the upper part of the RLIMIT_NOFILE range for glue fds,
 * allocates the lookup table, and backs it with a dynamic mempool of
 * sock objects whose populate callback (add_fd) assigns the fds.
 * Panics on any failure: the stack cannot run without an fd table.
 */
void
fd_init(void)
{
	int ret;
	size_t sz;
	uint32_t socket_id;
	int fd_base, fd_num;
	struct rte_mempool *mp = NULL;
	char name[RTE_MEMPOOL_NAMESIZE];

	socket_id = get_socket_id();

	fd_num_set(&fd_base, &fd_num);

	sz = sizeof(fd_table.socks[0]) * fd_num;
	fd_table.socks = rte_zmalloc_socket("fdtable", sz,
			RTE_CACHE_LINE_SIZE, socket_id);
	if (fd_table.socks == NULL) {
		GLUE_LOG(ERR, "Failed to malloc fd table");
		goto err;
	}

	/* NOTE(review): fd_num - 1 elements -- presumably to keep the ring
	 * size a power of two; confirm against the mempool sizing rules. */
	snprintf(name, RTE_MEMPOOL_NAMESIZE, "mp_fd_%d_%d", fd_base, fd_num);
	mp = rte_mempool_create_empty(name, fd_num - 1, sizeof(struct sock),
				      32, 0, socket_id, MEMPOOL_F_DYNAMIC);
	if (mp == NULL) {
		GLUE_LOG(ERR, "Failed to create mp for fd table");
		goto err;
	}

	GLUE_LOG(INFO, "sizeof(struct sock): %lu, elt_size of fd table = %u",
		 sizeof(struct sock), mp->elt_size);

	ret = rte_mempool_set_ops_byname(mp, "ring_mp_mc", NULL);
	if (ret != 0) {
		GLUE_LOG(ERR, "Failed to set mp ops: %d", ret);
		goto err;
	}

	/* grow in chunks of 1024 objects; add_fd runs per new object */
	rte_mempool_set_dynamic_size(mp, 1024);
	rte_mempool_set_dynamic_cb(mp, add_fd);

	fd_table.mp = mp;
	fd_table.fd_base = fd_base;
	fd_table.fd_num = fd_num;

	/* should populate after fd_table is set */
	ret = rte_mempool_populate_default(mp);
	if (ret < 0) {
		GLUE_LOG(ERR, "Failed to populate mp: %d", ret);
		goto err;
	}

	fd_table_initialized = true;

	return;
err:
	/* mp may be NULL here; rte_mempool_free(NULL) is a no-op.  No need
	 * to free fd_table.socks: rte_panic() aborts the process. */
	rte_mempool_free(mp);
	rte_panic("Failed to init fd_table");
}
diff --git a/lib/libtle_glue/fd.h b/lib/libtle_glue/fd.h
new file mode 100644
index 0000000..d0ac4fe
--- /dev/null
+++ b/lib/libtle_glue/fd.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _TLE_GLUE_FD_H_
+#define _TLE_GLUE_FD_H_
+
+#include <stdbool.h>
+#include <sys/epoll.h>
+#include <fcntl.h>
+
+#include <rte_mempool.h>
+#include <rte_malloc.h>
+
+#include <tle_event.h>
+#include <tle_ctx.h>
+#include <tle_tcp.h>
+
+#include "log.h"
+#include "sock.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
/* Process-wide table mapping integer fds to their sock objects. */
struct fd_table {
	int fd_base; /* The minimum fd, 64 aligned */
	int fd_num; /* The number of fds, 64 aligned */
	struct rte_mempool *mp; /* O(1) get and put */
	struct sock **socks; /* indexed by (fd - fd_base) */
};
+
+extern bool fd_table_initialized;
+extern struct fd_table fd_table;
+
+static inline struct sock *
+fd2sock(int fd)
+{
+ return fd_table.socks[fd - fd_table.fd_base];
+}
+
/* Map a sock object back to its application-visible fd. */
static inline int
sock2fd(struct sock *so)
{
	return so->fd;
}
+
/*
 * Allocate an unused fd by taking a sock object from the fd mempool.
 *
 * @return the new fd, or -1 when every fd is in use.
 */
static inline int
get_unused_fd(void)
{
	struct sock *so;

	if (unlikely(rte_mempool_get(fd_table.mp, (void **)&so) < 0)) {
		GLUE_LOG(ERR, "FDs have been exhausted");
		return -1;
	}

	/* mark the sock as live */
	so->valid = 1;
	return sock2fd(so);
}
+
/*
 * Reset @ev to IDLE.  If the event is currently UP and carries data, it
 * is also unlinked from its queue's armed list; the state change and the
 * unlink happen together under the queue lock.
 */
static inline void
tle_event_idle_err(struct tle_event *ev)
{
	struct tle_evq *q;

	/* fast path: nothing to do for an already-idle event */
	if (ev->state == TLE_SEV_IDLE)
		return;

	q = ev->head;
	rte_compiler_barrier();

	rte_spinlock_lock(&q->lock);
	/* re-check state under the lock; another thread may have raced */
	if (ev->state == TLE_SEV_UP && ev->data) {
		TAILQ_REMOVE(&q->armed, ev, ql);
		q->nb_armed--;
	}
	ev->state = TLE_SEV_IDLE;
	rte_spinlock_unlock(&q->lock);
}
+
+static inline void
+put_free_fd(int fd)
+{
+ struct sock *so = fd2sock(fd);
+
+ rte_mempool_put(fd_table.mp, so);
+}
+
/* Kernel-managed fds occupy [0, fd_base); glue fds start at fd_base. */
static inline bool
is_kernel_fd(int fd)
{
	return fd < fd_table.fd_base;
}
+
+void fd_init(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _TLE_GLUE_FD_H_ */
diff --git a/lib/libtle_glue/gateway.h b/lib/libtle_glue/gateway.h
new file mode 100644
index 0000000..29de6b1
--- /dev/null
+++ b/lib/libtle_glue/gateway.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2019 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _TLE_GATEWAY_H_
+#define _TLE_GATEWAY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static inline bool
+is_ipv4_loopback_addr(in_addr_t addr, struct glue_ctx *ctx)
+{
+ if (addr == ctx->ipv4 || addr == htonl(INADDR_LOOPBACK))
+ return true;
+ else
+ return false;
+}
+
+static inline bool
+is_ipv6_loopback_addr(const struct in6_addr *addr, struct glue_ctx *ctx)
+{
+ if (memcmp(addr, &ctx->ipv6, sizeof(struct in6_addr)) == 0 ||
+ IN6_IS_ADDR_LOOPBACK(addr) ||
+ (IN6_IS_ADDR_V4COMPAT(addr) &&
+ addr->__in6_u.__u6_addr32[3] == htonl(INADDR_LOOPBACK)) ||
+ (IN6_IS_ADDR_V4MAPPED(addr) &&
+ addr->__in6_u.__u6_addr32[3] == htonl(INADDR_LOOPBACK)))
+ return true;
+ else
+ return false;
+}
+
/*
 * Choose the next hop for an IPv4 destination.
 *
 * @param data glue_ctx pointer (callback-style signature).
 * @param addr destination address, network byte order.
 * @return @addr itself when the destination is local or on-link, the
 *         configured gateway when one exists, otherwise @addr.
 */
static inline const struct in_addr *
ipv4_gateway_lookup(void *data, const struct in_addr *addr)
{
	uint8_t ls;
	struct glue_ctx *ctx = data;

	if (is_ipv4_loopback_addr(addr->s_addr, ctx))
		return addr;

	/* On-link test: discard the host bits and compare prefixes.
	 * NOTE(review): both values are network byte order, so shifting
	 * out the high bits keeps the leading octets only on
	 * little-endian hosts -- confirm if big-endian support matters. */
	ls = 32 - ctx->ipv4_ml;
	if ((addr->s_addr << ls) == (ctx->ipv4 << ls))
		return addr;

	if (ctx->ipv4_gw.s_addr != 0)
		return &ctx->ipv4_gw;

	/* no gateway configured: fall back to direct delivery */
	return addr;
}
+
+static inline const struct in6_addr *
+ipv6_gateway_lookup(void *data, const struct in6_addr *addr)
+{
+ uint8_t ls;
+ struct glue_ctx *ctx = data;
+
+ if (is_ipv6_loopback_addr(addr, ctx))
+ return addr;
+
+ if (ctx->ipv6_ml <= 64) {
+ ls = 64 - ctx->ipv6_ml;
+ if ((*(const uint64_t*)addr << ls) ==
+ (*(const uint64_t*)&ctx->ipv6 << ls))
+ return addr;
+ } else if (*(const uint64_t*)addr == *(const uint64_t*)&ctx->ipv6) {
+ ls = 128 - ctx->ipv6_ml;
+ if ((*((const uint64_t*)addr + 1) << ls) ==
+ (*((const uint64_t*)&ctx->ipv6 + 1) << ls))
+ return addr;
+ }
+
+ if (!IN6_IS_ADDR_UNSPECIFIED(&ctx->ipv6_gw))
+ return &ctx->ipv6_gw;
+
+ return addr;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _TLE_GATEWAY_H_ */
diff --git a/lib/libtle_glue/icmp.c b/lib/libtle_glue/icmp.c
new file mode 100644
index 0000000..aba1c4b
--- /dev/null
+++ b/lib/libtle_glue/icmp.c
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <time.h>
+#include <netinet/icmp6.h>
+
+#include <rte_common.h>
+#include <rte_byteorder.h>
+#include <rte_ethdev.h>
+#include <rte_icmp.h>
+#include <rte_ip.h>
+
+#include "log.h"
+#include "ctx.h"
+#include "internal.h"
+
+#define ICMP_ECHOREPLY 0 /* Echo Reply */
+#define ICMP_ECHO 8 /* Echo Request */
+#define ICMP_TIMESTAMP 13 /* Timestamp Request */
+#define ICMP_TIMESTAMPREPLY 14 /* Timestamp Reply */
+
+/* Codes for TIME_EXCEEDED. */
+#define ICMP_EXC_TTL 0 /* TTL count exceeded */
+#define ICMP_EXC_FRAGTIME 1 /* Fragment Reass time exceeded */
+
+/* Parameters used to convert the timespec values */
+#define SECONDS_PER_DAY 86400L
+#define MSEC_PER_SEC 1000L
+#define USEC_PER_MSEC 1000L
+#define NSEC_PER_USEC 1000L
+#define NSEC_PER_MSEC (NSEC_PER_USEC * USEC_PER_MSEC)
+
+#define IS_IPV4_BCAST(x) ((x) == (uint32_t)0xFFFFFFFF)
+
/* ICMP timestamp message layout (RFC 792): header followed by the
 * originate/receive/transmit timestamps. */
struct icmp_pkt {
	struct icmp_hdr icmp_h;
	uint32_t times[3];
};
+
/* Remainder of ``dividend / divisor``, reduced to 32 bits. */
static inline uint32_t
div_uint64_rem(uint64_t dividend, uint32_t divisor)
{
	uint64_t quotient = dividend / divisor;

	return (uint32_t)(dividend - quotient * divisor);
}
+
+/* Return milliseconds since midnight (UTC) in network byte order. */
+static uint32_t
+current_timestamp(void)
+{
+ struct timespec ts;
+ uint32_t msecs;
+ uint32_t secs;
+
+ (void)clock_gettime(CLOCK_REALTIME, &ts);
+
+ /* Get secs since midnight. */
+ secs = div_uint64_rem(ts.tv_sec, SECONDS_PER_DAY);
+ /* Convert to msecs. */
+ msecs = secs * MSEC_PER_SEC;
+ /* Convert nsec to msec. */
+ msecs += (uint32_t)ts.tv_nsec / NSEC_PER_MSEC;
+
+ /* Convert to network byte order. */
+ return rte_cpu_to_be_32(msecs);
+}
+
+/*
+ * Process the checksum of an ICMP packet. The checksum field must be set
+ * to 0 by the caller.
+ */
+static uint16_t
+icmp_cksum(const struct icmp_hdr *icmp, uint32_t data_len)
+{
+ uint16_t cksum;
+
+ cksum = rte_raw_cksum(icmp, sizeof(struct icmp_hdr) + data_len);
+ return (cksum == 0xffff) ? cksum : ~cksum;
+}
+
/**
 * Receive and handle an ICMP packet.
 *
 * Replies in place to echo requests and timestamp requests; every other
 * type, and any broadcast/multicast destination, is dropped.  The mbuf
 * is consumed either way (transmitted or freed).
 *
 * @param ctx
 *   The pointer to the glue context.
 * @param pkt
 *   The pointer to the raw packet data.
 * @param l2_len
 *   The size of the l2 header.
 * @param l3_len
 *   The size of the l3 header.
 * @return
 *   MUST return NULL now. :-)
 */
struct rte_mbuf *
icmp_recv(struct glue_ctx *ctx, struct rte_mbuf *pkt,
	uint32_t l2_len, uint32_t l3_len)
{
	struct ether_addr eth_addr;
	struct icmp_pkt *icmp_pkt;
	struct ether_hdr *eth_h;
	struct icmp_hdr *icmp_h;
	struct ipv4_hdr *ip_h;
	uint32_t ip_addr;
	uint32_t cksum;

	eth_h = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
	ip_h = (struct ipv4_hdr *) ((char *)eth_h + l2_len);

	icmp_h = (struct icmp_hdr *)((char *)ip_h + l3_len);
	if (icmp_h->icmp_type != IP_ICMP_ECHO_REQUEST &&
	    icmp_h->icmp_type != ICMP_TIMESTAMP)
		goto drop_pkt;

	icmp_pkt = (struct icmp_pkt *)icmp_h;

	/* turn the frame around: the reply goes back to the sender's MAC */
	ether_addr_copy(&eth_h->s_addr, &eth_addr);
	ether_addr_copy(&eth_h->d_addr, &eth_h->s_addr);
	ether_addr_copy(&eth_addr, &eth_h->d_addr);

	/*
	 * Similar to Linux implementation, we silently drop the broadcast or
	 * multicast ICMP packets.
	 *
	 * RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be
	 * silently ignored.
	 * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently
	 * discarded if to broadcast/multicast.
	 */
	ip_addr = rte_be_to_cpu_32(ip_h->dst_addr);
	if (IS_IPV4_MCAST(ip_addr) || IS_IPV4_BCAST(ip_addr))
		goto drop_pkt;

	/* swap src/dst IP addresses for the reply */
	ip_addr = ip_h->src_addr;
	ip_h->src_addr = ip_h->dst_addr;
	ip_h->dst_addr = ip_addr;

	if (icmp_h->icmp_type == IP_ICMP_ECHO_REQUEST &&
	    icmp_h->icmp_code == 0) {

		/* Must clear checksum field before calling the helper. */
		ip_h->hdr_checksum = 0;
		ip_h->hdr_checksum = rte_ipv4_cksum(ip_h);

		icmp_h->icmp_type = IP_ICMP_ECHO_REPLY;
		icmp_h->icmp_code = 0;

		/*
		 * Incremental checksum update (RFC 1624 style): only the
		 * type byte changed (request -> reply), so adjust the old
		 * checksum rather than recomputing over the payload.
		 *
		 * Fix me: the data part of an ICMP echo request/reply
		 * message is implementation specific, we don't know
		 * how to verify or calculate the checksum.
		 *
		 * Need to see BSD or LINUX implementation.
		 */
		cksum = ~icmp_h->icmp_cksum & 0xffff;
		cksum += ~rte_cpu_to_be_16(IP_ICMP_ECHO_REQUEST << 8) & 0xffff;
		cksum += rte_cpu_to_be_16(IP_ICMP_ECHO_REPLY << 8);
		cksum = (cksum & 0xffff) + (cksum >> 16);
		cksum = (cksum & 0xffff) + (cksum >> 16);
		icmp_h->icmp_cksum = ~cksum;

	} else if (icmp_h->icmp_type == ICMP_TIMESTAMP &&
		   icmp_h->icmp_code == 0) {

		/*
		 * RFC 1122: 3.2.2.8 MAY implement ICMP timestamp requests.
		 * SHOULD be in the kernel for minimum random latency.
		 * MUST be accurate to a few minutes.
		 * MUST be updated at least at 15Hz.
		 */
		icmp_h->icmp_type = ICMP_TIMESTAMPREPLY;
		icmp_h->icmp_code = 0;
		/* receive timestamp == transmit timestamp: we answer at once */
		icmp_pkt->times[1] = current_timestamp();
		icmp_pkt->times[2] = icmp_pkt->times[1];

		icmp_h->icmp_cksum = 0;
		/* the data part of an ICMP timestamp reply is 12 bytes. */
		icmp_h->icmp_cksum = icmp_cksum(icmp_h, 12);
	} else
		goto drop_pkt;

	/* pad runt frames to the Ethernet minimum.
	 * NOTE(review): rte_pktmbuf_append() can return NULL when tailroom
	 * is short; the return value is unchecked here -- confirm. */
	if (pkt->pkt_len < ETHER_MIN_LEN)
		rte_pktmbuf_append(pkt, ETHER_MIN_LEN - pkt->pkt_len);

	/* NOTE(review): if tx_burst sends 0 packets the mbuf looks neither
	 * freed nor retried -- potential leak; confirm. */
	if (rte_eth_tx_burst(ctx->port_id, ctx->queue_id, &pkt, 1))
		GLUE_LOG(DEBUG, "Send ICMP echo reply OK");

	return NULL;

drop_pkt:
	rte_pktmbuf_free(pkt);
	return NULL;
}
+
/**
 * Receive and handle an ICMPv6 packet.
 *
 * NDP neighbour solicit/advert packets are delegated to ndp_recv();
 * echo requests addressed to us are answered in place; everything else
 * is dropped.  The mbuf is consumed either way (transmitted or freed).
 *
 * @param ctx
 *   The pointer to the glue context.
 * @param pkt
 *   The pointer to the raw packet data.
 * @param l2_len
 *   The size of the l2 header.
 * @param l3_len
 *   The size of the l3 header.
 * @return
 *   MUST return NULL now. :-)
 */
struct rte_mbuf *
icmp6_recv(struct glue_ctx *ctx, struct rte_mbuf *pkt,
	   uint32_t l2_len, uint32_t l3_len)
{
	struct ether_addr eth_addr;
	struct ether_hdr *eth_h;
	struct icmp6_hdr *icmp6_h;
	struct ipv6_hdr *ipv6_h;
	struct in6_addr ipv6_addr;
	uint32_t cksum;

	eth_h = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
	ipv6_h = (struct ipv6_hdr *) ((char *)eth_h + l2_len);

	icmp6_h = (struct icmp6_hdr *)((char *)ipv6_h + l3_len);

	/* NDP pkt */
	if ((icmp6_h->icmp6_type == ND_NEIGHBOR_SOLICIT ||
	     icmp6_h->icmp6_type == ND_NEIGHBOR_ADVERT) &&
	    icmp6_h->icmp6_code == 0)
		return ndp_recv(ctx, pkt, l2_len, l3_len);

	/* only support ECHO now, other types of pkts are dropped */
	if ((icmp6_h->icmp6_type != ICMP6_ECHO_REQUEST &&
	     icmp6_h->icmp6_type != ICMP6_ECHO_REPLY) ||
	    icmp6_h->icmp6_code != 0)
		goto drop_pkt;

	/* turn the frame around: the reply goes back to the sender's MAC */
	ether_addr_copy(&eth_h->s_addr, &eth_addr);
	ether_addr_copy(&eth_h->d_addr, &eth_h->s_addr);
	ether_addr_copy(&eth_addr, &eth_h->d_addr);

	/*
	 * Now, we silently drop the anycast or multicast ICMP packets.
	 * But it does not conform to RFC 4443. Maybe fix it latter.
	 *
	 * RFC 4443: 4.2 An Echo Reply SHOULD be sent in response to an
	 * Echo Request message sent to an IPv6 multicast or anycast address.
	 * In this case, the source address of the reply MUST be a unicast
	 * address belonging to the interface on which the Echo Request
	 * message was received.
	 */
	switch (icmp6_h->icmp6_type) {
	case ICMP6_ECHO_REQUEST:
		if (memcmp(ipv6_h->dst_addr, &ctx->ipv6,
			   sizeof(struct in6_addr)) != 0)
			goto drop_pkt;

		/* swap src/dst IPv6 addresses for the reply */
		rte_memcpy(&ipv6_addr, ipv6_h->src_addr,
			   sizeof(struct in6_addr));
		rte_memcpy(ipv6_h->src_addr, ipv6_h->dst_addr,
			   sizeof(struct in6_addr));
		rte_memcpy(ipv6_h->dst_addr, &ipv6_addr,
			   sizeof(struct in6_addr));

		icmp6_h->icmp6_type = ICMP6_ECHO_REPLY;

		/* Incremental checksum update (RFC 1624 style): swapping
		 * src/dst leaves the pseudo-header sum unchanged, so only
		 * the type byte change needs the fix-up. */
		cksum = ~icmp6_h->icmp6_cksum & 0xffff;
		cksum += ~rte_cpu_to_be_16(ICMP6_ECHO_REQUEST << 8) & 0xffff;
		cksum += rte_cpu_to_be_16(ICMP6_ECHO_REPLY << 8);
		cksum = (cksum & 0xffff) + (cksum >> 16);
		cksum = (cksum & 0xffff) + (cksum >> 16);
		icmp6_h->icmp6_cksum = ~cksum;

		break;
	default:
		goto drop_pkt;
	}

	/* pad runt frames to the Ethernet minimum.
	 * NOTE(review): rte_pktmbuf_append() can return NULL when tailroom
	 * is short; the return value is unchecked here -- confirm. */
	if (pkt->pkt_len < ETHER_MIN_LEN)
		rte_pktmbuf_append(pkt, ETHER_MIN_LEN - pkt->pkt_len);

	/* NOTE(review): if tx_burst sends 0 packets the mbuf looks neither
	 * freed nor retried -- potential leak; confirm. */
	if (rte_eth_tx_burst(ctx->port_id, ctx->queue_id, &pkt, 1))
		GLUE_LOG(DEBUG, "Send ICMP echo reply OK");

	return NULL;

drop_pkt:
	rte_pktmbuf_free(pkt);
	return NULL;
}
diff --git a/lib/libtle_glue/init.c b/lib/libtle_glue/init.c
new file mode 100644
index 0000000..d845ef8
--- /dev/null
+++ b/lib/libtle_glue/init.c
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <sched.h>
+#include <pthread.h>
+#include <stdlib.h>
+
+#include <rte_compat.h>
+#include <rte_common.h>
+#include <rte_debug.h>
+#include <rte_eal.h>
+
+#include "util.h"
+#include "fd.h"
+#include "ctx.h"
+#include "sym.h"
+#include "log.h"
+#include "internal.h"
+#include "tle_glue.h"
+
/*
 * Library-style initialization entry point: bring up the DPDK EAL with
 * the given arguments and initialize the fd table.  Panics if EAL
 * initialization fails.
 */
void
glue_init1(int argc, char **argv)
{
	GLUE_LOG(INFO, "init: DPDK and fd table...");

	if (rte_eal_init(argc, argv) < 0)
		rte_panic("Failed to init DPDK");

	fd_init();
}
+
/*
 * Automatic, environment-driven initialization.  Runs as a constructor
 * with priority 1000 (after DPDK's own constructors).  Builds an EAL
 * argv from the DPDK_PARAMS / DPDK_NO_HUGE / DPDK_VNIC environment
 * variables; when none is set, returns without initializing (the
 * application is then expected to call glue_init1() itself).
 */
static void __attribute__((constructor(1000)))
glue_init(void)
{
	char *p;
	int i, err, argc = 0;
	char **argv = NULL, **argv_to_release = NULL;
	char *vnic, *params, *no_huge;
	cpu_set_t cpuset;
	pthread_t tid = pthread_self();

	symbol_init();

#define DPDK_PARAMS "DPDK_PARAMS"
	params = getenv(DPDK_PARAMS);
#define DPDK_NO_HUGE "DPDK_NO_HUGE"
	no_huge = getenv(DPDK_NO_HUGE);
#define DPDK_VNIC "DPDK_VNIC"
	vnic = getenv(DPDK_VNIC);

	if (params == NULL && no_huge == NULL && vnic == NULL)
		return;

	/* argv[0]: a dummy program name for EAL */
	argv = grow_argv(argv, argc, 1);
	argv[argc++] = xstrdup("userspace-stack");

	/* Get the main thread affinity */
	CPU_ZERO(&cpuset);
	err = pthread_getaffinity_np(tid, sizeof(cpu_set_t), &cpuset);
	if (!err) {
		/* pin EAL to the first CPU of the current affinity set;
		 * setting i = CPU_SETSIZE terminates the scan */
		for (i = 0; i < CPU_SETSIZE; i++) {
			if (CPU_ISSET(i, &cpuset)) {
				argv = grow_argv(argv, argc, 2);
				argv[argc++] = xstrdup("-l");
				argv[argc++] = xasprintf("%d", i);
				i = CPU_SETSIZE;
			}
		}
	} else {
		/* affinity unknown: default to CPU 0 */
		argv = grow_argv(argv, argc, 2);
		argv[argc++] = xstrdup("-l");
		argv[argc++] = xasprintf("0");
	}

	/* NOTE(review): strtok() writes NULs into the buffer returned by
	 * getenv(), i.e. it mutates the process environment value of
	 * DPDK_PARAMS -- confirm this is acceptable. */
	if (params)
		p = strtok(params, " ");
	else
		p = NULL;
	while (p != NULL) {
		argv = grow_argv(argv, argc, 1);
		argv[argc++] = xstrdup(p);
		p = strtok(NULL, " ");
	}

	if (no_huge) {
		argv = grow_argv(argv, argc, 3);
		argv[argc++] = xstrdup("-m");
		argv[argc++] = xstrdup("2048");
		argv[argc++] = xstrdup("--no-huge");
	}

	if (vnic) {
		argv = grow_argv(argv, argc, 2);
		argv[argc++] = xstrdup(vnic);
		argv[argc++] = xstrdup("--no-pci");
	}

	/* terminate EAL options */
	argv = grow_argv(argv, argc, 1);
	argv[argc++] = xstrdup("--");

	/* keep a second copy of the pointers: EAL may permute argv, and we
	 * still need the originals to free them afterwards */
	argv_to_release = grow_argv(argv_to_release, 0, argc);
	for (i = 0; i < argc; ++i)
		argv_to_release[i] = argv[i];

	glue_init1(argc, argv);

	/* Alloc and setup this default ctx for any sockets operations before
	 * thread/ctx binding which happens when epoll_wait.
	 */
	glue_ctx_alloc();

	release_argv(argc, argv_to_release, argv);

	/* Set back the affinity (EAL init pins the calling thread) */
	err = pthread_setaffinity_np(tid, sizeof(cpu_set_t), &cpuset);
	if (err)
		GLUE_LOG(ERR, "Failed to set back affinity");
}
+
/*
 * Destructor: kill every live TCP stream, then drain each context's
 * backend processing loop until it reports no more work.
 */
static void __attribute__((destructor))
glue_uninit(void)
{
	struct sock *so;
	struct glue_ctx *ctx;
	int i, max = fd_table.fd_base + fd_table.fd_num;

	/* TODO: lets optimize it */
	for (i = fd_table.fd_base; i < max; i++) {
		so = fd2sock(i);
		if (!so || !so->valid)
			continue;
		if (IS_TCP(so))
			tle_tcp_stream_kill(so->s);
	}

	/* run each context's backend until idle */
	for (i = 0; i < nb_ctx; ++i) {
		ctx = glue_ctx_lookup(0, i);
		while (be_process(ctx)) { /* empty */ };
	}
}
diff --git a/lib/libtle_glue/internal.h b/lib/libtle_glue/internal.h
new file mode 100644
index 0000000..91fe784
--- /dev/null
+++ b/lib/libtle_glue/internal.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _TLE_GLUE_INTERNAL_H_
+#define _TLE_GLUE_INTERNAL_H_
+
+#include <rte_mbuf.h>
+#include <rte_atomic.h>
+
+#include <tle_ctx.h>
+
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <sys/epoll.h>
+
+#include "ctx.h"
+#include "sym.h"
+#include <rte_mempool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int stopped;
+
+extern uint64_t rx_offload;
+extern uint64_t tx_offload;
+
+void port_reconfig(void);
+
+uint16_t create_loopback(uint32_t socket_id);
+
+struct rte_mempool * get_mempool_by_socket(int32_t socket_id);
+
+int be_process(struct glue_ctx *ctx);
+
+int be_tx(struct glue_ctx *ctx);
+
+struct rte_mbuf * arp_recv(struct glue_ctx *ctx,
+ struct rte_mbuf *m, uint32_t l2len);
+
+struct rte_mbuf * ndp_recv(struct glue_ctx *ctx,
+ struct rte_mbuf *m, uint32_t l2len, uint32_t l3len);
+
+
+void mac_check(struct glue_ctx *ctx, const struct sockaddr* addr);
+
+int arp_ipv4_dst_lookup(void *data, const struct in_addr *addr,
+ struct tle_dest *res, int proto);
+
+int arp_ipv6_dst_lookup(void *data, const struct in6_addr *addr,
+ struct tle_dest *res, int proto);
+
+int mac_fill(struct glue_ctx *ctx, struct rte_mbuf *m);
+
+void mac_timeout(struct glue_ctx *ctx);
+
+int setup_rx_cb(uint16_t port_id, uint16_t qid);
+
+int epoll_kernel_wait(struct glue_ctx *ctx, int efd,
+ struct epoll_event *events,
+ int maxevents, int timeout, int *rx);
+
+int poll_common(struct glue_ctx *ctx, struct epoll_event *events,
+ int maxevents, int timeout, int shadow_efd);
+
+int dev_rxq_wakeup(uint16_t port_id);
+
+struct rte_mbuf * icmp_recv(struct glue_ctx *ctx, struct rte_mbuf *pkt,
+ uint32_t l2len, uint32_t l3len);
+
+struct rte_mbuf * icmp6_recv(struct glue_ctx *ctx, struct rte_mbuf *pkt,
+ uint32_t l2len, uint32_t l3len);
+
+uint16_t typen_rx_callback(uint16_t port, uint16_t queue,
+ struct rte_mbuf *pkt[], uint16_t nb_pkts,
+ uint16_t max_pkts, void *user_param);
+
+void ipv4_dst_add(struct glue_ctx *ctx, const struct in_addr *addr,
+ struct ether_addr *e_addr);
+
+void ipv6_dst_add(struct glue_ctx *ctx, const struct in6_addr *addr,
+ struct ether_addr *e_addr);
+
+#ifdef LOOK_ASIDE_BACKEND
+extern rte_atomic32_t flag_sleep;
+
+enum {
+ IOTHREAD_BUSY = 0, /* io thread is busy */
+ IOTHREAD_SLEEP, /* io thread is sleeping */
+ IOTHREAD_PREEMPT, /* io thread is preempted by another worker thread */
+};
+
/*
 * Park the io thread in kernel epoll while advertising IOTHREAD_SLEEP.
 * On wakeup, spin until the flag can be flipped SLEEP -> BUSY; the
 * cmpset keeps failing while a worker holds the flag at PREEMPT (see
 * be_tx_with_lock), so we never run concurrently with a preemptor.
 */
static inline int
sleep_with_lock(int efd, struct epoll_event *events, int max, int to)
{
	int rc;

	rte_atomic32_set(&flag_sleep, IOTHREAD_SLEEP);
	rc = k_epoll_pwait(efd, events, max, to, NULL);
	while (rte_atomic32_cmpset((volatile uint32_t *)&flag_sleep,
				   IOTHREAD_SLEEP, IOTHREAD_BUSY) == 0);

	return rc;
}
+
/*
 * Worker-side tx flush: only runs when the io thread is asleep, taking
 * the flag SLEEP -> PREEMPT for the duration and restoring SLEEP after.
 * If the io thread is busy, the cmpset fails and nothing is done.
 */
static inline void
be_tx_with_lock(struct glue_ctx *ctx)
{
	if (rte_atomic32_cmpset((volatile uint32_t *)&flag_sleep,
				IOTHREAD_SLEEP, IOTHREAD_PREEMPT)) {
		while (be_tx(ctx) > 0) {};
		rte_atomic32_set(&flag_sleep, IOTHREAD_SLEEP);
	}
}
+
/*
 * If a worker preempted the sleeping io thread, kick the rx queue so
 * the io thread returns from epoll and re-enters its processing loop.
 */
static inline void
wake_lookaside_backend(struct glue_ctx *ctx)
{
	if (rte_atomic32_read(&flag_sleep) == IOTHREAD_PREEMPT)
		dev_rxq_wakeup(ctx->port_id);
}
+
/* True while the io thread is parked inside sleep_with_lock(). */
static inline bool
io_thread_in_sleep(void)
{
	return rte_atomic32_read(&flag_sleep) == IOTHREAD_SLEEP;
}
+#else
+#define sleep_with_lock k_epoll_wait
+#define be_tx_with_lock(ctx) do {} while(0)
+#define wake_lookaside_backend(ctx) do {} while(0)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _TLE_GLUE_INTERNAL_H_ */
diff --git a/lib/libtle_glue/log.h b/lib/libtle_glue/log.h
new file mode 100644
index 0000000..da31ea3
--- /dev/null
+++ b/lib/libtle_glue/log.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2019 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _GLUE_LOG_H_
+#define _GLUE_LOG_H_
+
+#include <arpa/inet.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <rte_vect.h>
+#include <rte_memcpy.h>
+#include <rte_spinlock.h>
+#include <rte_log.h>
+#include <rte_errno.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * logging related macros.
+ */
+
+#define GLUE_LOG(lvl, fmt, args...) RTE_LOG(lvl, USER1, fmt "\n", ##args)
+
+#define DUMMY_MACRO do {} while (0)
+
+#ifdef ENABLE_DEBUG
+#define GLUE_DEBUG(fmt, arg...) fprintf(stderr, fmt "\n", ##arg)
+#else
+#define GLUE_DEBUG(fmt, arg...) DUMMY_MACRO
+#endif
+
+#ifdef ENABLE_TRACE
+#define TRACE(fmt, arg...) fprintf(stderr, fmt "\n", ##arg)
+#define PKT_DUMP(p) rte_pktmbuf_dump(stderr, (p), 64)
+#else
+#define TRACE(fmt, arg...) DUMMY_MACRO
+#define PKT_DUMP(p) DUMMY_MACRO
+#endif
+
+#ifdef DEBUG_ARP
+static inline void
+print_arp(int af, const void *src, const struct ether_addr *mac,
+ const char *action)
+{
+ char str_ip[64];
+ char str_mac[32];
+ socklen_t sz;
+
+ ether_format_addr(str_mac, sizeof(str_mac), mac);
+ sz = (af == AF_INET) ? sizeof(struct in_addr) : sizeof(struct in6_addr);
+ inet_ntop(af, src, str_ip, sz);
+ RTE_LOG(INFO, "%s ARP entry: %s\tmac=%s", action, str_ip, str_mac);
+}
+#else
+#define print_arp(arg...) DUMMY_MACRO
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _GLUE_LOG_H_ */
diff --git a/lib/libtle_glue/ndp.h b/lib/libtle_glue/ndp.h
new file mode 100644
index 0000000..a61ff5b
--- /dev/null
+++ b/lib/libtle_glue/ndp.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2019 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _TLE_NDP_H_
+#define _TLE_NDP_H_
+
+#define ND_OPT_SOURCE_LINKLAYER_ADDR 1
+#define ND_OPT_TARGET_LINKLAYER_ADDR 2
+#define ND_OPT_PREFIX_INFORMATION 3
+#define ND_OPT_REDIRECTED_HEADER 4
+#define ND_OPT_MTU 5
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _TLE_NDP_H_ */
diff --git a/lib/libtle_glue/packetdrill.c b/lib/libtle_glue/packetdrill.c
new file mode 100644
index 0000000..79d1d52
--- /dev/null
+++ b/lib/libtle_glue/packetdrill.c
@@ -0,0 +1,544 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <stdarg.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <arpa/inet.h>
+
+#include "packetdrill.h"
+#include "tle_glue.h"
+#include "internal.h"
+#include "fd.h"
+
+#include <rte_arp.h>
+#include <rte_common.h>
+#include <rte_ethdev.h>
+#include <rte_ip.h>
+#include <rte_vhost.h>
+
+static int vhost_vid;
+enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
+static const char *sockname = "/tmp/sock0";
+
+static int
+new_device(int vid)
+{
+ vhost_vid = vid;
+
+ /* Disable notifications. */
+ rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
+ rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
+
+ return 0;
+}
+
/* vhost callback: frontend disconnected; no per-device state to release. */
static void
destroy_device(int vid)
{
	(void)vid;
}
+
+static const struct vhost_device_ops device_ops =
+{
+ .new_device = new_device,
+ .destroy_device = destroy_device,
+};
+
/* Create a vhost-user server socket at @sockname, register the
 * connect/disconnect callbacks and start the vhost session.
 * Exits the process on any failure.
 */
static void
vhost_init(void)
{
	/* Remove a stale socket file left over from a previous run. */
	unlink(sockname);

	if (rte_vhost_driver_register(sockname, 0) != 0)
		rte_exit(EXIT_FAILURE, "failed to register vhost driver \n");

	if (rte_vhost_driver_callback_register(sockname, &device_ops) != 0)
		rte_exit(EXIT_FAILURE, "failed to register vhost driver callbacks.\n");

	if (rte_vhost_driver_start(sockname) < 0)
		rte_exit(EXIT_FAILURE, "failed to start vhost driver.\n");

	/* Quieten the vhost library's per-packet logging. */
	rte_log_set_level(RTE_LOGTYPE_USER1, RTE_LOG_NOTICE);
}
+
/* Current wall-clock time in microseconds since the Unix epoch. */
static uint64_t
now_usecs(void)
{
	struct timeval now;

	gettimeofday(&now, NULL);
	return (uint64_t)now.tv_sec * 1000000ULL + (uint64_t)now.tv_usec;
}
+
/* The pd_* callbacks below adapt packetdrill's plugin interface (which
 * threads an opaque @userdata cookie through every call) to this
 * library's PRE()-prefixed socket-API implementations.  The cookie is
 * unused throughout. */

/* Destructor callback: no per-interface state is kept, nothing to free. */
static void
pd_free(void *userdata)
{
	RTE_SET_USED(userdata);
}

static int
pd_socket(void *userdata, int domain, int type, int protocol)
{
	RTE_SET_USED(userdata);
	return PRE(socket)(domain, type, protocol);
}

static int
pd_bind(void *userdata, int sockfd, const struct sockaddr *addr,
	socklen_t addrlen)
{
	RTE_SET_USED(userdata);
	return PRE(bind)(sockfd, addr, addrlen);
}

static int
pd_listen(void *userdata, int sockfd, int backlog)
{
	RTE_SET_USED(userdata);
	return PRE(listen)(sockfd, backlog);
}

static int
pd_accept(void *userdata, int sockfd, struct sockaddr *addr,
	  socklen_t *addrlen)
{
	RTE_SET_USED(userdata);
	return PRE(accept)(sockfd, addr, addrlen);
}

static int
pd_connect(void *userdata, int sockfd, const struct sockaddr *addr,
	   socklen_t addrlen)
{
	RTE_SET_USED(userdata);
	return PRE(connect)(sockfd, addr, addrlen);
}
+
/* I/O callbacks: forward packetdrill's read/write family straight to the
 * user-space stack's PRE()-prefixed implementations; @userdata is unused. */

static ssize_t
pd_read(void *userdata, int fd, void *buf, size_t count)
{
	RTE_SET_USED(userdata);
	return PRE(read)(fd, buf, count);
}

static ssize_t
pd_readv(void *userdata, int fd, const struct iovec *iov, int iovcnt)
{
	RTE_SET_USED(userdata);
	return PRE(readv)(fd, iov, iovcnt);
}

static ssize_t
pd_recv(void *userdata, int sockfd, void *buf, size_t len, int flags)
{
	RTE_SET_USED(userdata);
	return PRE(recv)(sockfd, buf, len, flags);
}

static ssize_t
pd_recvfrom(void *userdata, int sockfd, void *buf, size_t len,
	    int flags, struct sockaddr *src_addr, socklen_t *addrlen)
{
	RTE_SET_USED(userdata);
	return PRE(recvfrom)(sockfd, buf, len, flags, src_addr, addrlen);
}

static ssize_t
pd_recvmsg(void *userdata, int sockfd, struct msghdr *msg, int flags)
{
	RTE_SET_USED(userdata);
	return PRE(recvmsg)(sockfd, msg, flags);
}

static ssize_t
pd_write(void *userdata, int fd, const void *buf, size_t count)
{
	RTE_SET_USED(userdata);
	return PRE(write)(fd, buf, count);
}

static ssize_t
pd_writev(void *userdata, int fd, const struct iovec *iov, int iovcnt)
{
	RTE_SET_USED(userdata);
	return PRE(writev)(fd, iov, iovcnt);
}

static ssize_t
pd_send(void *userdata, int sockfd, const void *buf, size_t len, int flags)
{
	RTE_SET_USED(userdata);
	return PRE(send)(sockfd, buf, len, flags);
}

static ssize_t
pd_sendto(void *userdata, int sockfd, const void *buf, size_t len, int flags,
	  const struct sockaddr *dest_addr, socklen_t addrlen)
{
	RTE_SET_USED(userdata);
	return PRE(sendto)(sockfd, buf, len, flags, dest_addr, addrlen);
}

static ssize_t
pd_sendmsg(void *userdata, int sockfd, const struct msghdr *msg, int flags)
{
	RTE_SET_USED(userdata);
	return PRE(sendmsg)(sockfd, msg, flags);
}
+
/* Variadic fcntl forwarder.  The single optional argument is fetched as a
 * void * and passed through.
 * NOTE(review): this relies on int-sized fcntl arguments (e.g. F_SETFL
 * flags) being retrievable as void * -- true on the common SysV ABIs this
 * library targets, but not guaranteed by C; verify on new platforms. */
static int
pd_fcntl(void *userdata, int fd, int cmd, ...)
{
	void *arg;
	va_list ap;

	va_start(ap, cmd);
	arg = va_arg(ap, void *);
	va_end(ap);

	RTE_SET_USED(userdata);
	return PRE(fcntl)(fd, cmd, arg);
}

/* Variadic ioctl forwarder; same single-argument convention as pd_fcntl. */
static int
pd_ioctl(void *userdata, int fd, unsigned long request, ...)
{
	void *arg;
	va_list ap;

	va_start(ap, request);
	arg = va_arg(ap, void *);
	va_end(ap);

	RTE_SET_USED(userdata);
	return PRE(ioctl)(fd, request, arg);
}
+
/* Socket-lifecycle and option callbacks: thin forwarders to the stack's
 * PRE()-prefixed implementations; @userdata is unused. */

static int
pd_close(void *userdata, int fd)
{
	RTE_SET_USED(userdata);
	return PRE(close)(fd);
}

static int
pd_shutdown(void *userdata, int sockfd, int how)
{
	RTE_SET_USED(userdata);
	return PRE(shutdown)(sockfd, how);
}

static int
pd_getsockopt(void *userdata, int sockfd, int level, int optname,
	      void *optval, socklen_t *optlen)
{
	RTE_SET_USED(userdata);
	return PRE(getsockopt)(sockfd, level, optname, optval, optlen);
}

static int
pd_setsockopt(void *userdata, int sockfd, int level, int optname,
	      const void *optval, socklen_t optlen)
{
	RTE_SET_USED(userdata);
	return PRE(setsockopt)(sockfd, level, optname, optval, optlen);
}

static int
pd_poll(void *userdata, struct pollfd *fds, nfds_t nfds, int timeout)
{
	RTE_SET_USED(userdata);
	return PRE(poll)(fds, nfds, timeout);
}
+
+static struct rte_mbuf *
+from_buf_to_mbuf(const void *buf, size_t count)
+{
+ struct rte_mempool *mp = get_mempool_by_socket(0);
+ uint16_t nb_mbufs = (count + RTE_MBUF_DEFAULT_DATAROOM - 1) /
+ RTE_MBUF_DEFAULT_DATAROOM;
+ struct rte_mbuf *mbufs[nb_mbufs + 1];
+ uint16_t i, copy_len;
+ size_t done = 0;
+ char *dst;
+
+ if (unlikely(rte_pktmbuf_alloc_bulk(mp, mbufs, nb_mbufs) < 0))
+ rte_exit(EXIT_FAILURE, "allocate mbuf fails\n");
+
+ for (i = 0; i < nb_mbufs; ++i) {
+ copy_len = RTE_MIN((size_t)RTE_MBUF_DEFAULT_DATAROOM,
+ count - done);
+ dst = rte_pktmbuf_mtod(mbufs[i], char *);
+ rte_memcpy(dst, (const char *)buf + done, copy_len);
+ done += copy_len;
+ mbufs[i]->data_len = copy_len;
+ if (i > 0)
+ mbufs[i-1]->next = mbufs[i];
+ }
+
+ mbufs[0]->pkt_len = count;
+ mbufs[0]->nb_segs = nb_mbufs;
+
+ return mbufs[0];
+}
+
+/* Send @count bytes of data starting from @buf to the TCP stack.
+ * Return 0 on success or -1 on error.
+ */
+static int
+pd_netdev_send(void *userdata, const void *buf, size_t count)
+{
+ struct ether_hdr *hdr;
+ struct rte_mbuf *m;
+
+ RTE_SET_USED(userdata);
+
+ m = from_buf_to_mbuf(buf, count);
+
+ // add l2 header
+ hdr = (struct ether_hdr *)rte_pktmbuf_prepend(m, sizeof(struct ether_hdr));
+ hdr->ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4);
+
+ if (rte_vhost_enqueue_burst(vhost_vid, VIRTIO_RXQ, &m, 1) == 1)
+ return 0;
+
+ return -1;
+}
+
/* Copy up to @len bytes from mbuf chain @m into @buf.
 *
 * @ispeek  != 0: leave the chain untouched and return the original head.
 * @ispeek  == 0: consume the copied bytes -- fully drained segments are
 *                freed, a partially read segment is trimmed -- and return
 *                the new head (NULL if the whole chain was consumed).
 * @needcpy == 0: advance/consume only, skip the actual memcpy.
 */
static inline struct rte_mbuf *
from_mbuf_to_buf(struct rte_mbuf *m, char *buf, size_t len, int ispeek, int needcpy)
{
	void *src;
	uint32_t done = 0;
	uint32_t left = len, orig_pkt_len;
	uint16_t copy_len, seg_len;
	struct rte_mbuf *m_next, *orig_pkt;

	if (len == 0)
		return m;

	/* Remember the head and total length: the head may be freed below,
	 * and the surviving head needs a corrected pkt_len. */
	orig_pkt = m;
	orig_pkt_len = m->pkt_len;

	do {
		seg_len = rte_pktmbuf_data_len(m);
		copy_len = RTE_MIN(seg_len, left);
		src = rte_pktmbuf_mtod(m, void *);
		if (needcpy)
			rte_memcpy(buf + done, src, copy_len);
		done += copy_len;
		left -= copy_len;
		if (copy_len < seg_len) {
			/* Segment only partially consumed: trim the read
			 * bytes off its front and stop. */
			if (!ispeek) {
				rte_pktmbuf_adj(m, copy_len);
			}
			break;
		}
		/* Segment fully consumed: free it and move on. */
		m_next = m->next;
		if (!ispeek) {
			rte_pktmbuf_free_seg(m);
		}
		m = m_next;
	} while (left && m);

	/* The surviving first segment becomes the new chain head and must
	 * carry the remaining total length. */
	if (m && !ispeek)
		m->pkt_len = orig_pkt_len - done;

	if(ispeek)
		return orig_pkt;
	else
		return m;
}
+
/* Sniff the next packet leaving the TCP stack.
 * Put packet data in @buf. @count is passed in as the buffer size.
 * The actual number of bytes received should be put in @count.
 * Set @count to 0 if received nothing.
 * Set @time_usecs to the receive timestamp.
 * Return 0 on success or -1 on error. */
static int
pd_netdev_recv(void *userdata, void *buf, size_t *count, long long *time_usecs)
{
	struct rte_mbuf *m;
	struct rte_mempool *mp = get_mempool_by_socket(0);

	RTE_SET_USED(userdata);

	/* Busy-poll the vhost TX ring until the stack emits a packet. */
	while (rte_vhost_dequeue_burst(vhost_vid, VIRTIO_TXQ, mp, &m, 1) == 0);

	/* Strip the Ethernet header; packetdrill expects a raw IP packet. */
	rte_pktmbuf_adj(m, sizeof(struct ether_hdr));

	/* NOTE(review): the incoming *count (buffer size) is overwritten
	 * without checking pkt_len against it -- a packet larger than the
	 * caller's buffer would overflow it; verify packetdrill's sizing. */
	*count = m->pkt_len;
	from_mbuf_to_buf(m, buf, *count, 0, 1);

	*time_usecs = now_usecs();
	return 0;
}
+
+static int
+pd_usleep(void *userdata, useconds_t usec)
+{
+ RTE_SET_USED(userdata);
+ return usleep(usec);
+}
+
/* packetdrill callback: fetch the current wall-clock time. */
static int
pd_gettimeofday(void *userdata, struct timeval *tv, struct timezone *tz)
{
	(void)userdata;
	return gettimeofday(tv, tz);
}
+
/* Event-multiplexing and pipe callbacks.  epoll and splice go through the
 * stack's PRE()-prefixed implementations; pipe() is plain kernel fds. */

static int
pd_epoll_create(void *userdata, int size)
{
	RTE_SET_USED(userdata);
	return PRE(epoll_create)(size);
}

static int
pd_epoll_ctl(void *userdata, int epfd, int op, int fd,
	     struct epoll_event *event)
{
	RTE_SET_USED(userdata);
	return PRE(epoll_ctl)(epfd, op, fd, event);
}

static int
pd_epoll_wait(void *userdata, int epfd, struct epoll_event *events,
	      int maxevents, int timeout)
{
	RTE_SET_USED(userdata);
	return PRE(epoll_wait)(epfd, events, maxevents, timeout);
}

static int
pd_pipe(void *userdata, int pipefd[2])
{
	RTE_SET_USED(userdata);
	return pipe(pipefd);
}

static int
pd_splice(void *userdata, int fd_in, loff_t *off_in, int fd_out,
	  loff_t *off_out, size_t len, unsigned int flags)
{
	RTE_SET_USED(userdata);
	return PRE(splice)(fd_in, off_in, fd_out, off_out, len, flags);
}
+
+static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+
/* Background thread: configures the test stack's addresses and routes,
 * seeds the neighbor table with the gateway's MAC, releases the init
 * handshake lock and then polls stack events forever.
 */
static void *
io(void *arg)
{
	int epfd;
	struct in_addr ipv4;
	struct ether_addr mac = { .addr_bytes = { 0xee, 0xff, 0xff, 0xff, 0xff, 0xff}, };
	struct epoll_event events[128];

	RTE_SET_USED(arg);

	/* Local IPv4/IPv6 addresses used by the packetdrill scripts. */
	setenv(DPDK_IP, "192.168.0.2", 1);
	setenv(DPDK_IP_MASK, "16", 1);
	setenv(DPDK_IP_GATEWAY, "192.168.0.1", 1);

	setenv(DPDK_IPV6, "fd3d:fa7b:d17d::0", 1);
	setenv(DPDK_IPV6_MASK, "48", 1);
	setenv(DPDK_IPV6_GATEWAY, "fd3d:fa7b:d17d:8888::0", 1);

	epfd = PRE(epoll_create)(0);

	inet_pton(AF_INET, "192.168.0.1", &ipv4);

	/* Static neighbor entry for the gateway: TX never blocks on ARP. */
	ipv4_dst_add(default_ctx, &ipv4, &mac);

	/* NOTE(review): this unlocks a mutex locked by the spawning thread;
	 * POSIX leaves that undefined for default mutexes (it happens to
	 * work with glibc normal mutexes) -- a semaphore would be cleaner. */
	pthread_mutex_unlock(&lock);

	/* Busy event loop: timeout 0 keeps the stack polled continuously. */
	while (1) {
		PRE(epoll_wait)(epfd, events, 128, 0);
	}

	return NULL;
}
+
+void
+packetdrill_interface_init(const char *flags,
+ struct packetdrill_interface *ifc)
+{
+ int argc = 0;
+ char *argv[16];
+ pthread_t tid;
+
+ RTE_SET_USED(flags);
+
+ argv[argc++] = strdup("test");
+ argv[argc++] = strdup("-l");
+ argv[argc++] = strdup("0");
+ argv[argc++] = strdup("--no-pci");
+ argv[argc++] = strdup("--in-memory");
+ argv[argc++] = strdup("--single-file-segments");
+ argv[argc++] = strdup("--");
+
+ if (rte_eal_init(argc, argv) < 0)
+ rte_exit(EXIT_FAILURE, "Failed to init DPDK\n");
+
+ fd_init();
+
+ vhost_init();
+
+ if (rte_eal_hotplug_add("vdev", "virtio_user0", "path=/tmp/sock0") < 0)
+ rte_exit(EXIT_FAILURE, "hot plug virtio-user failed\n");
+
+ pthread_mutex_lock(&lock);
+
+ pthread_create(&tid, NULL, io, NULL);
+
+ pthread_mutex_lock(&lock);
+
+ ifc->free = pd_free;
+ ifc->socket = pd_socket;
+ ifc->bind = pd_bind;
+ ifc->listen = pd_listen;
+ ifc->accept = pd_accept;
+ ifc->connect = pd_connect;
+ ifc->read = pd_read;
+ ifc->readv = pd_readv;
+ ifc->recv = pd_recv;
+ ifc->recvfrom = pd_recvfrom;
+ ifc->recvmsg = pd_recvmsg;
+ ifc->write = pd_write;
+ ifc->writev = pd_writev;
+ ifc->send = pd_send;
+ ifc->sendto = pd_sendto;
+ ifc->sendmsg = pd_sendmsg;
+ ifc->fcntl = pd_fcntl;
+ ifc->ioctl = pd_ioctl;
+ ifc->close = pd_close;
+ ifc->shutdown = pd_shutdown;
+ ifc->getsockopt = pd_getsockopt;
+ ifc->setsockopt = pd_setsockopt;
+ ifc->poll = pd_poll;
+ ifc->netdev_send = pd_netdev_send;
+ ifc->netdev_receive = pd_netdev_recv;
+ ifc->usleep = pd_usleep;
+ ifc->gettimeofday = pd_gettimeofday;
+ ifc->epoll_create = pd_epoll_create;
+ ifc->epoll_ctl = pd_epoll_ctl;
+ ifc->epoll_wait = pd_epoll_wait;
+ ifc->pipe = pd_pipe;
+ ifc->splice = pd_splice;
+}
diff --git a/lib/libtle_glue/packetdrill.h b/lib/libtle_glue/packetdrill.h
new file mode 100644
index 0000000..6f84a87
--- /dev/null
+++ b/lib/libtle_glue/packetdrill.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright 2015 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: xiaoj@google.com (Xiao Jia)
+ *
+ * Interface for packetdrill.
+ *
+ * To be tested against as a shared object (*.so) file, implement this
+ * interface, export a function "packetdrill_interface_init", and
+ * initialize the interface struct passed in with your own functions.
+ */
+
+#ifndef __PACKETDRILL_H__
+#define __PACKETDRILL_H__
+
+#include <poll.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/epoll.h>
+#include <unistd.h>
+
/* Callback table a stack-under-test fills in for packetdrill.  Every
 * callback receives the opaque @userdata stored in this struct; the
 * socket-like entries mirror the POSIX calls of the same name. */
struct packetdrill_interface {
	void *userdata;
	void (*free)(void *userdata);
	int (*socket)(void *userdata, int domain, int type, int protocol);
	int (*bind)(void *userdata, int sockfd, const struct sockaddr *addr,
		    socklen_t addrlen);
	int (*listen)(void *userdata, int sockfd, int backlog);
	int (*accept)(void *userdata, int sockfd, struct sockaddr *addr,
		      socklen_t *addrlen);
	int (*connect)(void *userdata, int sockfd, const struct sockaddr *addr,
		       socklen_t addrlen);
	ssize_t (*read)(void *userdata, int fd, void *buf, size_t count);
	ssize_t (*readv)(void *userdata, int fd, const struct iovec *iov,
			 int iovcnt);
	ssize_t (*recv)(void *userdata, int sockfd, void *buf, size_t len,
			int flags);
	ssize_t (*recvfrom)(void *userdata, int sockfd, void *buf, size_t len,
			    int flags, struct sockaddr *src_addr,
			    socklen_t *addrlen);
	ssize_t (*recvmsg)(void *userdata, int sockfd, struct msghdr *msg,
			   int flags);
	ssize_t (*write)(void *userdata, int fd, const void *buf, size_t count);
	ssize_t (*writev)(void *userdata, int fd, const struct iovec *iov,
			  int iovcnt);
	ssize_t (*send)(void *userdata, int sockfd, const void *buf, size_t len,
			int flags);
	ssize_t (*sendto)(void *userdata, int sockfd, const void *buf,
			  size_t len, int flags,
			  const struct sockaddr *dest_addr, socklen_t addrlen);
	ssize_t (*sendmsg)(void *userdata, int sockfd, const struct msghdr *msg,
			   int flags);
	int (*fcntl)(void *userdata, int fd, int cmd, ...);
	int (*ioctl)(void *userdata, int fd, unsigned long request, ...);
	int (*close)(void *userdata, int fd);
	int (*shutdown)(void *userdata, int sockfd, int how);
	int (*getsockopt)(void *userdata, int sockfd, int level, int optname,
			  void *optval, socklen_t *optlen);
	int (*setsockopt)(void *userdata, int sockfd, int level, int optname,
			  const void *optval, socklen_t optlen);
	int (*poll)(void *userdata, struct pollfd *fds, nfds_t nfds,
		    int timeout);
	/* Send @count bytes of data starting from @buf to the TCP stack.
	 * Return 0 on success or -1 on error. */
	int (*netdev_send)(void *userdata, const void *buf, size_t count);
	/* Sniff the next packet leaving the TCP stack.
	 * Put packet data in @buf. @count is passed in as the buffer size.
	 * The actual number of bytes received should be put in @count.
	 * Set @count to 0 if received nothing.
	 * Set @time_usecs to the receive timestamp.
	 * Return 0 on success or -1 on error. */
	int (*netdev_receive)(void *userdata, void *buf, size_t *count,
			      long long *time_usecs);
	int (*usleep)(void *userdata, useconds_t usec);
	int (*gettimeofday)(void *userdata, struct timeval *tv,
			    struct timezone *tz);
	int (*epoll_create)(void *userdata, int size);
	int (*epoll_ctl)(void *userdata, int epfd, int op, int fd,
			 struct epoll_event *event);
	int (*epoll_wait)(void *userdata, int epfd, struct epoll_event *events,
			  int maxevents, int timeout);
	int (*pipe)(void *userdata, int pipefd[2]);
	int (*splice)(void *userdata, int fd_in, loff_t *off_in, int fd_out,
		      loff_t *off_out, size_t len, unsigned int flags);
};

/* Signature of the symbol packetdrill looks up after dlopen(). */
typedef void (*packetdrill_interface_init_t)(const char *flags,
					     struct packetdrill_interface *);

void
packetdrill_interface_init(const char *flags, struct packetdrill_interface *ifc);
+
+#endif /* __PACKETDRILL_H__ */
diff --git a/lib/libtle_glue/poll.c b/lib/libtle_glue/poll.c
new file mode 100644
index 0000000..ebc0110
--- /dev/null
+++ b/lib/libtle_glue/poll.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <signal.h>
+#include <poll.h>
+
+#include "fd.h"
+#include "ctx.h"
+#include "sym.h"
+#include "log.h"
+#include "util.h"
+#include "internal.h"
+#include "tle_glue.h"
+
/* poll() replacement that multiplexes kernel fds and user-stack fds.
 *
 * Strategy: (1) split the set and harvest already-pending user events;
 * (2) if everything is a kernel fd, delegate to the kernel's poll;
 * (3) otherwise do a non-blocking user-stack scan, and only if that is
 * empty park the kernel fds in a temporary epoll instance so that
 * poll_common() can wait on both sides for up to @timeout ms.
 */
int
PRE(poll)(struct pollfd *fds, nfds_t nfds, int timeout)
{
	int efd;
	int total = 0, j;
	int tmp_ev;
	uint32_t i;
	uint32_t k_n = 0;
	int k_fds[nfds];
	struct sock *so;
	struct glue_ctx *ctx;
	struct epoll_event k_ev;
	struct epoll_event events[nfds];

	/* Pass 1: classify fds and collect events that are ready now. */
	for (i = 0; i < nfds; ++i) {
		if (is_kernel_fd(fds[i].fd)) {
			k_fds[k_n++] = i;
			continue;
		}

		/* NOTE(review): an invalid sock leaves fds[i].revents
		 * untouched -- presumably callers zero it first; verify. */
		so = fd2sock(fds[i].fd);
		if (!so->valid)
			continue;

		fds[i].revents = fd_ready(fds[i].fd, fds[i].events);
		if (fds[i].revents) {
			total++;
			continue;
		}

		/* We fill sock->event here as we need this when
		 * we filter events in poll_common(). But it was
		 * originally set by epoll_ctl(). Now we have to
		 * assume that there are no application which
		 * uses epoll and poll at the same time.
		 */
		so->event.events = fds[i].events;
		so->event.data.u32 = i; /* store idx */
	}

	/* Only kernel fds: let the kernel handle the whole call. */
	if (k_n == nfds)
		return k_poll(fds, nfds, timeout);

	/* Something was already ready: report without blocking. */
	if (total > 0)
		return total;

	/* thread <> context binding happens here */
	if (RTE_PER_LCORE(glue_ctx) == NULL) {
		ctx = &ctx_array[glue_ctx_alloc()];
		RTE_PER_LCORE(glue_ctx) = ctx;
	} else
		ctx = RTE_PER_LCORE(glue_ctx);

	/* Non-blocking sweep of the user stack (timeout 0, no kernel efd). */
	total = poll_common(ctx, events, nfds, 0, -1);

	/* We assume kernel I/O events are not as important as user ones */
	if (total > 0)
		goto format;

	/* Park kernel fds in a short-lived epoll instance so poll_common()
	 * can block on user and kernel events simultaneously. */
	efd = k_epoll_create(1);
	if (efd < 0)
		rte_panic("k_epoll_create failed %d", errno);

	for (i = 0; i < k_n; ++i) {
		k_ev.events = fds[k_fds[i]].events;
		k_ev.data.u32 = k_fds[i]; /* store idx */
		k_epoll_ctl(efd, EPOLL_CTL_ADD, fds[k_fds[i]].fd, &k_ev);
	}

	total = poll_common(ctx, events, nfds, timeout, efd);
	k_close(efd);
format:
	/* Translate epoll-style events back into pollfd revents; a bare
	 * POLLHUP is widened so callers waiting on IN/OUT also wake up. */
	for (j = 0; j < total; ++j) {
		tmp_ev = events[j].events;
		if (tmp_ev == POLLHUP) {
			tmp_ev |= POLLERR | (fds[events[j].data.u32].events &
				(POLLIN | POLLOUT));
		}
		fds[events[j].data.u32].revents = tmp_ev;
	}

	return total;
}
+
+int
+PRE(ppoll)(struct pollfd *fds, nfds_t nfds,
+ const struct timespec *tmo_p, const sigset_t *sigmask)
+{
+ int timeout;
+
+ if (sigmask != NULL)
+ rte_panic("ppoll with signal is not supported");
+
+ if (tmo_p == NULL)
+ timeout = -1;
+ else
+ timeout = tmo_p->tv_sec * 1000 + tmo_p->tv_nsec / 1000000;
+
+ return poll(fds, nfds, timeout);
+}
+
extern int __poll_chk(struct pollfd *fds, nfds_t nfds, int timeout,
		      __SIZE_TYPE__ fdslen);

/* Fortified entry point emitted by glibc when _FORTIFY_SOURCE is enabled;
 * we skip the extra size check and simply forward to poll(). */
int
__poll_chk(struct pollfd *fds, nfds_t nfds, int timeout, __SIZE_TYPE__ fdslen)
{
	(void)fdslen;
	return poll(fds, nfds, timeout);
}
diff --git a/lib/libtle_glue/port.c b/lib/libtle_glue/port.c
new file mode 100644
index 0000000..7a4cf2e
--- /dev/null
+++ b/lib/libtle_glue/port.c
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <sys/eventfd.h>
+#include <unistd.h>
+
+#include <rte_ethdev.h>
+#include <rte_eth_ring.h>
+
+#include "log.h"
+#include "ctx.h"
+#include "config.h"
+#include "internal.h"
+
+int stopped;
+
+static struct rte_mempool *mpool[RTE_MAX_NUMA_NODES];
+
/* Return (creating on first use) the mbuf mempool for @socket_id.
 * SOCKET_ID_ANY is mapped to socket 0.  Panics if creation fails.
 * NOTE(review): the check-then-create sequence is not thread-safe --
 * presumably first use happens before worker threads start; verify.
 * rte_pktmbuf_dynamic_pool_create() comes from this project's DPDK
 * patches (0006/0007), not stock DPDK.
 */
struct rte_mempool *
get_mempool_by_socket(int32_t socket_id)
{
	struct rte_mempool *mp;
	char name[RTE_MEMPOOL_NAMESIZE];

	if (socket_id == SOCKET_ID_ANY)
		socket_id = 0;

	if (mpool[socket_id])
		return mpool[socket_id];

	snprintf(name, sizeof(name), "MP%u", socket_id);
	mp = rte_pktmbuf_dynamic_pool_create(name, MAX_MBUFS - 1,
					     MBUF_PERCORE_CACHE, 0,
					     RTE_MBUF_DEFAULT_BUF_SIZE,
					     socket_id, MBUF_DYNAMIC_SIZE);

	if (mp == NULL)
		rte_panic("Failed to create mbuf mempool");

	mpool[socket_id] = mp;
	return mp;
}
+
+static void
+update_rss_conf(uint16_t port_id)
+{
+ struct rte_eth_rss_conf rss_conf = {
+ .rss_key = NULL,
+ .rss_key_len = 0,
+ .rss_hf = ETH_RSS_IP | ETH_RSS_TCP | ETH_RSS_UDP,
+ };
+
+ if (rte_eth_dev_rss_hash_update(port_id, &rss_conf) < 0)
+ rte_panic("Failed to update rss hash");
+}
+
+static void
+queue_init(uint16_t port_id, uint16_t nb_queues,
+ struct rte_eth_dev_info *dev_info,
+ struct rte_eth_conf *port_conf)
+{
+ uint16_t q;
+ int32_t socket_id, rc;
+ uint16_t nb_rxd = 1024, nb_txd = 1024;
+ struct rte_mempool *mp;
+ struct rte_eth_txconf txq_conf = dev_info->default_txconf;
+ struct rte_eth_rxconf rxq_conf = dev_info->default_rxconf;
+
+ socket_id = rte_eth_dev_socket_id(port_id);
+ mp = get_mempool_by_socket(socket_id);
+
+ dev_info->default_rxconf.rx_drop_en = 1;
+
+ rc = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd);
+ if (rc < 0)
+ rte_panic("Cannot adjust number of desc");
+
+ rxq_conf.offloads = port_conf->rxmode.offloads;
+ txq_conf.offloads = port_conf->txmode.offloads;
+
+ /* faster free of tx entries */
+ txq_conf.tx_free_thresh = nb_txd - 64;
+
+ for (q = 0; q < nb_queues; q++) {
+ rc = rte_eth_rx_queue_setup(port_id, q, nb_rxd,
+ socket_id, &rxq_conf, mp);
+ if (rc < 0)
+ rte_panic("rx queue=%u setup failed: %d", q, rc);
+
+ rc = setup_rx_cb(port_id, q);
+ if (rc < 0)
+ rte_panic("rx queue=%u rx setup failed: %d", q, rc);
+ }
+
+ for (q = 0; q < nb_queues; q++) {
+ rc = rte_eth_tx_queue_setup(port_id, q, nb_txd,
+ socket_id, &txq_conf);
+ if (rc < 0)
+ rte_panic("tx queue=%u setup failed: %d", q, rc);
+ }
+}
+
/* RX offloads we request from the device; masked at runtime in
 * port_reconfig() against the port's actual rx_offload_capa. */
uint64_t rx_offload =
	DEV_RX_OFFLOAD_IPV4_CKSUM |
	DEV_RX_OFFLOAD_UDP_CKSUM |
	DEV_RX_OFFLOAD_TCP_CKSUM;
/* nice to have:
	DEV_RX_OFFLOAD_CRC_STRIP |
	DEV_RX_OFFLOAD_TCP_LRO |
	DEV_RX_OFFLOAD_HEADER_SPLIT |
	DEV_RX_OFFLOAD_SCATTER |
	DEV_RX_OFFLOAD_TIMESTAMP
*/

/* TX offloads; likewise masked against the port's tx_offload_capa. */
uint64_t tx_offload =
	DEV_TX_OFFLOAD_UDP_CKSUM |
	DEV_TX_OFFLOAD_TCP_CKSUM |
	DEV_TX_OFFLOAD_TCP_TSO |
	DEV_TX_OFFLOAD_MULTI_SEGS;
+
/* Wake every rx queue of @port_id that is blocked in interrupt mode by
 * writing to its rx-interrupt eventfd directly.
 * Returns 0 on success, -ENODEV/-ENOTSUP/-EPERM if the port has no usable
 * interrupt vector, or -errno if the eventfd write fails.
 * NOTE(review): pokes rte_eth_devices/intr_handle internals rather than a
 * public API -- tied to the DPDK version this tree patches (18.11).
 */
int
dev_rxq_wakeup(uint16_t port_id)
{
	int fd;
	uint16_t qid;
	uint32_t vec, efd_idx;
	struct rte_eth_dev *dev;
	struct rte_intr_handle *intr_handle;

	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV);

	dev = &rte_eth_devices[port_id];
	intr_handle = dev->intr_handle;
	if (!intr_handle)
		return -ENOTSUP;
	if (!intr_handle->intr_vec)
		return -EPERM;

	for (qid = 0; qid < dev->data->nb_rx_queues; qid++) {
		/* Map the queue's interrupt vector back to its eventfd. */
		vec = intr_handle->intr_vec[qid];
		efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ?
			(vec - RTE_INTR_VEC_RXTX_OFFSET) : vec;
		fd = intr_handle->efds[efd_idx];
		if (eventfd_write(fd, (eventfd_t) 1) < 0)
			return -errno;
	}

	return 0;
}
+
/* (Re)configure port 0 with one queue pair per glue context: stop the
 * port, renegotiate offloads against device capabilities, set up queues,
 * refresh RSS (multi-queue only) and restart.  Called whenever nb_ctx
 * changes.  Panics on any configuration failure.
 */
void
port_reconfig(void)
{
	int32_t rc;
	struct rte_eth_dev_info dev_info;
	uint16_t port_id = 0; /* We use and only use port 0 */
	uint16_t nb_port;
	uint16_t nb_queues = nb_ctx;

	/* Keep rx interrupts enabled so idle queues can sleep. */
	struct rte_eth_conf port_conf = {
		.intr_conf = {
			.rxq = 1,
		},
	};

	/* 0. dev number check */
	nb_port = rte_eth_dev_count_avail();
	if (nb_port < 1 || nb_port >2)
		rte_panic("One port is mandatory with an optional loopback device\n");

	/* Signal pollers to drain out before the device goes down. */
	stopped = 1;
	rte_wmb();
	/* wake up all rxqs */
	if (nb_ctx > 1)
		dev_rxq_wakeup(port_id);

	usleep(1); /* fix me: this cannot gurantee correctness */

	rte_eth_dev_stop(port_id);

	/* 1. offloading check and set*/
	rte_eth_dev_info_get(port_id, &dev_info);
	rx_offload &= dev_info.rx_offload_capa;
	port_conf.rxmode.offloads = rx_offload;
	tx_offload &= dev_info.tx_offload_capa;
	port_conf.txmode.offloads = tx_offload;

	GLUE_LOG(INFO, "configure queues = %d, offloads: rx = %"PRIx64", tx = %"PRIx64,
		 nb_queues, rx_offload, tx_offload);

	/* 2. dev configure */
	rc = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
	if (rc != 0)
		rte_panic("Failed to configure device, %d", rc);

	/* 3. queue setup */
	queue_init(port_id, nb_queues, &dev_info, &port_conf);

	/* 4. rss conf */
	if (nb_queues > 1)
		update_rss_conf(port_id);

	/* 5. dev start */
	if (rte_eth_dev_start(port_id) < 0)
		rte_panic("Failed to start device");

	stopped = 0;
}
+
/* Create (once) a loopback ethdev backed by a single SP/SC ring on
 * @socket_id and return its port id.  Subsequent calls return the cached
 * id regardless of @socket_id.  Panics on failure.
 * NOTE(review): the static cache makes this single-init only and not
 * thread-safe -- presumably called once from setup; verify.
 */
uint16_t
create_loopback(uint32_t socket_id)
{
	int ret;
	struct rte_ring* lb_queue;
	static uint16_t lb_port_id = 0xFFFF;	/* 0xFFFF == not created yet */
	const char *ring_name = "loopback-ring";

	if (lb_port_id != 0xFFFF)
		return lb_port_id;

	lb_queue = rte_ring_create(ring_name, MAX_PKTS_BURST * 8, socket_id,
				   RING_F_SP_ENQ | RING_F_SC_DEQ);
	if (!lb_queue)
		rte_panic("Failed to create ring for loopback\n");
	ret = rte_eth_from_ring(lb_queue);
	if (ret < 0)
		rte_panic("Failed to create ethdev from ring\n");
	lb_port_id = ret;

	if (setup_rx_cb(lb_port_id, 0) < 0)
		rte_panic("Failed to set up rx cb for loopback\n");

	return lb_port_id;
}
diff --git a/lib/libtle_glue/rxcb.c b/lib/libtle_glue/rxcb.c
new file mode 100644
index 0000000..51f31c9
--- /dev/null
+++ b/lib/libtle_glue/rxcb.c
@@ -0,0 +1,834 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <rte_ethdev.h>
+#include <rte_arp.h>
+#include <rte_ip.h>
+#include <rte_tcp.h>
+#include <rte_udp.h>
+
+#include <netinet/in.h>
+#include <netinet/ip6.h>
+
+#include "log.h"
+#include "ctx.h"
+#include "internal.h"
+
/*
 * Table entry mapping a required set of HW packet-type capabilities
 * (a mask of the *_PTYPE bits below) to the rx callback that handles
 * that capability level. Used by setup_rx_cb().
 */
struct ptype2cb {
	uint32_t mask;		/* required *_PTYPE capability bits */
	const char *name;	/* human-readable description for logging */
	rte_rx_callback_fn fn;	/* rx callback to install on the queue */
};
+
/*
 * Bitmask of packet-type classes a port's HW parser can recognize.
 * Built by get_ptypes() from rte_eth_dev_get_supported_ptypes() and
 * matched against ptype2cb.mask when selecting an rx callback.
 */
enum {
	ETHER_ARP_PTYPE = 0x1,
	IPV4_PTYPE = 0x2,
	IPV4_EXT_PTYPE = 0x4,
	IPV6_PTYPE = 0x8,
	IPV6_EXT_PTYPE = 0x10,
	TCP_PTYPE = 0x20,
	UDP_PTYPE = 0x40,
	ICMP_PTYPE = 0x80,
};
+
/*
 * Pack header lengths into the rte_mbuf tx_offload bit layout:
 * inner l2 (bits 0-6) | inner l3 (7-15) | inner l4 (16-23) |
 * tso segsz (24-39) | outer l3 (40-48) | outer l2 (49-55).
 */
static inline uint64_t
_mbuf_tx_offload(uint64_t il2, uint64_t il3, uint64_t il4, uint64_t tso,
		 uint64_t ol3, uint64_t ol2)
{
	uint64_t ofl;

	ofl = il2;
	ofl |= il3 << 7;
	ofl |= il4 << 16;
	ofl |= tso << 24;
	ofl |= ol3 << 40;
	ofl |= ol2 << 49;

	return ofl;
}
+
+static inline int32_t
+fill_pkt_hdr_len(struct rte_mbuf *m, uint32_t l2, uint32_t l3, uint32_t l4)
+{
+ if (l2 + l3 + l4 > m->pkt_len)
+ return -1;
+ m->tx_offload = _mbuf_tx_offload(l2, l3, l4, 0, 0, 0);
+ return 0;
+}
+
+static inline int
+is_ipv4_frag(const struct ipv4_hdr *iph)
+{
+ const uint16_t mask = rte_cpu_to_be_16(~IPV4_HDR_DF_FLAG);
+
+ return ((mask & iph->fragment_offset) != 0);
+}
+
+static inline uint32_t
+get_tcp_header_size(struct rte_mbuf *m, uint32_t l2_len, uint32_t l3_len)
+{
+ const struct tcp_hdr *tcp;
+
+ tcp = rte_pktmbuf_mtod_offset(m, struct tcp_hdr *, l2_len + l3_len);
+ return (tcp->data_off >> 4) * 4;
+}
+
+static inline int32_t
+adjust_ipv4_pktlen(struct rte_mbuf *m, uint32_t l2_len)
+{
+ uint32_t plen, trim;
+ const struct ipv4_hdr *iph;
+
+ iph = rte_pktmbuf_mtod_offset(m, const struct ipv4_hdr *, l2_len);
+ plen = rte_be_to_cpu_16(iph->total_length) + l2_len;
+ if (plen < m->pkt_len) {
+ trim = m->pkt_len - plen;
+ rte_pktmbuf_trim(m, trim);
+ } else if (plen > m->pkt_len)
+ return -1;
+
+ return 0;
+}
+
+static inline int32_t
+adjust_ipv6_pktlen(struct rte_mbuf *m, uint32_t l2_len)
+{
+ uint32_t plen, trim;
+ const struct ipv6_hdr *iph;
+
+ iph = rte_pktmbuf_mtod_offset(m, const struct ipv6_hdr *, l2_len);
+ plen = rte_be_to_cpu_16(iph->payload_len) + sizeof(*iph) + l2_len;
+ if (plen < m->pkt_len) {
+ trim = m->pkt_len - plen;
+ rte_pktmbuf_trim(m, trim);
+ } else if (plen > m->pkt_len)
+ return -1;
+
+ return 0;
+}
+
/*
 * Return the IPv4 header length (IHL) in bytes for the header at L2
 * offset @l2.
 *
 * @proto: expected next_proto_id; pass a value > IPPROTO_MAX to skip
 *         the protocol check.
 * @frag:  when non-zero, mark the mbuf RTE_PTYPE_L4_FRAG if the header
 *         indicates a fragment.
 *
 * On a header that runs past the first segment, or a protocol mismatch,
 * the mbuf's packet_type is reset to RTE_PTYPE_UNKNOWN; the (possibly
 * bogus) length is still returned, so callers must check packet_type.
 */
static inline uint32_t
get_ipv4_hdr_len(struct rte_mbuf *m, uint32_t l2, uint32_t proto, uint32_t frag)
{
	const struct ipv4_hdr *iph;
	int32_t dlen, len;

	/* bytes available in the first segment, past the L2 header */
	dlen = rte_pktmbuf_data_len(m);
	dlen -= l2;

	iph = rte_pktmbuf_mtod_offset(m, const struct ipv4_hdr *, l2);
	/* IHL field counts 4-byte words */
	len = (iph->version_ihl & IPV4_HDR_IHL_MASK) * IPV4_IHL_MULTIPLIER;

	if (frag != 0 && is_ipv4_frag(iph)) {
		m->packet_type &= ~RTE_PTYPE_L4_MASK;
		m->packet_type |= RTE_PTYPE_L4_FRAG;
	}

	/* invalid header length, or unexpected L4 protocol */
	if (len > dlen || (proto <= IPPROTO_MAX && iph->next_proto_id != proto))
		m->packet_type = RTE_PTYPE_UNKNOWN;

	return len;
}
+
/*
 * Walk the IPv6 extension-header chain starting at L2 offset @l2 and
 * return the total L3 header length (fixed header + extensions).
 *
 * @fproto: in/out. On input, 0 means "report the first L4 protocol
 *          found" (written back through the pointer); non-zero means
 *          the caller expects that protocol. On an unrecognized chain
 *          (*fproto still 0) or a length overrun, the mbuf's
 *          packet_type is reset to RTE_PTYPE_UNKNOWN.
 *
 * Fragment headers additionally mark the mbuf RTE_PTYPE_L4_FRAG and
 * stash the fragment-header offset in tso_segsz (unused on RX) for
 * process_ipv6_frag() to pick up.
 */
static inline uint32_t
get_ipv6x_hdr_len(struct rte_mbuf *m, uint32_t l2, uint32_t *fproto)
{
	const struct ipv6_hdr *ip6h;
	const struct ip6_ext *ipx;
	uint32_t nproto;
	int32_t dlen, len, ofs;

	ip6h = rte_pktmbuf_mtod_offset(m, struct ipv6_hdr*, l2);
	nproto = ip6h->proto;
	len = sizeof(struct ipv6_hdr);

	/* bytes available in the first segment, past the L2 header */
	dlen = rte_pktmbuf_data_len(m);
	dlen -= l2;

	/* ipx points at the first extension header (if any) */
	ofs = l2 + len;
	ipx = rte_pktmbuf_mtod_offset(m, const struct ip6_ext *, ofs);

	/* ofs > 0 keeps iterating while the previous header had a
	 * recognized extension; it is non-zero on loop entry by
	 * construction (l2 + fixed header length). */
	while (ofs > 0 && len < dlen) {
		switch (nproto) {
		case IPPROTO_HOPOPTS:
		case IPPROTO_ROUTING:
		case IPPROTO_DSTOPTS:
			/* ip6e_len counts 8-byte units, excl. first 8 */
			ofs = (ipx->ip6e_len + 1) << 3;
			break;
		case IPPROTO_AH:
			/* AH length counts 4-byte units, excl. first 8 */
			ofs = (ipx->ip6e_len + 2) << 2;
			break;
		case IPPROTO_FRAGMENT:
			/*
			 * tso_segsz is not used by RX, so use it as temporary
			 * buffer to store the fragment offset.
			 */
			m->tso_segsz = l2 + len;
			ofs = sizeof(struct ip6_frag);
			m->packet_type &= ~RTE_PTYPE_L4_MASK;
			m->packet_type |= RTE_PTYPE_L4_FRAG;
			break;
		case IPPROTO_TCP:
		case IPPROTO_UDP:
		case IPPROTO_ICMPV6:
			/* reached L4: stop and report it if requested */
			ofs = 0;
			if (*fproto == 0)
				*fproto = nproto;
			break;
		default:
			/* unknown header: stop walking */
			ofs = 0;
		}

		if (ofs > 0) {
			nproto = ipx->ip6e_nxt;
			len += ofs;
			/* advance by ofs bytes (sizeof(*ipx) == 2) */
			ipx += ofs / sizeof(*ipx);
		}
	}

	/* unrecognized or invalid packet. */
	if (*fproto == 0 || len > dlen)
		m->packet_type = RTE_PTYPE_UNKNOWN;

	return len;
}
+
+static inline uint32_t
+get_ipv6_hdr_len(struct rte_mbuf *m, uint32_t l2, uint32_t fproto)
+{
+ const struct ipv6_hdr *iph;
+
+ iph = rte_pktmbuf_mtod_offset(m, const struct ipv6_hdr *,
+ sizeof(struct ether_hdr));
+
+ if (iph->proto == fproto)
+ return sizeof(struct ipv6_hdr);
+ else
+ return get_ipv6x_hdr_len(m, l2, &fproto);
+}
+
/*
 * Feed an IPv4 fragment into the per-context reassembly table.
 * Returns NULL while the datagram is incomplete (the fragment is kept
 * by the table), or the reassembled packet, whose L4 packet_type is
 * re-derived from the now-complete header. The caller must not touch
 * @m after a NULL return.
 */
static inline struct rte_mbuf*
process_ipv4_frag(struct rte_mbuf *m, struct glue_ctx *ctx,
		  uint32_t l2_len, uint32_t l3_len)
{
	struct ipv4_hdr* iph;

	m->l2_len = l2_len;
	m->l3_len = l3_len;
	/* fixme: ip checksum should be checked here.
	 * After reassemble, the ip checksum would be invalid.
	 */
	m = rte_ipv4_frag_reassemble_packet(ctx->frag_tbl,
		&ctx->frag_dr, m, rte_rdtsc(),
		rte_pktmbuf_mtod_offset(m, struct ipv4_hdr*, m->l2_len));
	/* free up to 3 stale entries from the death row each pass */
	rte_ip_frag_free_death_row(&ctx->frag_dr, 3);
	if (m == NULL)
		return NULL;
	/* reassembly done: restore the real L4 type on the merged mbuf */
	iph = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr*, m->l2_len);
	switch (iph->next_proto_id) {
	case IPPROTO_TCP:
		m->packet_type &= ~RTE_PTYPE_L4_MASK;
		m->packet_type |= RTE_PTYPE_L4_TCP;
		break;
	case IPPROTO_UDP:
		m->packet_type &= ~RTE_PTYPE_L4_MASK;
		m->packet_type |= RTE_PTYPE_L4_UDP;
		break;
	}
	return m;
}
+
/*
 * IPv6 counterpart of process_ipv4_frag(): feed a fragment into the
 * reassembly table. The fragment-extension-header offset was stashed
 * in tso_segsz by get_ipv6x_hdr_len(). Returns NULL while incomplete,
 * or the reassembled packet with its L4 packet_type re-derived.
 */
static inline struct rte_mbuf*
process_ipv6_frag(struct rte_mbuf *m, struct glue_ctx *ctx,
		  uint32_t l2_len, uint32_t l3_len)
{
	struct ipv6_hdr* ip6h;

	m->l2_len = l2_len;
	m->l3_len = l3_len;
	m = rte_ipv6_frag_reassemble_packet(ctx->frag_tbl,
		&ctx->frag_dr, m, rte_rdtsc(),
		rte_pktmbuf_mtod_offset(m, struct ipv6_hdr*, l2_len),
		rte_pktmbuf_mtod_offset(m, struct ipv6_extension_fragment*,
			m->tso_segsz));
	/* free up to 3 stale entries from the death row each pass */
	rte_ip_frag_free_death_row(&ctx->frag_dr, 3);
	if (m == NULL)
		return NULL;
	/* reassembly done: restore the real L4 type on the merged mbuf */
	ip6h = rte_pktmbuf_mtod_offset(m, struct ipv6_hdr*, m->l2_len);
	switch (ip6h->proto) {
	case IPPROTO_TCP:
		m->packet_type &= ~RTE_PTYPE_L4_MASK;
		m->packet_type |= RTE_PTYPE_L4_TCP;
		break;
	case IPPROTO_UDP:
		m->packet_type &= ~RTE_PTYPE_L4_MASK;
		m->packet_type |= RTE_PTYPE_L4_UDP;
		break;
	}
	return m;
}
+
/*
 * Software packet classifier for HW that reports no ptypes: parse L2
 * (incl. VLAN tags), dispatch ARP/ICMP to their handlers, reassemble
 * fragments, and fill packet_type plus the l2/l3/l4 header lengths.
 *
 * Returns the (possibly reassembled) mbuf to keep, or NULL when the
 * packet was consumed (ARP/ICMP), buffered for reassembly, or dropped.
 */
static inline struct rte_mbuf *
fill_ptypes_and_hdr_len(struct glue_ctx *ctx, struct rte_mbuf *m)
{
	uint32_t dlen, l2_len, l3_len, l4_len, proto;
	const struct ether_hdr *eth;
	uint32_t ptypes;
	uint16_t etp;
	int32_t error = 0;

	dlen = rte_pktmbuf_data_len(m);

	/* L2 */
	l2_len = sizeof(*eth);

	eth = rte_pktmbuf_mtod(m, const struct ether_hdr *);
	etp = eth->ether_type;
	/* skip any number of stacked VLAN tags */
	while (etp == rte_be_to_cpu_16(ETHER_TYPE_VLAN)) {
		etp = rte_pktmbuf_mtod_offset(m, struct vlan_hdr*, l2_len)->eth_proto;
		l2_len += sizeof(struct vlan_hdr);
	}

	/* ARP is consumed here, never returned to the caller */
	if (etp == rte_be_to_cpu_16(ETHER_TYPE_ARP))
		return arp_recv(ctx, m, l2_len);

	if (etp == rte_be_to_cpu_16(ETHER_TYPE_IPv4)) {
		const struct ipv4_hdr *hdr;

		/* L3 */
		hdr = rte_pktmbuf_mtod_offset(m, const struct ipv4_hdr *, l2_len);
		error = adjust_ipv4_pktlen(m, l2_len);
		if (error) {
			rte_pktmbuf_free(m);
			return NULL;
		}
		/* IPPROTO_MAX + 1 skips the protocol check; also flags frags */
		l3_len = get_ipv4_hdr_len(m, l2_len, IPPROTO_MAX + 1, 1);

		if ((m->packet_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_FRAG) {
			m = process_ipv4_frag(m, ctx, l2_len, l3_len);
			if (m == NULL)
				return NULL;
			/* m may be a new (reassembled) mbuf: re-read hdr */
			hdr = rte_pktmbuf_mtod_offset(m, const struct ipv4_hdr*,
						      m->l2_len);
			l3_len = get_ipv4_hdr_len(m, m->l2_len,
						  IPPROTO_MAX + 1, 0);
		}

		/* L4 */
		switch (hdr->next_proto_id) {
		case IPPROTO_ICMP:
			/* consumed by the ICMP handler */
			return icmp_recv(ctx, m, l2_len, l3_len);
		case IPPROTO_TCP:
			ptypes = RTE_PTYPE_L4_TCP |
				 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
				 RTE_PTYPE_L2_ETHER;
			l4_len = get_tcp_header_size(m, l2_len, l3_len);
			break;
		case IPPROTO_UDP:
			ptypes = RTE_PTYPE_L4_UDP |
				 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
				 RTE_PTYPE_L2_ETHER;
			l4_len = sizeof(struct udp_hdr);
			break;
		default:
			GLUE_LOG(ERR, "drop ipv4 pkt of unknow L4: (%d)",
				 hdr->next_proto_id);
			rte_pktmbuf_free(m);
			return NULL;
		}

	} else if (etp == rte_be_to_cpu_16(ETHER_TYPE_IPv6) &&
		   dlen >= l2_len + sizeof(struct ipv6_hdr) + sizeof(struct udp_hdr)) {
		/* L3 */
		error = adjust_ipv6_pktlen(m, l2_len);
		if (error) {
			rte_pktmbuf_free(m);
			return NULL;
		}
		/* proto==0 asks get_ipv6x_hdr_len to report the L4 proto */
		proto = 0;
		l3_len = get_ipv6x_hdr_len(m, l2_len, &proto);

		if ((m->packet_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_FRAG) {
			m = process_ipv6_frag(m, ctx, l2_len, l3_len);
			if (m == NULL)
				return NULL;
			l3_len = get_ipv6x_hdr_len(m, m->l2_len, &proto);
		}

		/* L4 */
		switch (proto) {
		case IPPROTO_TCP:
			ptypes = RTE_PTYPE_L4_TCP |
				 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
				 RTE_PTYPE_L2_ETHER;
			l4_len = get_tcp_header_size(m, l2_len, l3_len);
			break;
		case IPPROTO_UDP:
			ptypes = RTE_PTYPE_L4_UDP |
				 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
				 RTE_PTYPE_L2_ETHER;
			l4_len = sizeof(struct udp_hdr);
			break;
		case IPPROTO_ICMPV6:
			/* consumed by the ICMPv6 handler */
			return icmp6_recv(ctx, m, l2_len, l3_len);
		default:
			GLUE_DEBUG("drop ipv6 pkt of unknown L4: (%x)", proto);
			rte_pktmbuf_free(m);
			return NULL;
		}
	} else {
		GLUE_DEBUG("Drop unknown L3 packet: %x", etp);
		rte_pktmbuf_free(m);
		return NULL;
	}

	m->packet_type = ptypes;
	error = fill_pkt_hdr_len(m, l2_len, l3_len, l4_len);
	if (error) {
		rte_pktmbuf_free(m);
		return NULL;
	}

	return m;
}
+
/* exclude NULLs from the final list of packets.
 *
 * Scans backwards from the tail; each run ("hole") of consecutive NULL
 * entries is closed by shifting the tail of the array down over it.
 * Returns the new packet count. nb_zero is the total number of NULL
 * entries, used to stop early once all holes are filled.
 *
 * NOTE(review): when a hole reaches index 0 the inner scan leaves
 * i == UINT32_MAX (unsigned wrap of i-- at 0); the arithmetic below
 * (l = i + 1 == 0, nb_pkt -= j - i) still comes out right only
 * because of that wraparound — be careful when modifying.
 */
static inline uint32_t
compress_pkt_list(struct rte_mbuf *pkt[], uint32_t nb_pkt, uint32_t nb_zero)
{
	uint32_t i, j, k, l;

	for (j = nb_pkt; nb_zero != 0 && j-- != 0; ) {

		/* found a hole. */
		if (pkt[j] == NULL) {

			/* find how big is it. */
			for (i = j; i-- != 0 && pkt[i] == NULL; )
				;
			/* fill the hole. */
			for (k = j + 1, l = i + 1; k != nb_pkt; k++, l++)
				pkt[l] = pkt[k];

			nb_pkt -= j - i;
			nb_zero -= j - i;
			j = i + 1;
		}
	}

	return nb_pkt;
}
+
/*
 * For packets whose ptype the HW already classified (@tp), reassemble
 * fragments if needed, validate the IP length, and fill the l2/l3/l4
 * header lengths in the mbuf.
 *
 * Returns the (possibly reassembled) mbuf, or NULL when the packet was
 * buffered for reassembly, malformed, or of an unsupported type.
 * Assumes a plain Ethernet L2 header (no VLAN) — HW-parsed ptypes only.
 */
static inline struct rte_mbuf *
common_fill_hdr_len(struct rte_mbuf *m, uint32_t tp, struct glue_ctx *ctx)
{
	uint32_t l4_len, l3_len, l2_len = sizeof(struct ether_hdr);
	int32_t error = 0;

	/* First pass: resolve fragments; tp is refreshed afterwards so
	 * the second switch sees the reassembled packet's real L4 type. */
	switch (tp) {
	/* possibly fragmented packets. */
	case (RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L2_ETHER):
	case (RTE_PTYPE_L3_IPV4_EXT | RTE_PTYPE_L2_ETHER):
		l3_len = get_ipv4_hdr_len(m, l2_len, IPPROTO_MAX + 1, 1);
		if ((m->packet_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_FRAG) {
			m = process_ipv4_frag(m, ctx, l2_len, l3_len);
			if (m == NULL)
				return NULL;
			tp = m->packet_type & (RTE_PTYPE_L2_MASK |
					       RTE_PTYPE_L3_MASK |
					       RTE_PTYPE_L4_MASK);
		}
		break;
	case (RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L2_ETHER):
	case (RTE_PTYPE_L3_IPV6_EXT | RTE_PTYPE_L2_ETHER):
		l3_len = get_ipv6_hdr_len(m, l2_len, IPPROTO_MAX + 1);
		if ((m->packet_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_FRAG) {
			m = process_ipv6_frag(m, ctx, l2_len, l3_len);
			if (m == NULL)
				return NULL;
			tp = m->packet_type & (RTE_PTYPE_L2_MASK |
					       RTE_PTYPE_L3_MASK |
					       RTE_PTYPE_L4_MASK);
		}
		break;
	}

	/* Second pass: compute l3/l4 lengths per final ptype and
	 * validate/trim the packet length. */
	switch (tp) {
	/* non fragmented tcp packets. */
	case (RTE_PTYPE_L4_TCP | RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L2_ETHER):
		l3_len = sizeof(struct ipv4_hdr);
		l4_len = get_tcp_header_size(m, l2_len, l3_len);
		error = adjust_ipv4_pktlen(m, l2_len);
		break;
	case (RTE_PTYPE_L4_TCP | RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L2_ETHER):
		l3_len = sizeof(struct ipv6_hdr);
		l4_len = get_tcp_header_size(m, l2_len, l3_len);
		error = adjust_ipv6_pktlen(m, l2_len);
		break;
	case (RTE_PTYPE_L4_TCP | RTE_PTYPE_L3_IPV4_EXT | RTE_PTYPE_L2_ETHER):
		l3_len = get_ipv4_hdr_len(m, l2_len,
					  IPPROTO_TCP, 0);
		l4_len = get_tcp_header_size(m, l2_len, l3_len);
		error = adjust_ipv4_pktlen(m, l2_len);
		break;
	case (RTE_PTYPE_L4_TCP | RTE_PTYPE_L3_IPV6_EXT | RTE_PTYPE_L2_ETHER):
		l3_len = get_ipv6_hdr_len(m, l2_len, IPPROTO_TCP);
		l4_len = get_tcp_header_size(m, l2_len, l3_len);
		error = adjust_ipv6_pktlen(m, l2_len);
		break;

	/* non fragmented udp packets. */
	case (RTE_PTYPE_L4_UDP | RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L2_ETHER):
		l3_len = sizeof(struct ipv4_hdr);
		l4_len = sizeof(struct udp_hdr);
		error = adjust_ipv4_pktlen(m, l2_len);
		break;
	case (RTE_PTYPE_L4_UDP | RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L2_ETHER):
		l3_len = sizeof(struct ipv6_hdr);
		l4_len = sizeof(struct udp_hdr);
		error = adjust_ipv6_pktlen(m, l2_len);
		break;
	case (RTE_PTYPE_L4_UDP | RTE_PTYPE_L3_IPV4_EXT | RTE_PTYPE_L2_ETHER):
		l3_len = get_ipv4_hdr_len(m, l2_len,
					  IPPROTO_UDP, 0);
		l4_len = sizeof(struct udp_hdr);
		error = adjust_ipv4_pktlen(m, l2_len);
		break;
	case (RTE_PTYPE_L4_UDP | RTE_PTYPE_L3_IPV6_EXT | RTE_PTYPE_L2_ETHER):
		l3_len = get_ipv6_hdr_len(m, l2_len, IPPROTO_UDP);
		l4_len = sizeof(struct udp_hdr);
		error = adjust_ipv6_pktlen(m, l2_len);
		break;
	default:
		GLUE_LOG(ERR, "drop unknown pkt");
		rte_pktmbuf_free(m);
		return NULL;
	}

	if (error) {
		rte_pktmbuf_free(m);
		return NULL;
	}
	error = fill_pkt_hdr_len(m, l2_len, l3_len, l4_len);
	if (error) {
		rte_pktmbuf_free(m);
		return NULL;
	}
	return m;
}
+
+
+/*
+ * HW can recognize L2-arp/L3 with/without extensions/L4 (i40e)
+ */
/*
 * RX callback for NICs that classify ARP, IP (with/without extensions)
 * and L4 in hardware (e.g. i40e). ARP and ICMP packets are consumed by
 * their handlers; everything else gets header lengths filled by
 * common_fill_hdr_len(). Consumed/dropped slots are NULLed and then
 * squeezed out, so the returned count indexes a compact array.
 */
static uint16_t
type0_rx_callback(uint16_t port,
		  uint16_t queue,
		  struct rte_mbuf *pkt[],
		  uint16_t nb_pkts,
		  uint16_t max_pkts,
		  void *user_param)
{
	uint32_t j, tp, l2_len, l3_len;
	struct glue_ctx *ctx;
	uint16_t nb_zero = 0;

	RTE_SET_USED(port);
	RTE_SET_USED(queue);
	RTE_SET_USED(max_pkts);

	ctx = user_param;

	for (j = 0; j != nb_pkts; j++) {
		tp = pkt[j]->packet_type & (RTE_PTYPE_L4_MASK |
			RTE_PTYPE_L3_MASK | RTE_PTYPE_L2_MASK);

		switch (tp) {
		case (RTE_PTYPE_L2_ETHER_ARP):
			/* handled (and freed) by the ARP layer */
			arp_recv(ctx, pkt[j], sizeof(struct ether_hdr));
			pkt[j] = NULL;
			nb_zero++;
			break;
		case (RTE_PTYPE_L4_ICMP | RTE_PTYPE_L3_IPV4 |
		      RTE_PTYPE_L2_ETHER):
		case (RTE_PTYPE_L4_ICMP | RTE_PTYPE_L3_IPV4_EXT |
		      RTE_PTYPE_L2_ETHER):
			l2_len = sizeof(struct ether_hdr);
			l3_len = get_ipv4_hdr_len(pkt[j], l2_len, IPPROTO_ICMP, 0);
			icmp_recv(ctx, pkt[j], l2_len, l3_len);
			pkt[j] = NULL;
			nb_zero++;
			break;
		case (RTE_PTYPE_L4_ICMP | RTE_PTYPE_L3_IPV6 |
		      RTE_PTYPE_L2_ETHER):
		case (RTE_PTYPE_L4_ICMP | RTE_PTYPE_L3_IPV6_EXT |
		      RTE_PTYPE_L2_ETHER):
			l2_len = sizeof(struct ether_hdr);
			l3_len = get_ipv6_hdr_len(pkt[j], l2_len, IPPROTO_ICMPV6);
			icmp6_recv(ctx, pkt[j], l2_len, l3_len);
			pkt[j] = NULL;
			nb_zero++;
			break;
		default:
			/* TCP/UDP (and frags): fill header lengths */
			if (common_fill_hdr_len(pkt[j], tp, ctx) == NULL) {
				pkt[j] = NULL;
				nb_zero++;
			}
			break;
		}
	}

	if (nb_zero == 0)
		return nb_pkts;

	return compress_pkt_list(pkt, nb_pkts, nb_zero);
}
+
+/*
+ * HW can recognize L2/L3/L4 and fragments; but cannot recognize ARP
+ * nor ICMP (ixgbe).
+ */
/*
 * RX callback for NICs that classify L3/L4 in hardware but report
 * neither ARP nor ICMP ptypes (e.g. ixgbe). Packets the HW left as
 * plain L2_ETHER are checked for ARP in software; IP packets without
 * an L4 ptype are checked for ICMP. Everything else goes through
 * common_fill_hdr_len(). Consumed/dropped slots are NULLed and
 * squeezed out before returning.
 */
static uint16_t
type1_rx_callback(uint16_t port,
		  uint16_t queue,
		  struct rte_mbuf *pkt[],
		  uint16_t nb_pkts,
		  uint16_t max_pkts,
		  void *user_param)
{
	uint32_t j, tp, l2_len, l3_len;
	struct glue_ctx *ctx;
	uint16_t nb_zero = 0;
	const struct ether_hdr *eth;
	const struct ipv4_hdr *ip4;
	const struct ipv6_hdr *ip6;
	uint16_t etp;

	RTE_SET_USED(port);
	RTE_SET_USED(queue);
	RTE_SET_USED(max_pkts);

	ctx = user_param;

	for (j = 0; j != nb_pkts; j++) {
		tp = pkt[j]->packet_type & (RTE_PTYPE_L4_MASK | RTE_PTYPE_L3_MASK |
			RTE_PTYPE_L2_MASK);

		switch (tp) {
		case RTE_PTYPE_L2_ETHER:
			/* HW could not classify: only ARP is of interest;
			 * anything else on this path is discarded. */
			eth = rte_pktmbuf_mtod(pkt[j], const struct ether_hdr *);
			etp = eth->ether_type;
			if (etp == rte_be_to_cpu_16(ETHER_TYPE_ARP))
				arp_recv(ctx, pkt[j], sizeof(*eth));
			pkt[j] = NULL;
			nb_zero++;
			break;
		case (RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L2_ETHER):
		case (RTE_PTYPE_L3_IPV4_EXT | RTE_PTYPE_L2_ETHER):
			/* IPv4 without an L4 ptype: ICMP or drop */
			ip4 = rte_pktmbuf_mtod_offset(pkt[j],
						      const struct ipv4_hdr *,
						      sizeof(*eth));
			if (ip4->next_proto_id == IPPROTO_ICMP) {
				l2_len = sizeof(struct ether_hdr);
				l3_len = get_ipv4_hdr_len(pkt[j], l2_len,
							  IPPROTO_ICMP, 0);
				icmp_recv(ctx, pkt[j], l2_len, l3_len);
			} else
				rte_pktmbuf_free(pkt[j]);

			pkt[j] = NULL;
			nb_zero++;
			break;
		case (RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L2_ETHER):
		case (RTE_PTYPE_L3_IPV6_EXT | RTE_PTYPE_L2_ETHER):
			/* IPv6 without an L4 ptype: ICMPv6 or drop */
			ip6 = rte_pktmbuf_mtod_offset(pkt[j],
						      const struct ipv6_hdr *,
						      sizeof(*eth));
			if (ip6->proto == IPPROTO_ICMPV6) {
				l2_len = sizeof(struct ether_hdr);
				l3_len = get_ipv6_hdr_len(pkt[j], l2_len,
							  IPPROTO_ICMPV6);
				icmp6_recv(ctx, pkt[j], l2_len, l3_len);
			} else
				rte_pktmbuf_free(pkt[j]);

			pkt[j] = NULL;
			nb_zero++;
			break;
		default:
			if (common_fill_hdr_len(pkt[j], tp, ctx) == NULL) {
				pkt[j] = NULL;
				nb_zero++;
			}
			break;
		}
	}

	if (nb_zero == 0)
		return nb_pkts;

	return compress_pkt_list(pkt, nb_pkts, nb_zero);
}
+
+/*
+ * generic, assumes HW doesn't recognize any packet type.
+ */
+uint16_t
+typen_rx_callback(uint16_t port,
+ uint16_t queue,
+ struct rte_mbuf *pkt[],
+ uint16_t nb_pkts,
+ uint16_t max_pkts,
+ void *user_param)
+{
+ uint32_t j;
+ uint16_t nb_zero;
+ struct glue_ctx *ctx;
+
+ RTE_SET_USED(port);
+ RTE_SET_USED(queue);
+ RTE_SET_USED(max_pkts);
+
+ ctx = user_param;
+
+ nb_zero = 0;
+ for (j = 0; j != nb_pkts; j++) {
+ /* fix me: now we avoid checking ip checksum */
+ pkt[j]->ol_flags &= (~PKT_RX_IP_CKSUM_BAD);
+ pkt[j]->packet_type = 0;
+ pkt[j] = fill_ptypes_and_hdr_len(ctx, pkt[j]);
+ nb_zero += (pkt[j] == NULL);
+ }
+
+ if (nb_zero == 0)
+ return nb_pkts;
+
+ return compress_pkt_list(pkt, nb_pkts, nb_zero);
+}
+
+static uint32_t
+get_ptypes(uint16_t port_id)
+{
+ uint32_t smask;
+ int32_t i, rc;
+ const uint32_t pmask =
+ RTE_PTYPE_L2_MASK | RTE_PTYPE_L3_MASK | RTE_PTYPE_L4_MASK;
+
+ smask = 0;
+ rc = rte_eth_dev_get_supported_ptypes(port_id, pmask, NULL, 0);
+ if (rc < 0) {
+ RTE_LOG(ERR, USER1,
+ "%s(port=%u) failed to get supported ptypes;\n",
+ __func__, port_id);
+ return smask;
+ }
+
+ uint32_t ptype[rc];
+ rc = rte_eth_dev_get_supported_ptypes(port_id, pmask, ptype, rc);
+
+ for (i = 0; i != rc; i++) {
+ switch (ptype[i]) {
+ case RTE_PTYPE_L2_ETHER_ARP:
+ smask |= ETHER_ARP_PTYPE;
+ break;
+ case RTE_PTYPE_L3_IPV4:
+ case RTE_PTYPE_L3_IPV4_EXT_UNKNOWN:
+ smask |= IPV4_PTYPE;
+ break;
+ case RTE_PTYPE_L3_IPV4_EXT:
+ smask |= IPV4_EXT_PTYPE;
+ break;
+ case RTE_PTYPE_L3_IPV6:
+ case RTE_PTYPE_L3_IPV6_EXT_UNKNOWN:
+ smask |= IPV6_PTYPE;
+ break;
+ case RTE_PTYPE_L3_IPV6_EXT:
+ smask |= IPV6_EXT_PTYPE;
+ break;
+ case RTE_PTYPE_L4_TCP:
+ smask |= TCP_PTYPE;
+ break;
+ case RTE_PTYPE_L4_UDP:
+ smask |= UDP_PTYPE;
+ break;
+ case RTE_PTYPE_L4_ICMP:
+ smask |= ICMP_PTYPE;
+ break;
+ }
+ }
+
+ return smask;
+}
+
+/* In rx callbacks, we need to check and make sure below things are done,
+ * either by hw or by sw:
+ * 1. filter out arp packets, and handle arp packets properly
+ * - for arp request packet, reply arp if it's requesting myself.
+ * 2. fill l2, l3, l4 header length
+ *
+ * 3. GSO/GRO setup (TODO)
+ *
+ */
+int
+setup_rx_cb(uint16_t port_id, uint16_t qid)
+{
+ int32_t rc;
+ uint32_t i, n, smask;
+ const void *cb;
+ struct glue_ctx *ctx;
+ const struct ptype2cb *ptype2cb;
+
+ static const struct ptype2cb tcp_arp_ptype2cb[] = {
+ { /* i40e */
+ .mask = ETHER_ARP_PTYPE |
+ ICMP_PTYPE |
+ IPV4_PTYPE | IPV4_EXT_PTYPE |
+ IPV6_PTYPE | IPV6_EXT_PTYPE |
+ TCP_PTYPE | UDP_PTYPE,
+ .name = "HW l2-arp/l3x/l4-tcp ptype",
+ .fn = type0_rx_callback,
+ },
+ { /* ixgbe does not support ARP ptype */
+ .mask = IPV4_PTYPE | IPV4_EXT_PTYPE |
+ IPV6_PTYPE | IPV6_EXT_PTYPE |
+ TCP_PTYPE | UDP_PTYPE,
+ .name = "HW l3x/l4-tcp ptype",
+ .fn = type1_rx_callback,
+ },
+ { /* virtio */
+ .mask = 0,
+ .name = "HW does not support any ptype",
+ .fn = typen_rx_callback,
+ },
+ };
+
+ ctx = glue_ctx_lookup(port_id, qid);
+ if (ctx == NULL) {
+ GLUE_LOG(ERR, "no ctx fount by port(%d) and queue (%d)",
+ port_id, qid);
+ return -EINVAL;
+ }
+
+ smask = get_ptypes(port_id);
+
+ ptype2cb = tcp_arp_ptype2cb;
+ n = RTE_DIM(tcp_arp_ptype2cb);
+
+ for (i = 0; i != n; i++) {
+ if ((smask & ptype2cb[i].mask) == ptype2cb[i].mask) {
+ cb = rte_eth_add_rx_callback(port_id, qid,
+ ptype2cb[i].fn, ctx);
+ rc = -rte_errno;
+ GLUE_LOG(ERR, "%s(port=%u), setup RX callback \"%s\";",
+ __func__, port_id, ptype2cb[i].name);
+ return ((cb == NULL) ? rc : 0);
+ }
+ }
+
+ GLUE_LOG(ERR, "%s(port=%u) failed to find an appropriate callback",
+ __func__, port_id);
+ return -ENOENT;
+}
diff --git a/lib/libtle_glue/rxtx.c b/lib/libtle_glue/rxtx.c
new file mode 100644
index 0000000..b80a3ac
--- /dev/null
+++ b/lib/libtle_glue/rxtx.c
@@ -0,0 +1,573 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "sym.h"
+
+#include <rte_common.h>
+#include <rte_mbuf.h>
+#include <rte_ip.h>
+#include <rte_udp.h>
+#include <rte_atomic.h>
+
+#include <tle_tcp.h>
+
+#include <stddef.h>
+#include <fcntl.h>
+
+#include "tle_glue.h"
+#include "fd.h"
+#include "util.h"
+#include "internal.h"
+
+rte_atomic32_t thr_cnt;
+
+#define MAX_UDP_PKT_LEN ((2 << 16) - 1 - sizeof(struct ipv4_hdr) - sizeof(struct udp_hdr))
+
/*
 * Copy up to @len bytes from an mbuf chain into @buf.
 *
 * @ispeek:  non-zero leaves the chain untouched and returns the
 *           original head (MSG_PEEK semantics).
 * @needcpy: zero walks/consumes the chain without copying (used for
 *           MSG_TRUNC on TCP, where data is discarded).
 *
 * In consuming mode, fully-drained segments are freed and a partially
 * read segment is advanced with rte_pktmbuf_adj(); the return value is
 * the remaining chain (head re-fixed with updated nb_segs/pkt_len) or
 * NULL when everything was consumed.
 */
static inline struct rte_mbuf *
from_mbuf_to_buf(struct rte_mbuf *m, char *buf,
		 size_t len, int ispeek, int needcpy)
{
	void *src;
	uint32_t done = 0;
	uint32_t left = len, orig_pkt_len;
	uint16_t copy_len, seg_len, segs;
	struct rte_mbuf *m_next, *orig_pkt;

	if (len == 0)
		return m;

	/* remember the head for the peek case and length fixup */
	orig_pkt = m;
	orig_pkt_len = m->pkt_len;
	segs = m->nb_segs;

	do {
		seg_len = rte_pktmbuf_data_len(m);
		copy_len = RTE_MIN(seg_len, left);
		src = rte_pktmbuf_mtod(m, void *);
		if (needcpy)
			rte_memcpy(buf + done, src, copy_len);
		done += copy_len;
		left -= copy_len;
		if (copy_len < seg_len) {
			/* segment only partially consumed: shift its
			 * data start instead of freeing it */
			if (!ispeek)
				rte_pktmbuf_adj(m, copy_len);
			break;
		}
		m_next = m->next;
		if (!ispeek) {
			rte_pktmbuf_free_seg(m);
			segs--;
		}
		m = m_next;
	} while (left && m);

	/* the surviving chain head needs consistent metadata */
	if (m && !ispeek) {
		m->nb_segs = segs;
		m->pkt_len = orig_pkt_len - done;
	}

	if(ispeek)
		return orig_pkt;
	else
		return m;
}
+
+static inline bool
+is_peer_closed(struct sock *so)
+{
+ if (errno == EAGAIN && tle_event_state(&so->erev) == TLE_SEV_UP)
+ return true;
+
+ return false;
+}
+
/*
 * Core receive path shared by recv/recvfrom/read.
 *
 * Pulls packets from the user-space stack into @buf (UDP: one datagram;
 * TCP: fills @buf as far as available data allows). Honors MSG_PEEK and
 * MSG_TRUNC; in blocking mode drives the backend event loop until data,
 * shutdown, peer close, or a real error. A partially consumed mbuf is
 * parked in so->rx_left for the next call.
 *
 * Returns bytes received, 0 on EOF/peer close, or -1 with errno set.
 */
static ssize_t
_recv(int sockfd, void *buf, size_t len, struct sockaddr *src_addr, int flags)
{
	int rx;
	ssize_t rc;
	ssize_t recvlen;
	size_t tmplen;
	struct sock *so;
	struct rte_mbuf *m;
	struct epoll_event event;
	int needcpy;

	/* lazily assign this app thread a pseudo lcore id */
	if (RTE_PER_LCORE(_lcore_id) == LCORE_ID_ANY) {
		RTE_PER_LCORE(_lcore_id) = rte_atomic32_add_return(&thr_cnt, 1);
	}

	so = fd2sock(sockfd);

	if (so->s == NULL) {
		/* no stream yet: unbound/unconnected socket */
		if (IS_UDP(so) && is_nonblock(so, flags))
			errno = EAGAIN;
		else
			errno = ENOTCONN;
		return -1;
	}

	if (so->rx_left) {
		/* leftover data from a previous partially-read mbuf */
		m = so->rx_left;
		so->rx_left = NULL;
		if (src_addr) {
			OPS(so)->getname(so, src_addr, 1);
			/* fixme: cannot get addr for UDP in this way */
		}
	} else {
		rc = OPS(so)->recv(so->s, &m, 1, src_addr);
		if (rc == 0) {
			if (is_nonblock(so, flags)) {
				/* socket closed, return 0 */
				if (is_peer_closed(so)) {
					GLUE_DEBUG("peer closed: %d", sockfd);
					return 0;
				}

				/* According to linux stack,
				 * receive from shutdown tcp socket returns 0.
				 * And receive from shutdown udp socket generate
				 * EAGAIN. In special case, we return ESHUTDOWN
				 * to notify upper application.
				 */
				if (so->shutdown & RECV_SHUTDOWN) {
					if (so->proto == PROTO_TCP)
						return 0;
					else {
#ifdef LOOK_ASIDE_BACKEND
						errno = ESHUTDOWN;
#else
						errno = EAGAIN;
#endif
						return -1;
					}
				}
				return -1;
			}

			/* blocking mode: poll the backend until data */
			do {
				/* in blocking mode, recv from shutdown socket
				 * return 0 immediately */
				if (so->shutdown & RECV_SHUTDOWN)
					return 0;

				/* some error occured, return -1 */
				if (errno != EAGAIN)
					return -1;

				/* socket closed, return 0 */
				if (is_peer_closed(so)) {
					GLUE_DEBUG("peer closed: %d", sockfd);
					return 0;
				}

				epoll_kernel_wait(CTX(so), -1, &event, 1, 1, &rx);

				be_process(CTX(so));
			} while((rc = OPS(so)->recv(so->s, &m, 1, src_addr)) == 0);
		}
	}

	/* get one pkt */
	if (!so->option.timestamp)
		so->s->timestamp = m->timestamp;

	needcpy = 1;
	recvlen = RTE_MIN(m->pkt_len, len);
	if (flags & MSG_TRUNC) {
		if (IS_UDP(so))
			/* report the full datagram length */
			recvlen = m->pkt_len;
		else
			/* According to linux manual, data will be discarded
			 * if recv TCP stream with MSG_TRUNC flag */
			needcpy = 0;
	}

	so->rx_left = from_mbuf_to_buf(m, buf, len, flags & MSG_PEEK, needcpy);

	/* a UDP datagram not fully read is discarded (POSIX semantics) */
	if (((flags & MSG_PEEK) == 0) && IS_UDP(so) && so->rx_left) {
		rte_pktmbuf_free(so->rx_left);
		so->rx_left = NULL;
	}

	/* UDP socket only receive one pkt at one time */
	if (IS_UDP(so) || (flags & MSG_PEEK)) {
		return recvlen;
	}
	/* TCP socket: try best to fill buf */
	len -= recvlen;
	buf = (char*)buf + recvlen;
	while (len) {
		if (OPS(so)->recv(so->s, &m, 1, src_addr) == 0)
			break;

		tmplen = (m->pkt_len < len) ? m->pkt_len : len;
		so->rx_left = from_mbuf_to_buf(m, buf, tmplen, 0, needcpy);
		len -= tmplen;
		recvlen += tmplen;
		buf = (char*)buf + tmplen;
	}

	/* data still queued: keep the rx event raised for pollers */
	if (so->rx_left)
		tle_event_raise(&so->rxev);

	/* may send window increase ACK after receive*/
	if (recvlen > 0)
		be_tx_with_lock(CTX(so));

	return recvlen;
}
+
+ssize_t PRE(recv)(int sockfd, void *buf, size_t len, int flags)
+{
+ if (is_kernel_fd(sockfd))
+ return k_read(sockfd, buf, len);
+
+ return _recv(sockfd, buf, len, NULL, flags);
+}
+
/*
 * recvfrom(2) interposer. Kernel fds are passed through; stack fds go
 * via _recv(), after which *addrlen is set from the returned family.
 *
 * NOTE(review): when rc >= 0 but _recv did not fill src_addr (e.g. a
 * 0-byte peer-close return), sa_family may be read uninitialized here
 * — presumably harmless since only *addrlen is derived from it, but
 * worth confirming against _recv()'s paths.
 */
ssize_t PRE(recvfrom)(int sockfd, void *buf, size_t len, int flags,
		      struct sockaddr *src_addr, socklen_t *addrlen)
{
	ssize_t rc;
	if (is_kernel_fd(sockfd))
		return k_recv(sockfd, buf, len, flags);

	/* POSIX: an address without a length to fill is invalid */
	if (src_addr && !addrlen) {
		errno = EINVAL;
		return -1;
	}
	rc = _recv(sockfd, buf, len, src_addr, flags);
	if (rc >= 0 && src_addr) {
		if (src_addr->sa_family == AF_INET) {
			*addrlen = sizeof(struct sockaddr_in);
		} else {
			*addrlen = sizeof(struct sockaddr_in6);
		}
	}
	return rc;
}
+
/* Sentinel: the fast (non-blocking readv) path got nothing; the caller
 * must fall back to the blocking receive path. */
#define RECV_CONTINUE (-2)
/*
 * Non-blocking attempt to satisfy a recvmsg via the ops readv hook.
 * Returns >=0 on data (TX kicked for window-update ACKs), 0 on peer
 * close, -1 on error (errno set) or when non-blocking and no data,
 * RECV_CONTINUE when a blocking caller should keep waiting.
 * Note: the errno checks below rely on readv having just set errno.
 */
static inline ssize_t
try_recvmsg(struct sock *so, struct msghdr *msg, int flags)
{
	ssize_t sz;

	if (so->s == NULL) {
		/* no stream yet: unbound/unconnected socket */
		if (IS_UDP(so) && is_nonblock(so, flags))
			errno = EAGAIN;
		else
			errno = ENOTCONN;
		return -1;
	}

	sz = OPS(so)->readv(so->s, msg, flags);
	if (sz >= 0) { /* get data */
		/* may send window increase ACK after receive*/
		if (sz > 0)
			be_tx_with_lock(CTX(so));
		return sz;
	}
	else if (errno != EAGAIN) /* error occurred */
		return -1;
	else if (is_peer_closed(so)) {
		GLUE_DEBUG("peer closed: %d", so->fd);
		return 0;
	} else if (is_nonblock(so, flags))
		return -1;

	return RECV_CONTINUE;
}
+
/*
 * recvmsg(2) interposer. Tries the fast readv path first (UDP only for
 * MSG_TRUNC), then falls back to the blocking recvfrom path.
 *
 * NOTE(review): the fallback only consumes msg_iov[0] and assumes
 * msg != NULL with at least one iov entry — multi-iovec callers get a
 * short read here; confirm whether callers rely on full scatter input.
 */
ssize_t PRE(recvmsg)(int sockfd, struct msghdr *msg, int flags)
{
	ssize_t sz;
	struct sock *so;

	if (is_kernel_fd(sockfd))
		return k_recvmsg(sockfd, msg, flags);

	so = fd2sock(sockfd);

	if (so->rx_left == NULL && OPS(so)->readv &&
	    (flags & MSG_PEEK) == 0 &&
	    ((flags & MSG_TRUNC) == 0 || so->proto == PROTO_UDP)) {
		/* udp_readv supports MSG_TRUNC, tcp_readv not yet.
		 * so only udp socket implement with readv interface.
		 */
		sz = try_recvmsg(so, msg, flags);
		if (sz != RECV_CONTINUE)
			return sz;
	}

	/* 1. rx_left != NULL; 2. get no data, fall back to blocking read */

	/* fill the ancillary timestamp (or clear the control buffer) */
	if (so->rx_left != NULL && msg != NULL && msg->msg_control != NULL) {
		if (so->option.timestamp)
			tle_set_timestamp(msg, so->rx_left);
		else
			msg->msg_controllen = 0;
	}

	sz = PRE(recvfrom)(sockfd, msg->msg_iov[0].iov_base,
			   msg->msg_iov[0].iov_len, flags,
			   (struct sockaddr *)msg->msg_name,
			   &msg->msg_namelen);

	return sz;
}
+
+ssize_t PRE(read)(int fd, void *buf, size_t count)
+{
+ if (is_kernel_fd(fd))
+ return k_read(fd, buf, count);
+
+ return _recv(fd, buf, count, NULL, 0);
+}
+
/* Strip const without a compiler warning (via uintptr_t round-trip). */
#define DECONST(type, var) ((type)(uintptr_t)(const void *)(var))

/*
 * readv(2) interposer: fast path through the ops readv hook, falling
 * back to the blocking receive path when nothing is available.
 */
ssize_t PRE(readv)(int fd, const struct iovec *iov, int iovcnt)
{
	ssize_t sz;
	struct sock *so;
	struct msghdr msg;

	if (is_kernel_fd(fd))
		return k_readv(fd, iov, iovcnt);

	/* lazily assign this app thread a pseudo lcore id */
	if (RTE_PER_LCORE(_lcore_id) == LCORE_ID_ANY) {
		RTE_PER_LCORE(_lcore_id) = rte_atomic32_add_return(&thr_cnt, 1);
	}

	so = fd2sock(fd);

	if (so->rx_left == NULL && OPS(so)->readv) {
		/* wrap the iovec in a msghdr for the readv hook */
		memset(&msg, 0, sizeof(msg));
		msg.msg_iov = DECONST(struct iovec *, iov);
		msg.msg_iovlen = iovcnt;
		sz = try_recvmsg(so, &msg, 0);
		if (sz != RECV_CONTINUE)
			return sz;
	}

	/* 1. rx_left != NULL; 2. get no data, fall back to blocking read */

	/* fixme: when so->rx_left != NULL, also needs readv.
	 * maybe need to modify readv interface args of ops */
	return _recv(fd, iov[0].iov_base, iov[0].iov_len, NULL, 0);
}
+
/*
 * Core send path shared by send/sendto/write.
 *
 * Copies @buf into freshly allocated mbufs (one per DATAROOM chunk) and
 * pushes them through the socket ops. Blocking sockets retry until all
 * mbufs are accepted or the error event fires; non-blocking TCP sends
 * are clipped to the send-buffer size. Unsent mbufs are freed and
 * subtracted from the returned byte count.
 *
 * Returns bytes queued, or -1 (errno set by the failed layer).
 */
static ssize_t
_send(int sockfd, const void *buf, size_t len,
      const struct sockaddr *peer, int flags)
{
	struct sock *so = fd2sock(sockfd);
	struct rte_mempool *mp = get_mempool_by_socket(0); /* fix me */
	uint16_t nb_mbufs = (len + RTE_MBUF_DEFAULT_DATAROOM - 1)
			    / RTE_MBUF_DEFAULT_DATAROOM;
	uint16_t i, cnt, copy_len;
	int rc;
	/* +1 avoids a zero-length VLA when len == 0 (presumably) */
	struct rte_mbuf *mbufs[nb_mbufs + 1];
	size_t done = 0;
	uint32_t left = 0;
	char *dst;
	int blocking = !is_nonblock(so, flags);

	/* lazily assign this app thread a pseudo lcore id */
	if (RTE_PER_LCORE(_lcore_id) == LCORE_ID_ANY) {
		RTE_PER_LCORE(_lcore_id) = rte_atomic32_add_return(&thr_cnt, 1);
	}

	/* non-blocking TCP: clip to the send-buffer size */
	if (!blocking && len > def_sndbuf && so->proto == PROTO_TCP) {
		len = def_sndbuf;
		nb_mbufs = (len + RTE_MBUF_DEFAULT_DATAROOM - 1)
			   / RTE_MBUF_DEFAULT_DATAROOM;
	}

	if (unlikely(len == 0)) {
		if (so->proto == PROTO_TCP)
			return 0;
		else
			/* UDP: a zero-length datagram is still a packet */
			nb_mbufs = 1;
	}

	if (unlikely(len > MAX_UDP_PKT_LEN && IS_UDP(so))) {
		errno = EMSGSIZE;
		return -1;
	}

	if (blocking)
		be_process(get_ctx());

	if (unlikely(rte_pktmbuf_alloc_bulk(mp, mbufs, nb_mbufs) < 0)) {
		errno = ENOMEM;
		return -1;
	}

	/* scatter the user buffer across the mbufs */
	for (i = 0; i < nb_mbufs; ++i) {
		copy_len = RTE_MIN((size_t)RTE_MBUF_DEFAULT_DATAROOM,
				   len - done);
		dst = rte_pktmbuf_mtod(mbufs[i], char *);
		rte_memcpy(dst, (const char *)buf + done, copy_len);
		done += copy_len;
		mbufs[i]->data_len = copy_len;
		mbufs[i]->pkt_len = copy_len;
	}

	cnt = 0;
do_send:
	rc = OPS(so)->send(so, mbufs + cnt, nb_mbufs - cnt, peer);

	cnt += rc;

	if (cnt > 0)
		be_tx_with_lock(CTX(so));

	if (cnt > 0 && blocking)
		be_process(get_ctx());

	/* blocking: retry while progress is possible and no error event */
	if (blocking &&
	    cnt < nb_mbufs &&
	    (rc > 0 || errno == EAGAIN) &&
	    tle_event_state(&so->erev) != TLE_SEV_UP) {
		be_process(get_ctx());
		goto do_send;
	}

	/* free whatever was not accepted and discount it */
	for (i = cnt; i < nb_mbufs; ++i) {
		left += mbufs[i]->pkt_len;
		rte_pktmbuf_free_seg(mbufs[i]);
	}

	if (cnt == 0)
		return -1;
	else
		return len - left;
}
+
+ssize_t PRE(send)(int sockfd, const void *buf, size_t len, int flags)
+{
+ if (is_kernel_fd(sockfd))
+ return k_write(sockfd, buf, len);
+
+ /* MSG_NOSIGNAL means "Do not generate SIGPIPE". Ignore this flag */
+ flags &= ~MSG_NOSIGNAL;
+
+ return _send(sockfd, buf, len, NULL, flags);
+}
+
+ssize_t PRE(sendto)(int sockfd, const void *buf, size_t len, int flags,
+ const struct sockaddr *dest_addr, socklen_t addrlen)
+{
+ if (is_kernel_fd(sockfd))
+ return k_sendto(sockfd, buf, len, flags, dest_addr, addrlen);
+
+ /* MSG_NOSIGNAL means "Do not generate SIGPIPE". Ignore this flag */
+ flags &= ~MSG_NOSIGNAL;
+
+ return _send(sockfd, buf, len, dest_addr, flags);
+}
+
/*
 * sendmsg(2) interposer: prefer the gather-capable ops writev hook;
 * fall back to the plain send path when it is absent or would block.
 *
 * NOTE(review): the fallback only sends msg_iov[0] — multi-iovec
 * callers get a short write on that path; confirm callers tolerate it.
 */
ssize_t PRE(sendmsg)(int sockfd, const struct msghdr *msg, int flags)
{
	ssize_t ret;
	struct sock *so;

	if (is_kernel_fd(sockfd))
		return k_sendmsg(sockfd, msg, flags);

	/* MSG_NOSIGNAL means "Do not generate SIGPIPE". Ignore this flag */
	flags &= ~MSG_NOSIGNAL;

	so = fd2sock(sockfd);
	if (OPS(so)->writev) {
		ret = OPS(so)->writev(so, msg->msg_iov, msg->msg_iovlen,
				      msg->msg_name);
		if (ret < 0) {
			/* EAGAIN on a blocking socket falls through below */
			if (errno != EAGAIN || is_nonblock(so, flags))
				return -1;
		} else {
			/* TODO: blocking && ret < total length */
			be_tx_with_lock(CTX(so));
			return ret;
		}

		/* fall through to blocking send */
	}

	return _send(sockfd, msg->msg_iov[0].iov_base, msg->msg_iov[0].iov_len,
		     (struct sockaddr *)msg->msg_name, flags);
}
+
+ssize_t PRE(write)(int fd, const void *buf, size_t count)
+{
+ if (is_kernel_fd(fd))
+ return k_write(fd, buf, count);
+
+ return _send(fd, buf, count, NULL, 0);
+}
+
/*
 * writev(2) interposer: prefer the gather-capable ops writev hook;
 * fall back to the plain send path when it is absent or would block.
 *
 * NOTE(review): the fallback only sends iov[0] — multi-iovec callers
 * get a short write on that path; confirm callers tolerate it.
 */
ssize_t PRE(writev)(int fd, const struct iovec *iov, int iovcnt)
{
	ssize_t ret;
	struct sock *so;

	if (is_kernel_fd(fd))
		return k_writev(fd, iov, iovcnt);

	/* lazily assign this app thread a pseudo lcore id */
	if (RTE_PER_LCORE(_lcore_id) == LCORE_ID_ANY) {
		RTE_PER_LCORE(_lcore_id) = rte_atomic32_add_return(&thr_cnt, 1);
	}

	so = fd2sock(fd);
	if (OPS(so)->writev) {
		ret = OPS(so)->writev(so, iov, iovcnt, NULL);
		if (ret < 0) {
			/* EAGAIN on a blocking socket falls through below */
			if (errno != EAGAIN || is_nonblock(so, 0))
				return -1;
		} else {
			/* TODO: blocking && ret < total length */
			be_tx_with_lock(CTX(so));
			return ret;
		}

		/* fall through to blocking send */
	}

	return _send(fd, iov[0].iov_base, iov[0].iov_len, NULL, 0);
}
+
+/* advanced functions */
+ssize_t PRE(splice)(int fd_in, loff_t *off_in, int fd_out,
+ loff_t *off_out, size_t len, unsigned int flags)
+{
+ if (is_kernel_fd(fd_in) && is_kernel_fd(fd_out))
+ return k_splice(fd_in, off_in, fd_out, off_out, len, flags);
+
+ rte_panic("splice is not supported yet");
+ errno = EOPNOTSUPP;
+ return -1;
+}
+
+ssize_t PRE(sendfile)(int out_fd, int in_fd, off_t *offset, size_t count)
+{
+ if (is_kernel_fd(out_fd) && is_kernel_fd(in_fd))
+ return k_sendfile(out_fd, in_fd, offset, count);
+
+ rte_panic("sendfile is not supported yet");
+ errno = EOPNOTSUPP;
+ return -1;
+}
diff --git a/lib/libtle_glue/select.c b/lib/libtle_glue/select.c
new file mode 100644
index 0000000..b3b8539
--- /dev/null
+++ b/lib/libtle_glue/select.c
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <signal.h>
+#include <sys/select.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "fd.h"
+#include "ctx.h"
+#include "sym.h"
+#include "log.h"
+#include "util.h"
+#include "internal.h"
+#include "tle_glue.h"
+
+#define FD_ZERO_N(s, n) do { memset((s)->fds_bits, 0, n/sizeof(long)); } while(0)
+
+/*
+ * Scan the user-space fd range [fd_table.fd_base, nfds) of @fdset and
+ * test whether the tle_event matching @event (EPOLLIN/EPOLLOUT/EPOLLERR)
+ * is already UP.
+ *
+ * For each ready fd, *total is incremented and the bit is left set.
+ * For each not-yet-ready fd, the bit is cleared and the function's
+ * return value (count of fds still waiting) is incremented.
+ */
+static int
+fdset_to_events_user(int nfds, fd_set *fdset, int *total, int event)
+{
+	int i, num = 0;
+	struct sock *so;
+	const struct tle_event *ev;
+
+	for (i = fd_table.fd_base; i < nfds; ++i) {
+		if (!FD_ISSET(i, fdset))
+			continue;
+
+		so = fd2sock(i); /* fix me: check if fd is opened */
+
+		switch (event) {
+		case EPOLLIN:
+			ev = &so->rxev;
+			break;
+		case EPOLLOUT:
+			ev = &so->txev;
+			break;
+		case EPOLLERR:
+			ev = &so->erev;
+			break;
+		default:
+			rte_panic("non-sense value\n");
+		}
+		/* Check event is ready */
+		if (TLE_SEV_UP == tle_event_state(ev)) {
+			*total = *total + 1;
+		} else {
+			FD_CLR(i, fdset);
+			num++;
+		}
+
+		/* We fill sock->event here as we need this when
+		 * we filter events in poll_common(). But it was
+		 * originally set by epoll_ctl(). Now we have to
+		 * assume that there are no application which
+		 * uses epoll/poll/select at the same time.
+		 */
+		so->event.events |= event;
+		so->event.data.u32 = i;
+	}
+
+	return num;
+}
+
+/*
+ * Register every kernel fd set in @fdset with the epoll instance @efd
+ * for @event; returns how many fds were registered.
+ */
+static int
+fdset_to_events_kernel(int nfds, fd_set *fdset, int efd, int event)
+{
+	struct epoll_event kev;
+	int fd, cnt = 0;
+
+	for (fd = 0; fd < nfds; fd++) {
+		if (!FD_ISSET(fd, fdset))
+			continue;
+
+		kev.events = event;
+		kev.data.u32 = fd;
+		k_epoll_ctl(efd, EPOLL_CTL_ADD, fd, &kev);
+		cnt++;
+	}
+
+	return cnt;
+}
+
+/*
+ * select(2) interposer.
+ *
+ * Step 0 drives the backend once; step 1 returns immediately if any
+ * user-space fd is already ready; step 2 delegates pure-kernel waits to
+ * libc; step 3 mirrors the kernel fds into a temporary epoll instance
+ * and waits on both worlds via poll_common().
+ */
+int
+PRE(select)(int nfds, fd_set *readfds, fd_set *writefds,
+	    fd_set *exceptfds, struct timeval *timeout)
+{
+	int to;
+	struct glue_ctx *ctx;
+	int j, efd, total = 0, max = 0;
+
+	/* thread <> context binding happens here */
+	if (RTE_PER_LCORE(glue_ctx) == NULL) {
+		ctx = &ctx_array[glue_ctx_alloc()];
+		RTE_PER_LCORE(glue_ctx) = ctx;
+	} else
+		ctx = RTE_PER_LCORE(glue_ctx);
+
+	/* step 0, process some packets */
+	be_process(ctx);
+
+	/* step 1, check if any userspace events are ready */
+
+	if (readfds)
+		max += fdset_to_events_user(nfds, readfds,
+					    &total, EPOLLIN);
+	if (writefds)
+		max += fdset_to_events_user(nfds, writefds,
+					    &total, EPOLLOUT);
+	if (exceptfds)
+		/* BUGFIX: scan exceptfds here; this previously scanned
+		 * writefds a second time. */
+		max += fdset_to_events_user(nfds, exceptfds,
+					    &total, EPOLLERR);
+	if (total > 0) {
+		/* userspace events go firstly: clear the kernel-fd range
+		 * (bits below fd_base) and report only user fds. */
+		if (readfds)
+			FD_ZERO_N(readfds, fd_table.fd_base);
+		if (writefds)
+			FD_ZERO_N(writefds, fd_table.fd_base);
+		if (exceptfds)
+			FD_ZERO_N(exceptfds, fd_table.fd_base);
+
+		return total;
+	}
+
+	/* step 2, only wait for kernel events? */
+	if (max == 0)
+		return k_select(nfds, readfds, writefds, exceptfds, timeout);
+
+	/* step 3, slow path: wait for I/O and kernel events */
+	efd = k_epoll_create(1);
+	if (efd < 0)
+		rte_panic("k_epoll_create failed %d", errno);
+
+	nfds = RTE_MIN(nfds, fd_table.fd_base);
+	if (readfds)
+		max += fdset_to_events_kernel(nfds, readfds,
+					      efd, EPOLLIN);
+	if (writefds)
+		max += fdset_to_events_kernel(nfds, writefds,
+					      efd, EPOLLOUT);
+	if (exceptfds)
+		max += fdset_to_events_kernel(nfds, exceptfds,
+					      efd, EPOLLERR);
+
+	struct epoll_event events[max];
+
+	if (timeout)
+		to = timeout->tv_sec * 1000 + timeout->tv_usec / 1000;
+	else
+		to = -1;
+	total = poll_common(ctx, events, max, to, efd);
+
+	k_close(efd);
+	for (j = 0; j < total; ++j) {
+		/* BUGFIX: a returned event may belong to a set the caller
+		 * did not pass; guard every FD_SET against a NULL set. */
+		if ((events[j].events & EPOLLIN) && readfds)
+			FD_SET(events[j].data.fd, readfds);
+
+		if ((events[j].events & EPOLLOUT) && writefds)
+			FD_SET(events[j].data.fd, writefds);
+
+		if ((events[j].events & (EPOLLHUP | EPOLLERR)) && exceptfds)
+			FD_SET(events[j].data.fd, exceptfds);
+	}
+	return total;
+}
+
+/*
+ * pselect(2) interposer: converts the timespec into a timeval and
+ * delegates to select().  Signal-mask handling is not implemented.
+ */
+int
+PRE(pselect)(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds,
+	     const struct timespec *timeout, const sigset_t *sigmask)
+{
+	struct timeval tv;
+	struct timeval *ptv = NULL;
+
+	if (sigmask != NULL)
+		rte_panic("pselect with signal is not supported");
+
+	if (timeout != NULL) {
+		/* nanoseconds -> microseconds */
+		tv.tv_sec = timeout->tv_sec;
+		tv.tv_usec = timeout->tv_nsec / 1000;
+		ptv = &tv;
+	}
+
+	return select(nfds, readfds, writefds, exceptfds, ptv);
+}
diff --git a/lib/libtle_glue/sock.h b/lib/libtle_glue/sock.h
new file mode 100644
index 0000000..fcd6362
--- /dev/null
+++ b/lib/libtle_glue/sock.h
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef _SOCK_H_
+#define _SOCK_H_
+
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <tle_event.h>
+#include <tle_ctx.h>
+
+#include "ctx.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern unsigned int def_sndbuf;
+extern unsigned int def_rcvbuf;
+
+#ifndef TCP_FASTOPEN
+#define TCP_FASTOPEN 23
+#endif
+
+#ifndef TCP_USER_TIMEOUT
+#define TCP_USER_TIMEOUT 18
+#endif
+
+#ifndef TCP_FASTOPEN_CONNECT
+#define TCP_FASTOPEN_CONNECT 30
+#endif
+
+struct sock;
+
+struct proto {
+ int (*setsockopt)(struct sock *sk, int optname, const void *optval,
+ socklen_t optlen);
+ int (*getsockopt)(struct sock *sk, int optname, void *optval,
+ socklen_t *option);
+ int (*getname)(struct sock *sk, struct sockaddr *addr, int peer);
+
+ int (*bind)(struct sock *sk, const struct sockaddr *addr);
+ int (*listen)(struct sock *sk, int backlog);
+ int (*connect)(struct sock *sk, const struct sockaddr *addr);
+ int (*accept)(struct sock *sk, struct sockaddr *addr,
+ socklen_t *addrlen, int flags);
+
+ ssize_t (*recv)(struct tle_stream *s, struct rte_mbuf *pkt[],
+ uint16_t num, struct sockaddr *addr);
+ ssize_t (*send)(struct sock *sk, struct rte_mbuf *pkt[],
+ uint16_t num, const struct sockaddr *dst_addr);
+
+ ssize_t (*readv)(struct tle_stream *s, struct msghdr *msg, int flags);
+ ssize_t (*writev)(struct sock *sk, const struct iovec *iov,
+ int iovcnt, const struct sockaddr *dst_addr);
+
+ int (*shutdown)(struct sock *sk, int how);
+ int (*close)(struct tle_stream *s);
+
+ void (*update_cfg)(struct sock *sk);
+
+ char name[32];
+};
+
+enum {
+ PROTO_TCP,
+ PROTO_UDP
+};
+
+#define RECV_SHUTDOWN 1
+#define SEND_SHUTDOWN 2
+
+extern struct proto udp_prot;
+extern struct proto tcp_prot;
+extern struct proto *supported_proto_ops[];
+
+/* Per-fd user-space socket object; cache-line aligned. */
+struct sock {
+	int fd;			/* the glue fd owning this sock */
+	uint32_t cid:8,		/* ctx id for indexing ctx_array */
+		domain:8,	/* for AF_INET, AF_INET6 */
+		proto:8,	/* PROTO_TCP, PROTO_UDP */
+		valid:1,	/* slot in use */
+		epoll:1,	/* this fd is an epoll instance */
+		ubind:1,
+		ubindany:1,
+		nonblock:1,	/* O_NONBLOCK set */
+		tcp_connected:1,
+		shutdown:2;	/* RECV_SHUTDOWN / SEND_SHUTDOWN bits */
+	struct tle_stream *s;	/* underlying TLDK stream, NULL until bound */
+	struct rte_mbuf *rx_left;	/* partially consumed rx mbuf */
+	tle_stream_options_t option;
+	union {
+		struct epoll_event event;	/* regular socket */
+		int shadow_efd;			/* epoll==1: kernel epoll fd */
+	};
+	struct tle_event txev;
+	struct tle_event rxev;
+	struct tle_event erev;
+} __rte_cache_aligned;
+
+#define CTX(so) (&ctx_array[so->cid])
+#define OPS(so) (supported_proto_ops[so->proto])
+#define IS_TCP(so) (so->proto == PROTO_TCP)
+#define IS_UDP(so) (so->proto == PROTO_UDP)
+
+/* A call is non-blocking if MSG_DONTWAIT is passed or O_NONBLOCK is set. */
+static inline int
+is_nonblock(struct sock *so, int flags)
+{
+	if (flags & MSG_DONTWAIT)
+		return 1;
+
+	return so->nonblock;
+}
+
+/* Return the TLDK context (TCP or UDP) backing this socket. */
+static inline struct tle_ctx *
+get_sock_ctx(struct sock *so)
+{
+	struct glue_ctx *gctx = CTX(so);
+
+	return IS_TCP(so) ? gctx->tcp_ctx : gctx->udp_ctx;
+}
+
+/* Minimal sockaddr size for @family; 0 for unsupported families. */
+static inline size_t
+get_sockaddr_len(sa_family_t family)
+{
+	if (family == AF_INET)
+		return sizeof(struct sockaddr_in);
+	if (family == AF_INET6)
+		return sizeof(struct sockaddr_in6);
+	if (family == AF_UNSPEC)
+		return sizeof(sa_family_t);
+
+	return 0;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*_SOCK_H_ */
diff --git a/lib/libtle_glue/socket.c b/lib/libtle_glue/socket.c
new file mode 100644
index 0000000..31b28be
--- /dev/null
+++ b/lib/libtle_glue/socket.c
@@ -0,0 +1,720 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "sym.h"
+
+#include <stdarg.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+
+#include "tle_glue.h"
+#include "fd.h"
+#include "log.h"
+#include "util.h"
+#include "internal.h"
+#include "sock.h"
+
+/* Per-protocol ops table, indexed by sock->proto (PROTO_TCP/PROTO_UDP). */
+struct proto *supported_proto_ops[] = {
+	[PROTO_TCP] = &tcp_prot,
+	[PROTO_UDP] = &udp_prot,
+};
+
+/* for setup, settings, and destroy */
+int PRE(socket)(int domain, int type, int protocol)
+{
+ int fd;
+ struct sock *so;
+
+ if ((domain != AF_INET && domain != AF_INET6) ||
+ (type != SOCK_STREAM && type != SOCK_DGRAM))
+ return k_socket(domain, type, protocol);
+
+ if (domain == AF_INET) {
+ if (default_ctx->ipv4 == 0 && !default_ctx->lo4_enabled) {
+ errno = EAFNOSUPPORT;
+ return -1;
+ }
+ } else {
+ if (IN6_IS_ADDR_UNSPECIFIED(&default_ctx->ipv6) &&
+ !default_ctx->lo6_enabled) {
+ errno = EAFNOSUPPORT;
+ return -1;
+ }
+ }
+
+ fd = get_unused_fd();
+ if (fd < 0) {
+ errno = ENFILE;
+ return -1;
+ }
+ so = fd2sock(fd);
+ so->cid = get_cid();
+ if (type == SOCK_STREAM)
+ so->proto = PROTO_TCP;
+ else /* type == SOCK_DGRAM */
+ so->proto = PROTO_UDP;
+
+ so->domain = domain;
+ so->option.raw = 0;
+ so->option.mulloop = 1;
+ so->option.multtl = 1;
+ if (type == SOCK_STREAM) {
+ so->option.tcpquickack = 1;
+ /* linux default value: 2 hours */
+ so->option.keepidle = 2 * 60 * 60;
+ /* linux default value: 75seconds */
+ so->option.keepintvl = 75;
+ /* linux default value: 9 */
+ so->option.keepcnt = 9;
+ }
+
+ sock_alloc_events(so);
+
+ GLUE_DEBUG("socket fd = %d", fd);
+ printf("socket fd = %d", fd);
+ return fd;
+}
+
+/*
+ * bind(2) interposer: validates the address length, rejects re-binding,
+ * and delegates to the protocol's bind op.
+ */
+int PRE(bind)(int sockfd, const struct sockaddr *addr, socklen_t addrlen)
+{
+	struct sock *so;
+
+	if (is_kernel_fd(sockfd))
+		return k_bind(sockfd, addr, addrlen);
+
+	so = fd2sock(sockfd);
+
+	/* a stream already exists: the socket is bound to an address */
+	if (so->s != NULL) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	if (addrlen < get_sockaddr_len(addr->sa_family)) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	so->cid = get_cid(); /* allow ctx reset as stream is null */
+	if (OPS(so)->bind == NULL) {
+		errno = EOPNOTSUPP;
+		return -1;
+	}
+
+	return OPS(so)->bind(so, addr);
+}
+
+/* listen(2) interposer: delegates to the protocol's listen op. */
+int PRE(listen)(int sockfd, int backlog)
+{
+	struct sock *so;
+
+	if (is_kernel_fd(sockfd))
+		return k_listen(sockfd, backlog);
+
+	so = fd2sock(sockfd);
+	if (OPS(so)->listen == NULL) {
+		errno = EOPNOTSUPP;
+		return -1;
+	}
+
+	return OPS(so)->listen(so, backlog);
+}
+
+/* accept(2) interposer: delegates to the protocol's accept op (no flags). */
+int PRE(accept)(int sockfd, struct sockaddr *addr, socklen_t *addrlen)
+{
+	struct sock *so;
+
+	if (is_kernel_fd(sockfd))
+		return k_accept(sockfd, addr, addrlen);
+
+	so = fd2sock(sockfd);
+	if (OPS(so)->accept == NULL) {
+		errno = EOPNOTSUPP;
+		return -1;
+	}
+
+	return OPS(so)->accept(so, addr, addrlen, 0);
+}
+
+/*
+ * accept4(2) interposer: accept() plus SOCK_NONBLOCK inheritance on the
+ * newly created socket.
+ */
+int PRE(accept4)(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags)
+{
+	int newfd;
+	struct sock *so;
+
+	if (is_kernel_fd(sockfd))
+		return k_accept4(sockfd, addr, addrlen, flags);
+
+	newfd = PRE(accept)(sockfd, addr, addrlen);
+	if (newfd < 0)
+		return newfd;
+
+	/* inherit NONBLOCK flag */
+	if (flags & SOCK_NONBLOCK) {
+		so = fd2sock(newfd);
+		so->nonblock = 1;
+	}
+
+	return newfd;
+}
+
+/*
+ * connect(2) interposer: validates the address length and delegates to
+ * the protocol's connect op.
+ */
+int PRE(connect)(int sockfd, const struct sockaddr *addr, socklen_t addrlen)
+{
+	struct sock *so;
+
+	if (is_kernel_fd(sockfd))
+		return k_connect(sockfd, addr, addrlen);
+
+	if (addrlen < get_sockaddr_len(addr->sa_family)) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	so = fd2sock(sockfd);
+	so->cid = get_cid();
+
+	/* NOTE(review): mac_check() only on blocking sockets — presumably
+	 * pre-resolves the peer MAC so connect cannot stall; confirm. */
+	if (!(is_nonblock(so, 0)))
+		mac_check(CTX(so), addr);
+
+	if (OPS(so)->connect)
+		return OPS(so)->connect(so, addr);
+
+	errno = EOPNOTSUPP;
+	return -1;
+}
+
+/* Process-wide defaults reported/overwritten via SO_SNDBUF/SO_RCVBUF. */
+unsigned int def_sndbuf = 212992;
+unsigned int def_rcvbuf = 212992;
+/* NOTE(review): written by setsockopt(SO_LINGER) but never read back;
+ * also shared across sockets/threads — confirm this is intentional. */
+static struct linger ling;
+
+/*
+ * getsockopt(2) interposer.
+ *
+ * Handles IP/IPv6/SOL_SOCKET options locally from sock->option and
+ * global defaults; SOL_TCP/SOL_UDP are delegated to the protocol ops.
+ * Unknown options log a warning and fail with EOPNOTSUPP.
+ * NOTE(review): optval is written through a union without checking
+ * *optlen is large enough — confirm callers always pass enough room.
+ */
+int PRE(getsockopt)(int sockfd, int level, int optname,
+		    void *optval, socklen_t *optlen)
+{
+	struct sock *so;
+	union {
+		int val;
+		uint64_t val64;
+		struct linger ling;
+		struct timeval tm;
+	} *p = optval;
+
+
+	if (is_kernel_fd(sockfd))
+		return k_getsockopt(sockfd, level, optname, optval, optlen);
+
+	/* NOTE(review): returns -1 without setting errno here */
+	if (!optval && !optlen)
+		return -1;
+
+	so = fd2sock(sockfd);
+
+	switch (level) {
+	case IPPROTO_IP:
+		switch (optname) {
+		case IP_OPTIONS:
+			*optlen = 0;
+			return 0;
+		case IP_MULTICAST_LOOP:
+			p->val = so->option.mulloop;
+			return 0;
+		case IP_MULTICAST_TTL:
+			p->val = so->option.multtl;
+			return 0;
+		}
+		break;
+	case IPPROTO_IPV6:
+		switch (optname) {
+		case IPV6_V6ONLY:
+			p->val = so->option.ipv6only;
+			return 0;
+		}
+		break;
+	case SOL_SOCKET:
+		/* man socket(7), see /usr/include/asm-generic/socket.h */
+		switch (optname) {
+		case SO_REUSEADDR:
+			p->val = so->option.reuseaddr;
+			return 0;
+		case SO_REUSEPORT:
+			p->val = so->option.reuseport;
+			return 0;
+		case SO_ERROR:
+			/* error event pending -> report ECONNREFUSED */
+			if (TLE_SEV_DOWN == tle_event_state(&so->erev))
+				p->val = 0;
+			else
+				p->val = ECONNREFUSED;
+			/* fixe me: ETIMEDOUT */
+			return 0;
+		case SO_LINGER:
+			p->ling.l_onoff = 0;
+			return 0;
+		case SO_SNDBUF:
+			/* global default, not a per-socket value */
+			p->val = def_sndbuf;
+			return 0;
+		case SO_RCVBUF:
+			p->val = def_rcvbuf;
+			return 0;
+		case SO_ACCEPTCONN:
+			if (IS_TCP(so)
+			    && TCP_STREAM(so->s)->tcb.state == TCP_ST_LISTEN)
+				p->val = 1;
+			else
+				p->val = 0;
+			return 0;
+		case SO_KEEPALIVE:
+			p->val = so->option.keepalive;
+			return 0;
+		case SO_TYPE:
+			if (IS_TCP(so))
+				p->val = SOCK_STREAM;
+			else
+				p->val = SOCK_DGRAM;
+			return 0;
+		case SO_OOBINLINE:
+			p->val = so->option.oobinline;
+			return 0;
+		case SO_TIMESTAMP:
+			p->val = so->option.timestamp;
+			return 0;
+		case SO_PROTOCOL:
+			if (so->proto == PROTO_TCP)
+				p->val = IPPROTO_TCP;
+			else
+				p->val = IPPROTO_UDP;
+			return 0;
+		default:
+			break;
+		}
+
+		break;
+	case SOL_TCP:
+	case SOL_UDP:
+		return OPS(so)->getsockopt(so, optname, optval, optlen);
+	}
+
+	GLUE_LOG(WARNING, "getsockopt(%d) with level = %d, optname = %d",
+		 sockfd, level, optname);
+	errno = EOPNOTSUPP;
+	return -1;
+}
+
+/*
+ * setsockopt(2) interposer.
+ *
+ * Stores the option in sock->option and, when a stream already exists,
+ * mirrors it into the live stream options.  IPPROTO_TCP/UDP options are
+ * delegated to the protocol ops.  Several options are accepted but
+ * ignored (no-ops) for application compatibility.
+ */
+int PRE(setsockopt)(int sockfd, int level, int optname,
+		    const void *optval, socklen_t optlen)
+{
+	int val;
+	struct sock *so;
+	if (is_kernel_fd(sockfd))
+		return k_setsockopt(sockfd, level, optname, optval, optlen);
+	if (!optval && !optlen)
+		return -1;
+
+	val = 0; /* just to make compiler happy */
+	switch (optlen) {
+	case sizeof(char):
+		val = *(const char *)optval;
+		break;
+	case sizeof(int):
+		val = *(const int *)optval;
+		break;
+	}
+
+	so = fd2sock(sockfd);
+
+	switch (level) {
+	case IPPROTO_IP:
+		switch (optname) {
+		case IP_RECVERR:
+			/* needed by netperf */
+			return 0;
+		case IP_MULTICAST_LOOP:
+			if (val == 0)
+				so->option.mulloop = 0;
+			else
+				so->option.mulloop = 1;
+			if (so->s != NULL)
+				so->s->option.mulloop = so->option.mulloop;
+			return 0;
+		case IP_MULTICAST_TTL:
+			if (val > 255 || val < -1) {
+				errno = EINVAL;
+				return -1;
+			}
+			/* -1 means "reset to default" (TTL 1) */
+			if(val == -1) {
+				val = 1;
+			}
+			so->option.multtl = val;
+			if (so->s != NULL)
+				so->s->option.multtl = so->option.multtl;
+			return 0;
+		case IP_ADD_MEMBERSHIP:
+			/* validated, but multicast join is unsupported */
+			if (optlen < sizeof(struct ip_mreq)) {
+				errno = EINVAL;
+				return -1;
+			}
+			const struct ip_mreq* mreq = (const struct ip_mreq*)optval;
+			if (mreq->imr_multiaddr.s_addr == INADDR_ANY) {
+				errno = EINVAL;
+				return -1;
+			}
+			errno = EOPNOTSUPP;
+			return -1;
+		case IP_MTU_DISCOVER:
+			return 0;
+		case IP_TOS:
+			return 0;
+		case IP_RECVTOS:
+			return 0;
+		}
+		break;
+	case IPPROTO_IPV6:
+		switch (optname) {
+		case IPV6_V6ONLY:
+			if (val == 0)
+				so->option.ipv6only = 0;
+			else
+				so->option.ipv6only = 1;
+			if (so->s != NULL)
+				so->s->option.ipv6only = so->option.ipv6only;
+			return 0;
+		case IPV6_TCLASS:
+			return 0;
+		case IPV6_RECVTCLASS:
+			return 0;
+		}
+		break;
+	case SOL_SOCKET:
+		switch (optname) {
+		case SO_REUSEADDR:
+			if (val == 0)
+				so->option.reuseaddr = 0;
+			else
+				so->option.reuseaddr = 1;
+			if (so->s != NULL)
+				so->s->option.reuseaddr = so->option.reuseaddr;
+			return 0;
+		case SO_LINGER:
+			/* NOTE(review): writes the file-scope static `ling`
+			 * which is never read again and is shared across
+			 * threads — looks like dead/racy state; confirm. */
+			ling = *(const struct linger *)optval;
+			if (ling.l_onoff == 0)
+				return 0;
+			else {
+				GLUE_LOG(ERR, "app is enabling SO_LINGER which is not really supported");
+				return 0;
+			}
+			break;
+		case SO_KEEPALIVE:
+			if (val == 0)
+				so->option.keepalive = 0;
+			else
+				so->option.keepalive = 1;
+			if (so->s != NULL) {
+				so->s->option.keepalive = so->option.keepalive;
+				if (so->proto == PROTO_TCP)
+					tle_tcp_stream_set_keepalive(so->s);
+			}
+			return 0;
+		case SO_REUSEPORT:
+			if (val == 0)
+				so->option.reuseport = 0;
+			else
+				so->option.reuseport = 1;
+			if (so->s != NULL)
+				so->s->option.reuseport = so->option.reuseport;
+			return 0;
+		case SO_SNDBUF:
+			/* global default shared by all sockets */
+			def_sndbuf = val;
+			return 0;
+		case SO_RCVBUF:
+			def_rcvbuf = val;
+			return 0;
+		case SO_DONTROUTE:
+			/* needed by netperf */
+			return 0;
+		case SO_BROADCAST:
+			/* needed by nc */
+			/* todo: only supported for DGRAM */
+			return 0;
+		case SO_TIMESTAMP:
+			so->option.timestamp = !!val;
+			if (so->s != NULL)
+				so->s->option.timestamp = so->option.timestamp;
+			return 0;
+		case SO_OOBINLINE:
+			if (val == 0)
+				so->option.oobinline = 0;
+			else
+				so->option.oobinline = 1;
+			if (so->s != NULL)
+				so->s->option.oobinline = so->option.oobinline;
+			return 0;
+		default:
+			break;
+		}
+		break;
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+		return OPS(so)->setsockopt(so, optname, optval, optlen);
+	}
+
+	GLUE_LOG(WARNING, "setsockopt(%d) with level = %d, optname = %d\n",
+		 sockfd, level, optname);
+	errno = EOPNOTSUPP;
+	return -1;
+}
+
+/*
+ * Refer to glibc/sysdeps/unix/sysv/linux/fcntl.c
+ */
+int PRE(fcntl)(int fd, int cmd, ...)
+{
+ int rc;
+ void *arg;
+ va_list ap;
+ struct sock *so;
+
+ va_start(ap, cmd);
+ arg = va_arg(ap, void *);
+ va_end(ap);
+
+ if (is_kernel_fd(fd))
+ return k_fcntl(fd, cmd, arg);
+
+ so = fd2sock(fd);
+ switch (cmd) {
+ case F_SETFL:
+ if ((unsigned long)arg & O_NONBLOCK)
+ so->nonblock = 1;
+ else
+ so->nonblock = 0;
+ rc = 0;
+ break;
+ case F_GETFL:
+ if (so->nonblock)
+ rc = O_NONBLOCK | O_RDWR;
+ else
+ rc = O_RDWR;
+ break;
+ case F_SETFD:
+ rc = 0;
+ break;
+ default:
+ rc = -1;
+ errno = EOPNOTSUPP;
+ GLUE_LOG(WARNING, "fcntl(%d) with cmd = %d", fd, cmd);
+ }
+
+ return rc;
+}
+
+/*
+ * Refer to musl/src/misc/ioctl.c
+ */
+int PRE(ioctl)(int fd, unsigned long int request, ...)
+{
+ int rc;
+ void *arg;
+ va_list ap;
+ uint16_t left;
+ struct sock *so;
+ struct rte_mbuf *m;
+
+ va_start(ap, request);
+ arg = va_arg(ap, void *);
+ va_end(ap);
+
+ if (is_kernel_fd(fd))
+ return k_ioctl(fd, request, arg);
+
+ so = fd2sock(fd);
+
+ switch (request) {
+ case FIONREAD: /* SIOCINQ */
+ if (so->s == NULL)
+ *(int *)arg = 0;
+ else if (IS_TCP(so)) {
+ left = tle_tcp_stream_inq(so->s);
+ if (so->rx_left)
+ left += rte_pktmbuf_pkt_len(so->rx_left);
+ *(int *)arg = left;
+ } else {
+ if (so->rx_left)
+ *(int *)arg = rte_pktmbuf_pkt_len(so->rx_left);
+ else {
+ if (tle_udp_stream_recv(so->s, &m , 1) == 0)
+ *(int *)arg = 0;
+ else {
+ *(int *)arg = rte_pktmbuf_pkt_len(m);
+ so->rx_left = m;
+ }
+ }
+ }
+ rc = 0;
+ break;
+ case FIONBIO:
+ if (*(int *)arg)
+ so->nonblock = 1;
+ else
+ so->nonblock = 0;
+ rc = 0;
+ break;
+ case SIOCGSTAMP:
+ if (so->s->timestamp == 0) {
+ errno = ENOENT;
+ rc = -1;
+ } else {
+ ((struct timeval*)arg)->tv_sec = so->s->timestamp >> 20;
+ ((struct timeval*)arg)->tv_usec = so->s->timestamp & 0xFFFFFUL;
+ rc = 0;
+ }
+ break;
+ default:
+ errno = EOPNOTSUPP;
+ rc = -1;
+ GLUE_LOG(WARNING, "ioctl(%d) with request = %ld", fd, request);
+ }
+
+ return rc;
+}
+
+/*
+ * shutdown(2) interposer: records the shutdown direction in the sock
+ * flags and delegates to the protocol's shutdown op.
+ * NOTE(review): the flags are updated even when the op is missing and
+ * EOPNOTSUPP is returned — confirm that is intended.
+ */
+int PRE(shutdown)(int sockfd, int how)
+{
+	struct sock *so;
+
+	if (is_kernel_fd(sockfd))
+		return k_shutdown(sockfd, how);
+
+	so = fd2sock(sockfd);
+	switch (how) {
+	case SHUT_RD:
+		so->shutdown |= RECV_SHUTDOWN;
+		break;
+	case SHUT_WR:
+		so->shutdown |= SEND_SHUTDOWN;
+		break;
+	case SHUT_RDWR:
+		so->shutdown = RECV_SHUTDOWN | SEND_SHUTDOWN;
+		break;
+	}
+	if (OPS(so)->shutdown)
+		return OPS(so)->shutdown(so, how);
+
+	errno = EOPNOTSUPP;
+	return -1;
+}
+
+/*
+ * Common implementation for getsockname (peer == 0) and getpeername
+ * (peer == 1) on user-space fds.
+ *
+ * Before any bind/connect (so->s == NULL): getsockname reports a zeroed
+ * address of the socket's family; getpeername fails with ENOTCONN.
+ */
+static inline int
+getname(int sockfd, struct sockaddr *uaddr, socklen_t *addrlen, int peer)
+{
+	struct sock *so;
+	size_t socklen;
+	int rc;
+
+	so = fd2sock(sockfd);
+
+	/* This is ugly, but netperf ask for local addr (before any
+	 * connect or bind) to check family.
+	 *
+	 * To formally fix this, we shall bind a local address in advance
+	 */
+	socklen = get_sockaddr_len(so->domain);
+	/* fixme: It is not conform to linux standard, fix it later. */
+	if (*addrlen < socklen) {
+		errno = EINVAL;
+		return -1;
+	}
+	*addrlen = socklen;
+
+	if (so->s == NULL) {
+		if (peer) {
+			errno = ENOTCONN;
+			return -1;
+		} else {
+			memset(uaddr, 0, socklen);
+			uaddr->sa_family = so->domain;
+			return 0;
+		}
+	}
+
+	if (OPS(so)->getname) {
+		rc = OPS(so)->getname(so, uaddr, peer);
+		if (rc < 0)
+			return rc;
+		/* an all-zero peer address means "not connected" */
+		if (peer) {
+			if ((uaddr->sa_family == AF_INET &&
+			     ((struct sockaddr_in*)uaddr)->sin_addr.s_addr == 0) ||
+			    (uaddr->sa_family == AF_INET6 &&
+			     IN6_IS_ADDR_UNSPECIFIED(&((struct sockaddr_in6*)
+						       uaddr)->sin6_addr))) {
+				errno = ENOTCONN;
+				return -1;
+			}
+		}
+		/* v4-mapped v6 addresses are reported as the v6 form */
+		if (uaddr->sa_family == AF_INET && so->domain == AF_INET6)
+			trans_4mapped6_addr(uaddr);
+		return rc;
+	}
+
+	errno = EOPNOTSUPP;
+	return -1;
+}
+
+/* getsockname(2) interposer: local-address flavour of getname(). */
+int PRE(getsockname)(int sockfd, struct sockaddr *addr, socklen_t *addrlen)
+{
+	return is_kernel_fd(sockfd) ?
+		k_getsockname(sockfd, addr, addrlen) :
+		getname(sockfd, addr, addrlen, 0);
+}
+
+/* getpeername(2) interposer: peer-address flavour of getname(). */
+int PRE(getpeername)(int sockfd, struct sockaddr *addr, socklen_t *addrlen)
+{
+	return is_kernel_fd(sockfd) ?
+		k_getpeername(sockfd, addr, addrlen) :
+		getname(sockfd, addr, addrlen, 1);
+}
+
+/*
+ * close(2) interposer: tears down the stream (or the shadow epoll fd for
+ * an epoll-type sock), idles the events, wipes the sock and recycles the
+ * glue fd.
+ */
+int PRE(close)(int fd)
+{
+	struct sock *so;
+
+	if (is_kernel_fd(fd))
+		return k_close(fd);
+
+	GLUE_DEBUG("close fd = %d", fd);
+
+	so = fd2sock(fd);
+	if (unlikely(so->valid == 0)) {
+		errno = EBADF;
+		return -1;
+	} else if (unlikely(so->epoll)) {
+		/* epoll-type fd: close the companion kernel epoll fd */
+		k_close(so->shadow_efd);
+		glue_ctx_free(CTX(so));
+	} else if (so->s) {
+		if (OPS(so)->close)
+			OPS(so)->close(so->s);
+
+		/* flush any pending FIN/RST */
+		if (IS_TCP(so))
+			be_tx_with_lock(CTX(so));
+
+		if (so->rx_left)
+			rte_pktmbuf_free(so->rx_left);
+	}
+
+	tle_event_idle_err(&so->erev);
+	tle_event_idle(&so->rxev);
+	tle_event_idle(&so->txev);
+
+	/* wipe everything except the leading int — assumes `fd` is the
+	 * first member of struct sock so the fd number survives reuse */
+	memset(((int*)so) + 1, 0, sizeof(*so) - sizeof(int));
+	put_free_fd(fd);
+	return 0;
+}
diff --git a/lib/libtle_glue/sym.c b/lib/libtle_glue/sym.c
new file mode 100644
index 0000000..39b1707
--- /dev/null
+++ b/lib/libtle_glue/sym.c
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifndef __USE_GNU
+#define __USE_GNU
+#endif
+#include <dlfcn.h>
+
+#include <rte_debug.h>
+
+#include "sym.h"
+#include "log.h"
+
+#ifdef PRELOAD
+/* Resolved libc entry points, filled in by symbol_init() via dlsym().
+ * BUGFIX: the duplicate definition of k_epoll_create1 was removed. */
+int (*k_epoll_create)(int size);
+int (*k_epoll_create1)(int flags);
+int (*k_epoll_ctl)(int epfd, int op, int fd, struct epoll_event *event);
+int (*k_epoll_wait)(int epfd, struct epoll_event *events, int maxevents, int timeout);
+int (*k_epoll_pwait)(int epfd, struct epoll_event *events, int maxevents, int timeout, const sigset_t *sigmask);
+int (*k_poll)(struct pollfd *fds, nfds_t nfds, int timeout);
+int (*k_select)(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout);
+int (*k_pselect)(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, const struct timespec *timeout, const sigset_t *sigmask);
+int (*k_socket)(int domain, int type, int protocol);
+int (*k_listen)(int sockfd, int backlog);
+int (*k_bind)(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
+int (*k_accept)(int sockfd, struct sockaddr *addr, socklen_t *addrlen);
+int (*k_accept4)(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags);
+int (*k_connect)(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
+int (*k_getsockopt)(int sockfd, int level, int optname, void *optval, socklen_t *optlen);
+int (*k_setsockopt)(int sockfd, int level, int optname, const void *optval, socklen_t optlen);
+int (*k_fcntl)(int fd, int cmd, ... /* arg */ );
+int (*k_ioctl)(int d, int request, ...);
+int (*k_shutdown)(int sockfd, int how);
+int (*k_close)(int fd);
+ssize_t (*k_recv)(int sockfd, void *buf, size_t len, int flags);
+ssize_t (*k_recvfrom)(int sockfd, void *buf, size_t len, int flags, struct sockaddr *src_addr, socklen_t *addrlen);
+ssize_t (*k_recvmsg)(int sockfd, struct msghdr *msg, int flags);
+ssize_t (*k_read)(int fd, void *buf, size_t count);
+ssize_t (*k_readv)(int fd, const struct iovec *iov, int iovcnt);
+ssize_t (*k_send)(int sockfd, const void *buf, size_t len, int flags);
+ssize_t (*k_sendto)(int sockfd, const void *buf, size_t len, int flags, const struct sockaddr *dest_addr, socklen_t addrlen);
+ssize_t (*k_sendmsg)(int sockfd, const struct msghdr *msg, int flags);
+ssize_t (*k_write)(int fd, const void *buf, size_t count);
+ssize_t (*k_writev)(int fd, const struct iovec *iov, int iovcnt);
+ssize_t (*k_splice)(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags);
+ssize_t (*k_sendfile)(int out_fd, int in_fd, off_t *offset, size_t count);
+int (*k_getsockname)(int sockfd, struct sockaddr *addr, socklen_t *addrlen);
+int (*k_getpeername)(int sockfd, struct sockaddr *addr, socklen_t *addrlen);
+
+/* Resolve @func from @handle into k_##func, panicking on failure.
+ * BUGFIX: added the missing space in the panic message (it used to read
+ * e.g. "socketis not init"). */
+#define INIT_FUNC(func, handle) do { \
+	k_##func = dlsym(handle, #func); \
+	if ((error = dlerror()) != NULL) { \
+		rte_panic(#func " is not init"); \
+	} \
+	RTE_ASSERT(k_##func); \
+} while (0)
+
+#endif
+
+/*
+ * Resolve every interposed libc symbol into its k_* pointer (PRELOAD
+ * builds only; a no-op otherwise).  Panics if any lookup fails.
+ */
+void
+symbol_init(void)
+{
+#ifdef PRELOAD
+	void *handle;
+	char *error;
+
+	TRACE("in %s", __func__);
+
+	handle = dlopen("libc.so.6", RTLD_NOW);
+	error = dlerror();
+	if (!handle) {
+		fprintf(stderr, "%s\n", error);
+		exit(EXIT_FAILURE);
+	}
+
+	INIT_FUNC(epoll_create, handle);
+	/* BUGFIX: epoll_create1 was resolved twice; once is enough */
+	INIT_FUNC(epoll_create1, handle);
+	INIT_FUNC(epoll_ctl, handle);
+	INIT_FUNC(epoll_wait, handle);
+	INIT_FUNC(epoll_pwait, handle);
+	INIT_FUNC(socket, handle);
+	INIT_FUNC(listen, handle);
+	INIT_FUNC(bind, handle);
+	INIT_FUNC(accept, handle);
+	INIT_FUNC(accept4, handle);
+	INIT_FUNC(connect, handle);
+	INIT_FUNC(getsockopt, handle);
+	INIT_FUNC(setsockopt, handle);
+	INIT_FUNC(fcntl, handle);
+	INIT_FUNC(ioctl, handle);
+	INIT_FUNC(shutdown, handle);
+	INIT_FUNC(close, handle);
+	INIT_FUNC(recv, handle);
+	INIT_FUNC(recvfrom, handle);
+	INIT_FUNC(recvmsg, handle);
+	INIT_FUNC(read, handle);
+	INIT_FUNC(readv, handle);
+	INIT_FUNC(send, handle);
+	INIT_FUNC(sendto, handle);
+	INIT_FUNC(sendmsg, handle);
+	INIT_FUNC(write, handle);
+	INIT_FUNC(writev, handle);
+	INIT_FUNC(splice, handle);
+	INIT_FUNC(sendfile, handle);
+	INIT_FUNC(poll, handle);
+	INIT_FUNC(getsockname, handle);
+	INIT_FUNC(getpeername, handle);
+	INIT_FUNC(select, handle);
+	INIT_FUNC(pselect, handle);
+
+	/* libc is never actually unloaded, so the resolved pointers
+	 * stay valid after this dlclose */
+	dlclose(handle);
+#endif
+}
diff --git a/lib/libtle_glue/sym.h b/lib/libtle_glue/sym.h
new file mode 100644
index 0000000..b5a333d
--- /dev/null
+++ b/lib/libtle_glue/sym.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _TLE_KSYM_H_
+#define _TLE_KSYM_H_
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <sys/socket.h>
+
+#include <sys/epoll.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <poll.h>
+#include <sys/uio.h>
+#include <sys/sendfile.h>
+#include <sys/select.h>
+#include <sys/time.h>
+
+#include "tle_glue.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void symbol_init(void);
+
+#ifdef PRELOAD
+int (*k_epoll_create)(int size);
+int (*k_epoll_create1)(int flags);
+int (*k_epoll_ctl)(int epfd, int op, int fd, struct epoll_event *event);
+int (*k_epoll_wait)(int epfd, struct epoll_event *events, int maxevents, int timeout);
+int (*k_epoll_pwait)(int epfd, struct epoll_event *events, int maxevents, int timeout, const sigset_t *sigmask);
+int (*k_poll)(struct pollfd *fds, nfds_t nfds, int timeout);
+int (*k_select)(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout);
+int (*k_pselect)(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, const struct timespec *timeout, const sigset_t *sigmask);
+
+int (*k_socket)(int domain, int type, int protocol);
+int (*k_listen)(int sockfd, int backlog);
+int (*k_bind)(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
+int (*k_accept)(int sockfd, struct sockaddr *addr, socklen_t *addrlen);
+int (*k_accept4)(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags);
+int (*k_connect)(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
+int (*k_getsockopt)(int sockfd, int level, int optname, void *optval, socklen_t *optlen);
+int (*k_setsockopt)(int sockfd, int level, int optname, const void *optval, socklen_t optlen);
+int (*k_fcntl)(int fd, int cmd, ... /* arg */ );
+int (*k_ioctl)(int d, int request, ...);
+int (*k_shutdown)(int sockfd, int how);
+int (*k_close)(int fd);
+ssize_t (*k_recv)(int sockfd, void *buf, size_t len, int flags);
+ssize_t (*k_recvfrom)(int sockfd, void *buf, size_t len, int flags, struct sockaddr *src_addr, socklen_t *addrlen);
+ssize_t (*k_recvmsg)(int sockfd, struct msghdr *msg, int flags);
+ssize_t (*k_read)(int fd, void *buf, size_t count);
+ssize_t (*k_readv)(int fd, const struct iovec *iov, int iovcnt);
+ssize_t (*k_send)(int sockfd, const void *buf, size_t len, int flags);
+ssize_t (*k_sendto)(int sockfd, const void *buf, size_t len, int flags, const struct sockaddr *dest_addr, socklen_t addrlen);
+ssize_t (*k_sendmsg)(int sockfd, const struct msghdr *msg, int flags);
+ssize_t (*k_write)(int fd, const void *buf, size_t count);
+ssize_t (*k_writev)(int fd, const struct iovec *iov, int iovcnt);
+ssize_t (*k_splice)(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags);
+ssize_t (*k_sendfile)(int out_fd, int in_fd, off_t *offset, size_t count);
+int (*k_getsockname)(int sockfd, struct sockaddr *addr, socklen_t *addrlen);
+int (*k_getpeername)(int sockfd, struct sockaddr *addr, socklen_t *addrlen);
+#else
+#define k_epoll_create epoll_create
+#define k_epoll_create1 epoll_create1
+#define k_epoll_ctl epoll_ctl
+#define k_epoll_wait epoll_wait
+#define k_epoll_pwait epoll_pwait
+#define k_poll poll
+#define k_select select
+#define k_pselect pselect
+#define k_socket socket
+#define k_listen listen
+#define k_bind bind
+#define k_accept accept
+#define k_accept4 accept4
+#define k_connect connect
+#define k_getsockopt getsockopt
+#define k_setsockopt setsockopt
+#define k_fcntl fcntl
+#define k_ioctl ioctl
+#define k_shutdown shutdown
+#define k_close close
+#define k_recv recv
+#define k_recvfrom recvfrom
+#define k_recvmsg recvmsg
+#define k_read read
+#define k_readv readv
+#define k_send send
+#define k_sendto sendto
+#define k_sendmsg sendmsg
+#define k_write write
+#define k_writev writev
+#define k_splice splice
+#define k_sendfile sendfile
+#define k_getsockname getsockname
+#define k_getpeername getpeername
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _TLE_KSYM_H_ */
diff --git a/lib/libtle_glue/tcp.c b/lib/libtle_glue/tcp.c
new file mode 100644
index 0000000..e5186c0
--- /dev/null
+++ b/lib/libtle_glue/tcp.c
@@ -0,0 +1,558 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdarg.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+
+#include <tle_tcp.h>
+
+#include "sym.h"
+#include "fd.h"
+#include "log.h"
+#include "util.h"
+#include "internal.h"
+#include "sock.h"
+
+#define MAX_TCP_KEEPIDLE 32767
+#define MAX_TCP_KEEPINTVL 32767
+#define MAX_TCP_KEEPCNT 127
+
+/*
+ * Log a warning for an option we accept but do not actually implement,
+ * so callers that probe optional features keep working.
+ */
+static inline void
+foo_support(const char *msg)
+{
+	GLUE_LOG(WARNING, "%s, return ok without really supporting it", msg);
+}
+
+/*
+ * setsockopt() handler for level SOL_TCP.
+ *
+ * Each supported option is cached in sk->option and, when the TLE
+ * stream already exists (sk->s != NULL), mirrored into the stream so
+ * settings made before stream creation are not lost.
+ * Returns 0 on success, -1 with errno set otherwise.
+ */
+static int
+tcp_setsockopt(struct sock *sk, int optname,
+	       const void *optval, socklen_t optlen)
+{
+	int val;
+
+	val = 0; /* just to make compiler happy */
+	if (optlen == sizeof(val))
+		val = *(const int *)optval;
+
+	/* man tcp(7) or see /usr/include/netinet/tcp.h */
+	switch (optname) {
+	case TCP_NODELAY: /* antonym: TCP_CORK */
+		if (val == 0)
+			sk->option.tcpnodelay = 0;
+		else
+			sk->option.tcpnodelay = 1;
+		if (sk->s != NULL)
+			sk->s->option.tcpnodelay = sk->option.tcpnodelay;
+		return 0;
+	case TCP_CORK:
+		if (val == 0)
+			sk->option.tcpcork = 0;
+		else
+			sk->option.tcpcork = 1;
+		if (sk->s != NULL)
+			sk->s->option.tcpcork = sk->option.tcpcork;
+		return 0;
+	case TCP_KEEPIDLE:
+		if (val <= 0 || val > MAX_TCP_KEEPIDLE) {
+			errno = EINVAL;
+			return -1;
+		}
+		sk->option.keepidle = val;
+		if (sk->s != NULL) {
+			sk->s->option.keepidle = sk->option.keepidle;
+			/* re-arm the keepalive timer with the new idle time */
+			tle_tcp_stream_set_keepalive(sk->s);
+		}
+		return 0;
+	case TCP_KEEPINTVL:
+		if (val <= 0 || val > MAX_TCP_KEEPINTVL) {
+			errno = EINVAL;
+			return -1;
+		}
+		sk->option.keepintvl = val;
+		if (sk->s != NULL) {
+			sk->s->option.keepintvl = sk->option.keepintvl;
+			tle_tcp_stream_set_keepalive(sk->s);
+		}
+		return 0;
+	case TCP_KEEPCNT:
+		if (val <= 0 || val > MAX_TCP_KEEPCNT) {
+			errno = EINVAL;
+			return -1;
+		}
+		sk->option.keepcnt = val;
+		if (sk->s != NULL)
+			sk->s->option.keepcnt = sk->option.keepcnt;
+		return 0;
+	case TCP_USER_TIMEOUT:
+		foo_support("set TCP_USER_TIMEOUT");
+		return 0;
+	case TCP_DEFER_ACCEPT:
+		/* only the "disabled" setting is supported */
+		if (val == 0)
+			return 0;
+		break;
+	case TCP_FASTOPEN:
+	case TCP_FASTOPEN_CONNECT:
+		/* only the "disabled" setting is supported */
+		if (val == 0)
+			return 0;
+		break;
+	case TCP_QUICKACK:
+		/* Based on below info, it's safe to just return 0:
+		 * "This flag is not permanent, it only enables a
+		 * switch to or from quickack mode. Subsequent
+		 * operation of the TCP protocol will once again ..."
+		 */
+		if (val == 0)
+			sk->option.tcpquickack = 0;
+		else
+			sk->option.tcpquickack = 8;
+		if (sk->s != NULL)
+			sk->s->option.tcpquickack = sk->option.tcpquickack;
+		return 0;
+	case TCP_CONGESTION:
+		/* only support NewReno; but we return success for
+		 * any kind of setting.
+		 */
+		foo_support("set TCP_CONGESTION");
+		return 0;
+	default:
+		break;
+	}
+
+	GLUE_LOG(WARNING, "setsockopt(%d) with level = SOL_TCP, optname = %d\n",
+		 sock2fd(sk), optname);
+	errno = EOPNOTSUPP;
+	return -1;
+}
+
+/*
+ * getsockopt() handler for level SOL_TCP.
+ *
+ * Most values are served from the cached sk->option; TCP_INFO is
+ * forwarded to the TLE stream (needed by netperf).
+ * Returns 0 on success, -1 with errno set otherwise.
+ */
+static int
+tcp_getsockopt(struct sock *sk, int optname,
+	       void *optval, socklen_t *optlen)
+{
+	int rc;
+	union {
+		int val;
+		uint64_t val64;
+		struct linger ling;
+		struct timeval tm;
+	} *p = optval;
+
+	RTE_SET_USED(optlen);
+
+	/* man tcp(7) or see /usr/include/netinet/tcp.h */
+	switch (optname) {
+	case TCP_MAXSEG:
+		p->val = 64 * 1024;
+		return 0;
+	case TCP_FASTOPEN:
+	case TCP_FASTOPEN_CONNECT:
+		p->val = 0;
+		return 0;
+	case TCP_INFO:
+		/* needed by netperf */
+		rc = tle_tcp_stream_get_info(sk->s, optval, optlen);
+		if (rc < 0) {
+			errno = -rc;
+			return -1;
+		}
+		return 0;
+	case TCP_CONGESTION:
+		/* Only NewReno is supported; report it NUL-terminated.
+		 * Guard *optlen == 0: socklen_t is unsigned, so the
+		 * write at index (*optlen - 1) would otherwise be a
+		 * wildly out-of-bounds store.
+		 */
+		if (*optlen == 0) {
+			errno = EINVAL;
+			return -1;
+		}
+		strncpy(optval, "NewReno", *optlen);
+		((char *)optval)[*optlen - 1] = '\0';
+		return 0;
+	case TCP_CORK:
+		p->val = sk->option.tcpcork;
+		return 0;
+	case TCP_QUICKACK:
+		p->val = sk->option.tcpquickack != 0 ? 1 : 0;
+		return 0;
+	case TCP_NODELAY:
+		p->val = sk->option.tcpnodelay;
+		return 0;
+	case TCP_KEEPIDLE:
+		p->val = sk->option.keepidle;
+		return 0;
+	case TCP_KEEPINTVL:
+		p->val = sk->option.keepintvl;
+		return 0;
+	case TCP_KEEPCNT:
+		p->val = sk->option.keepcnt;
+		return 0;
+	default:
+		break;
+	}
+
+	GLUE_LOG(WARNING, "getsockopt(%d) with level = SOL_TCP, optname = %d",
+		 sock2fd(sk), optname);
+	errno = EOPNOTSUPP;
+	return -1;
+}
+
+/*
+ * Back getsockname() (peer == 0) and getpeername() (peer != 0).
+ * Copies the stream's local or remote address into @addr.
+ * NOTE(review): @addr is assumed large enough for sockaddr_in6 —
+ * callers must guarantee this.
+ * Returns 0 on success, -1 with errno set otherwise.
+ */
+static int
+tcp_getname(struct sock *sk, struct sockaddr *addr, int peer)
+{
+	int rc;
+	int addrlen;
+	struct tle_tcp_stream_addr a;
+
+	rc = tle_tcp_stream_get_addr(sk->s, &a);
+	if (rc) {
+		errno = -rc;
+		return -1;
+	}
+
+	if (a.local.ss_family == AF_INET)
+		addrlen = sizeof(struct sockaddr_in);
+	else
+		addrlen = sizeof(struct sockaddr_in6);
+
+	if (peer)
+		memcpy(addr, &a.remote, addrlen);
+	else
+		memcpy(addr, &a.local, addrlen);
+
+	addr->sa_family = a.local.ss_family;
+
+	return 0;
+}
+
+/*
+ * bind() handler: create the TLE stream bound to @addr.
+ * Returns 0 on success, -1 on failure (open_bind sets errno).
+ */
+static int
+tcp_bind(struct sock *sk, const struct sockaddr *addr)
+{
+	sk->s = open_bind(sk, addr, NULL);
+	if (sk->s == NULL)
+		return -1;
+	return 0;
+}
+
+/*
+ * listen() handler. Binds to an arbitrary local address first if the
+ * socket has no stream yet, then switches the stream to listen state.
+ * NOTE(review): @backlog is validated but otherwise unused here —
+ * presumably the TLE stream has its own queue sizing; confirm.
+ */
+static int
+tcp_listen(struct sock *sk, int backlog)
+{
+	int32_t rc;
+
+	if (backlog < 0) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	/*
+	 * If the socket is not yet bound, call open_bind() to assign a
+	 * random local address before listening.
+	 */
+	if (sk->s == NULL) {
+		sk->s = open_bind(sk, NULL, NULL);
+		if (sk->s == NULL)
+			return -1;
+	}
+
+	rc = tle_tcp_stream_listen(sk->s);
+	if (rc) {
+		errno = -rc;
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * connect() handler.
+ *
+ * Handles three entry states:
+ *  - stream exists and already reached ESTABLISHED: report success once;
+ *  - stream exists in CLOSED state (bound-then-connect, or a previous
+ *    connect that terminated): re-open preserving the local address;
+ *  - no stream yet: open and start the handshake.
+ * Non-blocking sockets return -1/EINPROGRESS after pushing the SYN;
+ * blocking sockets poll the backend until the tx or error event fires.
+ */
+static int
+tcp_connect(struct sock *sk, const struct sockaddr *addr)
+{
+	int rc;
+	int rx;
+	int ret;
+	struct epoll_event event;
+	struct sockaddr_storage laddr;
+	struct sockaddr_storage raddr;
+	struct sockaddr_in *addr4;
+	struct sockaddr_in6 *addr6;
+	struct sockaddr *local = NULL;
+
+	/* TODO: For multi-thread case, we shall properly manage local
+	 * L4 port so that packets coming back can be put into the same
+	 * queue pair.
+	 */
+	if (sk->s) {
+		struct tle_tcp_stream *ts = TCP_STREAM(sk->s);
+		/* case 1: bind happens before connect;
+		 * case 2: connect after a previous connect, failed
+		 * or succeeded.
+		 */
+		if (ts->tcb.err != 0) {
+			errno = ts->tcb.err;
+			return -1;
+		}
+
+		int state = ts->tcb.state;
+
+		if (state >= TCP_ST_ESTABLISHED && sk->tcp_connected == 0) {
+			sk->tcp_connected = 1;
+			return 0; /* connect succeeds */
+		}
+
+		if (state == TCP_ST_CLOSED) {
+			/* keep the bound local address across re-open */
+			if (tcp_getname(sk, (struct sockaddr *)&laddr, 0) == 0)
+				local = (struct sockaddr *)&laddr;
+			tle_tcp_stream_close(sk->s);
+			sk->s = NULL;
+			goto do_connect; /* case 1 */
+		} else if (state >= TCP_ST_SYN_SENT &&
+			   state < TCP_ST_ESTABLISHED)
+			errno = EALREADY;
+		else if (state >= TCP_ST_ESTABLISHED)
+			errno = EISCONN;
+		else
+			errno = EINVAL;
+		return -1;
+	}
+
+do_connect:
+	sk->s = open_bind(sk, local, addr);
+	if (sk->s == NULL) /* errno is set */
+		return -1;
+
+	/* Rebuild the remote address from the stream: open_bind() has
+	 * already resolved/validated it into sk->s.
+	 */
+	if (sk->domain == AF_INET) {
+		addr4 = (struct sockaddr_in*)&raddr;
+		addr4->sin_family = AF_INET;
+		addr4->sin_port = sk->s->port.src;
+		addr4->sin_addr.s_addr = sk->s->ipv4.addr.src;
+	} else {
+		addr6 = (struct sockaddr_in6*)&raddr;
+		addr6->sin6_family = AF_INET6;
+		addr6->sin6_port = sk->s->port.src;
+		rte_memcpy(&addr6->sin6_addr, &sk->s->ipv6.addr.src,
+			   sizeof(struct in6_addr));
+	}
+	rc = tle_tcp_stream_connect(sk->s, (const struct sockaddr*)&raddr);
+	if (rc < 0) {
+		errno = -rc;
+		return -1;
+	}
+
+	if (is_nonblock(sk, 0)) {
+		be_tx_with_lock(CTX(sk));
+		errno = EINPROGRESS; /* It could not be ready so fast */
+		return -1;
+	}
+
+	/* blocking: drive the backend until the handshake resolves */
+	do {
+		be_process(CTX(sk));
+
+		if (tle_event_state(&sk->txev) == TLE_SEV_UP) {
+			sk->tcp_connected = 1;
+			tle_event_down(&sk->txev);
+			ret = 0;
+			break;
+		}
+
+		if (tle_event_state(&sk->erev) == TLE_SEV_UP) {
+			tle_event_down(&sk->erev);
+			errno = ECONNREFUSED;
+			ret = -1;
+			break;
+		}
+
+		/* fix me: timeout? */
+		epoll_kernel_wait(CTX(sk), -1, &event, 1, 1, &rx);
+	} while (1);
+
+	return ret;
+}
+
+static void tcp_update_cfg(struct sock *sk);
+
+/*
+ * accept() handler. Reserves an fd/sock up front, then pulls one
+ * established stream from the listen stream, blocking on the backend
+ * if the socket is blocking. On success returns the new fd; on
+ * failure returns -1 with errno set.
+ */
+static int
+tcp_accept(struct sock *sk, struct sockaddr *addr,
+	   socklen_t *addrlen, int flags)
+{
+	int fd;
+	int rx;
+	struct sock *newsk;
+	struct tle_stream *rs;
+	struct epoll_event event;
+	struct tle_tcp_stream_addr a;
+
+	if (sk->s == NULL) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	fd = get_unused_fd();
+	if (fd < 0) {
+		errno = ENFILE;
+		return -1;
+	}
+
+	newsk = fd2sock(fd);
+again:
+	if (tle_tcp_stream_accept(sk->s, &rs, 1) == 0) {
+		if (rte_errno != EAGAIN) {
+			/* fixed: release the reserved fd on hard error,
+			 * mirroring the non-blocking EAGAIN path below;
+			 * previously the fd leaked here.
+			 */
+			newsk->valid = 0;
+			put_free_fd(fd);
+			errno = rte_errno;
+			return -1;
+		}
+
+		if (is_nonblock(sk, flags)) {
+			newsk->valid = 0;
+			put_free_fd(fd);
+			errno = EAGAIN;
+			return -1;
+		}
+
+		epoll_kernel_wait(CTX(sk), -1, &event, 1, 1, &rx);
+		be_process(CTX(sk));
+		goto again;
+	}
+
+	/* initialize the accepted socket with default option values */
+	newsk->s = rs;
+	newsk->cid = sk->cid;
+	newsk->domain = sk->domain;
+	newsk->proto = sk->proto;
+	newsk->option.raw = 0;
+	newsk->option.tcpquickack = 1;
+	newsk->option.mulloop = 1;
+	newsk->option.multtl = 1;
+	newsk->option.keepidle = 2 * 60 * 60;
+	newsk->option.keepintvl = 75;
+	newsk->option.keepcnt = 9;
+	newsk->s->option.raw = newsk->option.raw;
+	sock_alloc_events(newsk);
+	tcp_update_cfg(newsk);
+
+	if (addr) {
+		/* We assume this function never fails */
+		tle_tcp_stream_get_addr(rs, &a);
+
+		*addrlen = sizeof(struct sockaddr_in);
+		memcpy(addr, &a.remote, *addrlen);
+	}
+
+	GLUE_DEBUG("accept fd = %d", fd);
+	return fd;
+}
+
+/*
+ * Hand @num mbufs to the TLE TCP stream. @dst_addr is ignored
+ * (connection-oriented). Returns the number of mbufs accepted;
+ * 0 with errno set when nothing could be queued.
+ */
+static ssize_t
+tcp_send(struct sock *sk, struct rte_mbuf *pkt[],
+	 uint16_t num, const struct sockaddr *dst_addr)
+{
+	uint16_t rc;
+	RTE_SET_USED(dst_addr);
+
+	if (sk->s == NULL) {
+		errno = EPIPE;
+		return 0;
+	}
+
+	rc = tle_tcp_stream_send(sk->s, pkt, num);
+	if (rc == 0)
+		errno = rte_errno;
+	return rc;
+}
+
+/*
+ * Receive up to @num mbufs from the TLE TCP stream. @addr is ignored
+ * (connection-oriented). Returns the mbuf count; 0 with errno set
+ * when nothing is available.
+ */
+static ssize_t
+tcp_recv(struct tle_stream *s, struct rte_mbuf *pkt[],
+	 uint16_t num, struct sockaddr *addr)
+{
+	uint16_t rc;
+
+	RTE_SET_USED(addr);
+
+	/* optimize me: merge multiple mbufs into one */
+	rc = tle_tcp_stream_recv(s, pkt, num);
+	if (rc == 0)
+		errno = rte_errno;
+
+	return rc;
+}
+
+/*
+ * readv()/recvmsg() path: delegate scatter-read to the TLE stream.
+ * Returns bytes read, or negative with errno set from rte_errno.
+ */
+static ssize_t
+tcp_readv(struct tle_stream *ts, struct msghdr *msg, int flags __rte_unused)
+{
+	ssize_t rc;
+
+	rc = tle_tcp_stream_recvmsg(ts, msg);
+	if (rc < 0)
+		errno = rte_errno;
+	return rc;
+}
+
+/*
+ * writev() path: delegate gather-write to the TLE stream, allocating
+ * mbufs from the socket-0 mempool. @dst_addr is ignored.
+ * Returns bytes written, or negative with errno set from rte_errno.
+ */
+static ssize_t
+tcp_writev(struct sock *sk, const struct iovec *iov,
+	   int iovcnt, const struct sockaddr *dst_addr)
+{
+	ssize_t rc;
+	struct rte_mempool *mp = get_mempool_by_socket(0); /* fix me */
+
+	RTE_SET_USED(dst_addr);
+
+	if (sk->s == NULL) {
+		errno = EPIPE;
+		return -1;
+	}
+
+	rc = tle_tcp_stream_writev(sk->s, mp, iov, iovcnt);
+	if (rc < 0)
+		errno = rte_errno;
+	return rc;
+}
+
+/*
+ * shutdown() handler. SHUT_RD is a no-op (matching Linux TCP
+ * behavior); write-side shutdown is forwarded to the stream and the
+ * FIN is flushed out immediately.
+ */
+static int
+tcp_shutdown(struct sock *sk, int how)
+{
+	int ret;
+
+	/* Refer to linux/net/ipv4/tcp.c:tcp_shutdown() */
+	if (how == SHUT_RD)
+		return 0;
+
+	ret = tle_tcp_stream_shutdown(sk->s, how);
+	if (ret < 0)
+		errno = rte_errno;
+	else
+		be_tx_with_lock(CTX(sk)); /* Make sure fin is sent */
+	return ret;
+
+}
+
+/*
+ * Wire the socket's rx/tx/error events into the TLE stream so backend
+ * processing can wake the fd.
+ */
+static void
+tcp_update_cfg(struct sock *sk)
+{
+	struct tle_tcp_stream_cfg prm = {0};
+
+	prm.recv_ev = &sk->rxev;
+	prm.send_ev = &sk->txev;
+	prm.err_ev = &sk->erev;
+	tle_tcp_stream_update_cfg(&sk->s, &prm, 1);
+}
+
+/* Protocol operations table for TCP sockets in the glue layer. */
+struct proto tcp_prot = {
+	.name = "TCP",
+	.setsockopt = tcp_setsockopt,
+	.getsockopt = tcp_getsockopt,
+	.getname = tcp_getname,
+	.bind = tcp_bind,
+	.listen = tcp_listen,
+	.connect = tcp_connect,
+	.accept = tcp_accept,
+	.recv = tcp_recv,
+	.send = tcp_send,
+	.readv = tcp_readv,
+	.writev = tcp_writev,
+	.shutdown = tcp_shutdown,
+	.close = tle_tcp_stream_close,
+	.update_cfg = tcp_update_cfg,
+};
diff --git a/lib/libtle_glue/tle_glue.h b/lib/libtle_glue/tle_glue.h
new file mode 100644
index 0000000..38357e4
--- /dev/null
+++ b/lib/libtle_glue/tle_glue.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _TLE_GLUE_H_
+#define _TLE_GLUE_H_
+
+#include <sys/types.h>
+#include <sys/epoll.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <signal.h>
+#include <poll.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef PRELOAD
+
+#define PRE(name) name
+
+#else
+
+#define PRE(name) tle_ ## name
+
+#endif
+
+void glue_init1(int argc, char **argv);
+
+/* epoll */
+int PRE(epoll_create)(int size);
+int PRE(epoll_create1)(int flags);
+int PRE(epoll_ctl)(int epfd, int op, int fd, struct epoll_event *event);
+int PRE(epoll_wait)(int epfd, struct epoll_event *events, int maxevents, int timeout);
+int PRE(epoll_pwait)(int epfd, struct epoll_event *events,
+ int maxevents, int timeout, const sigset_t *sigmask);
+
+/* for setup, settings, and destroy */
+int PRE(socket)(int domain, int type, int protocol);
+int PRE(listen)(int sockfd, int backlog);
+int PRE(bind)(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
+int PRE(accept)(int sockfd, struct sockaddr *addr, socklen_t *addrlen);
+int PRE(accept4)(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags);
+int PRE(connect)(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
+int PRE(getsockopt)(int sockfd, int level, int optname,
+ void *optval, socklen_t *optlen);
+int PRE(setsockopt)(int sockfd, int level, int optname,
+ const void *optval, socklen_t optlen);
+int PRE(getsockname)(int sockfd, struct sockaddr *addr, socklen_t *addrlen);
+int PRE(getpeername)(int sockfd, struct sockaddr *addr, socklen_t *addrlen);
+int PRE(fcntl)(int fd, int cmd, ... /* arg */ );
+int PRE(ioctl)(int d, unsigned long int request, ...);
+int PRE(shutdown)(int sockfd, int how);
+int PRE(close)(int fd);
+
+/* for recv */
+ssize_t PRE(recv)(int sockfd, void *buf, size_t len, int flags);
+ssize_t PRE(recvfrom)(int sockfd, void *buf, size_t len, int flags,
+ struct sockaddr *src_addr, socklen_t *addrlen);
+ssize_t PRE(recvmsg)(int sockfd, struct msghdr *msg, int flags);
+ssize_t PRE(read)(int fd, void *buf, size_t count);
+ssize_t PRE(readv)(int fd, const struct iovec *iov, int iovcnt);
+
+/* for send */
+ssize_t PRE(send)(int sockfd, const void *buf, size_t len, int flags);
+ssize_t PRE(sendto)(int sockfd, const void *buf, size_t len, int flags,
+ const struct sockaddr *dest_addr, socklen_t addrlen);
+ssize_t PRE(sendmsg)(int sockfd, const struct msghdr *msg, int flags);
+ssize_t PRE(write)(int fd, const void *buf, size_t count);
+ssize_t PRE(writev)(int fd, const struct iovec *iov, int iovcnt);
+
+/* advanced functions */
+ssize_t PRE(splice)(int fd_in, loff_t *off_in, int fd_out,
+ loff_t *off_out, size_t len, unsigned int flags);
+ssize_t PRE(sendfile)(int out_fd, int in_fd, off_t *offset, size_t count);
+
+/* poll */
+int PRE(poll)(struct pollfd *fds, nfds_t nfds, int timeout);
+int PRE(ppoll)(struct pollfd *fds, nfds_t nfds,
+ const struct timespec *tmo_p, const sigset_t *sigmask);
+
+/* select */
+int PRE(select)(int nfds, fd_set *readfds, fd_set *writefds,
+ fd_set *exceptfds, struct timeval *timeout);
+int PRE(pselect)(int nfds, fd_set *readfds, fd_set *writefds,
+ fd_set *exceptfds, const struct timespec *timeout,
+ const sigset_t *sigmask);
+
+/* non-posix APIs */
+int fd_ready(int fd, int events);
+void v_get_stats_snmp(unsigned long mibs[]);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _TLE_GLUE_H_ */
diff --git a/lib/libtle_glue/udp.c b/lib/libtle_glue/udp.c
new file mode 100644
index 0000000..9f199bc
--- /dev/null
+++ b/lib/libtle_glue/udp.c
@@ -0,0 +1,419 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdarg.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+
+#include <rte_ethdev.h>
+#include <tle_udp.h>
+
+#include "sym.h"
+#include "fd.h"
+#include "log.h"
+#include "util.h"
+#include "internal.h"
+#include "sock.h"
+
+/*
+ * setsockopt() handler for UDP: accepts (and silently ignores) every
+ * option so option-probing applications keep working.
+ */
+static int
+udp_setsockopt(__rte_unused struct sock *sk, __rte_unused int optname,
+	       __rte_unused const void *optval, __rte_unused socklen_t optlen)
+{
+	return 0;
+}
+
+/*
+ * getsockopt() handler for UDP: reports success without filling
+ * @optval. NOTE(review): callers receive an untouched buffer —
+ * confirm no caller relies on the returned value.
+ */
+static int
+udp_getsockopt(__rte_unused struct sock *sk, __rte_unused int optname,
+	       __rte_unused void *optval, __rte_unused socklen_t *optlen)
+{
+	return 0;
+}
+
+/*
+ * Back getsockname() (peer == 0) and getpeername() (peer != 0) for
+ * UDP by reading the stream parameters.
+ * Returns 0 on success, -1 with errno set otherwise.
+ */
+static int
+udp_getname(struct sock *sk, struct sockaddr *addr, int peer)
+{
+	struct tle_udp_stream_param p;
+	size_t addrlen;
+	int rc;
+
+	rc = tle_udp_stream_get_param(sk->s, &p);
+	if (rc) {
+		errno = -rc;
+		return -1;
+	}
+
+	addrlen = get_sockaddr_len(sk->domain);
+	if (peer)
+		memcpy(addr, &p.remote_addr, addrlen);
+	else
+		memcpy(addr, &p.local_addr, addrlen);
+	addr->sa_family = p.local_addr.ss_family;
+	return 0;
+}
+
+/*
+ * bind() handler for UDP. Rejects double-bind (EINVAL). Records
+ * whether the user bound to a wildcard address (ubindany) so a later
+ * AF_UNSPEC connect can restore the wildcard binding.
+ */
+static int
+udp_bind(struct sock *sk, const struct sockaddr *addr)
+{
+	if (sk->ubind) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	sk->s = open_bind(sk, addr, NULL);
+	if (sk->s != NULL) {
+		sk->ubind = 1;
+		if (is_any_addr(addr))
+			sk->ubindany = 1;
+		return 0;
+	}
+
+	return -1;
+}
+
+/*
+ * connect() handler for UDP: (re)associate the stream with @addr,
+ * preserving an explicit user bind. Per Linux semantics, connecting
+ * with sa_family == AF_UNSPEC dissolves the association.
+ * Returns 0 on success, -1 on failure (open_bind sets errno).
+ */
+static int
+udp_connect(struct sock *sk, const struct sockaddr *addr)
+{
+	struct sockaddr_storage laddr;
+
+	/* According to linux manual, connectionless sockets may dissolve the
+	 * association by connecting to an address with the sa_family member of
+	 * sockaddr set to AF_UNSPEC (supported on Linux since kernel 2.2).
+	 */
+	if (sk->ubind) {
+		if (udp_getname(sk, (struct sockaddr *)&laddr, 0))
+			return -1;
+		if (addr->sa_family == AF_UNSPEC) {
+			addr = NULL;
+			if (sk->ubindany)
+				set_any_addr((struct sockaddr *)&laddr);
+		}
+		/* re-open keeping the user-chosen local address */
+		sk->s = open_bind(sk, (const struct sockaddr *)&laddr, addr);
+	} else {
+		if (addr->sa_family == AF_UNSPEC) {
+			tle_udp_stream_close(sk->s);
+			sk->s = NULL;
+			return 0;
+		}
+		sk->s = open_bind(sk, NULL, addr);
+	}
+
+	if (sk->s)
+		return 0;
+
+	return -1;
+}
+
+/*
+ * Normalize the destination address before a UDP send:
+ *  - rewrite an IPv4-mapped IPv6 destination into a plain IPv4 one
+ *    (copied into @addr, *p_dst_addr re-pointed at it);
+ *  - lazily open/bind the stream if none exists yet (EDESTADDRREQ if
+ *    no destination either);
+ *  - reject family mismatches, allowing v4-over-v6-socket only when
+ *    the stream has no v6 destination yet.
+ * Returns 0 on success, -1 with errno set otherwise.
+ */
+static int
+udp_addr_prepare(struct sock *sk, const struct sockaddr **p_dst_addr,
+		 struct sockaddr_storage *addr)
+{
+	const struct sockaddr *dst_addr = *p_dst_addr;
+
+	if (dst_addr != NULL &&
+	    dst_addr->sa_family == AF_INET6 &&
+	    IN6_IS_ADDR_V4MAPPED(&((const struct sockaddr_in6 *)dst_addr)->sin6_addr)) {
+		rte_memcpy(addr, dst_addr, sizeof(struct sockaddr_in6));
+		dst_addr = (const struct sockaddr*)(addr);
+		*p_dst_addr = dst_addr;
+		retrans_4mapped6_addr((struct sockaddr_storage*)(addr));
+	}
+
+	if (sk->s == NULL) {
+		if (dst_addr == NULL) {
+			errno = EDESTADDRREQ;
+			return -1;
+		}
+
+		sk->s = open_bind(sk, NULL, dst_addr);
+		if (sk->s == NULL) /* errno is set */
+			return -1;
+	} else if (dst_addr != NULL) {
+		if (dst_addr->sa_family == AF_INET6 && sk->domain == AF_INET) {
+			errno = EINVAL;
+			return -1;
+		}
+		if (dst_addr->sa_family == AF_INET && sk->domain == AF_INET6) {
+			if (IN6_IS_ADDR_UNSPECIFIED(&sk->s->ipv6.addr.dst)) {
+				/* v6 socket with no v6 peer: fall back to v4 */
+				sk->s->type = TLE_V4;
+				sk->s->ipv4.addr.dst = 0;
+			} else {
+				errno = ENETUNREACH;
+				return -1;
+			}
+		}
+	}
+
+	return 0;
+}
+
+/* Extract the sender's address/port from a received UDP mbuf into
+ * @addr. The IP version is decided from the IP header's version
+ * nibble, not from @family (which is unused). Relies on m->l3_len
+ * and m->l4_len having been set during packet parsing.
+ */
+static inline void
+udp_pkt_addr(const struct rte_mbuf *m, struct sockaddr *addr,
+	     __rte_unused uint16_t family)
+{
+	const struct ipv4_hdr *ip4h;
+	const struct ipv6_hdr *ip6h;
+	const struct udp_hdr *udph;
+	struct sockaddr_in *in4;
+	struct sockaddr_in6 *in6;
+	int off = -(m->l4_len + m->l3_len);
+
+	udph = rte_pktmbuf_mtod_offset(m, struct udp_hdr *, -m->l4_len);
+	ip4h = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *, off);
+	if ((ip4h->version_ihl>>4) == 4) {
+		addr->sa_family = AF_INET;
+		in4 = (struct sockaddr_in *)addr;
+		in4->sin_port = udph->src_port;
+		in4->sin_addr.s_addr = ip4h->src_addr;
+	} else {
+		addr->sa_family = AF_INET6;
+		ip6h = (const struct ipv6_hdr*)ip4h;
+		in6 = (struct sockaddr_in6 *)addr;
+		in6->sin6_port = udph->src_port;
+		rte_memcpy(&in6->sin6_addr, ip6h->src_addr,
+			   sizeof(in6->sin6_addr));
+	}
+}
+
+/*
+ * Send @num mbufs as ONE UDP datagram: the mbufs are chained into a
+ * single segment list before handing to the stream. Returns @num on
+ * success; 0 with errno set when the send was rejected.
+ */
+static ssize_t
+udp_send(struct sock *sk, struct rte_mbuf *pkt[],
+	 uint16_t num, const struct sockaddr *dst_addr)
+{
+	uint16_t i;
+	struct sockaddr_storage addr;
+
+	if (udp_addr_prepare(sk, &dst_addr, &addr) != 0)
+		return 0;
+
+	/* chain them together as *one* message */
+	for (i = 1; i < num; ++i) {
+		pkt[i-1]->next = pkt[i];
+		pkt[0]->pkt_len += pkt[i]->pkt_len;
+	}
+	pkt[0]->nb_segs = num;
+
+	if (tle_udp_stream_send(sk->s, &pkt[0], 1, dst_addr) == 0) {
+		errno = rte_errno;
+		return 0;
+	}
+
+	return num;
+}
+
+/*
+ * recvmsg()/readv() path for UDP: receive one datagram and scatter it
+ * into msg->msg_iov. Fills msg_name with the sender address and the
+ * timestamp control message when enabled. Leftover datagram bytes are
+ * discarded and MSG_TRUNC is set (counted into the return value only
+ * when the caller passed MSG_TRUNC).
+ * Returns bytes delivered, or -1 with errno set when no datagram is
+ * available.
+ */
+static ssize_t
+udp_readv(struct tle_stream *s, struct msghdr *msg, int flags)
+{
+	int i;
+	ssize_t sz;
+	uint16_t rc;
+	/* fixed: was uninitialized — with iovcnt == 0 the copy loop
+	 * never runs and the `fin == 0` test below read garbage (UB).
+	 * 0 means "datagram not fully consumed", which is correct for
+	 * zero iovecs.
+	 */
+	uint32_t fin = 0;
+	struct iovec iv;
+	struct rte_mbuf *m;
+	const struct iovec *iov = msg->msg_iov;
+	int iovcnt = msg->msg_iovlen;
+
+	rc = tle_udp_stream_recv(s, &m, 1);
+	if (rc == 0) {
+		errno = rte_errno;
+		return -1;
+	}
+
+	/* NOTE(review): stashing the mbuf timestamp only when the
+	 * timestamp option is OFF looks inverted — confirm intent.
+	 */
+	if (!s->option.timestamp)
+		s->timestamp = m->timestamp;
+	if (msg != NULL && msg->msg_control != NULL) {
+		if (s->option.timestamp)
+			tle_set_timestamp(msg, m);
+		else
+			msg->msg_controllen = 0;
+	}
+
+	if (msg != NULL && msg->msg_name != NULL) {
+		udp_pkt_addr(m, (struct sockaddr*)msg->msg_name, 0);
+		if (((struct sockaddr *)msg->msg_name)->sa_family == AF_INET)
+			msg->msg_namelen = sizeof(struct sockaddr_in);
+		else
+			msg->msg_namelen = sizeof(struct sockaddr_in6);
+	}
+
+	for (i = 0, sz = 0; i != iovcnt; i++) {
+		iv = iov[i];
+		sz += iv.iov_len;
+		fin = _mbus_to_iovec(&iv, &m, 1);
+		if (fin == 1) {
+			sz -= iv.iov_len;
+			break;
+		}
+	}
+	if (fin == 0) {
+		if (flags & MSG_TRUNC)
+			sz += m->pkt_len;
+		rte_pktmbuf_free_seg(m);
+		msg->msg_flags |= MSG_TRUNC;
+	}
+	return sz;
+}
+
+/*
+ * writev() path for UDP: copy the iovecs into a chain of mbufs sized
+ * to the (UFO-dependent) segment length, link them into one datagram
+ * and send it. The first fragment reserves 8 bytes for the UDP
+ * header. Returns bytes sent, or -1 with errno set on failure.
+ */
+static ssize_t
+udp_writev(struct sock *sk, const struct iovec *iov,
+	   int iovcnt, const struct sockaddr *dst_addr)
+{
+	struct rte_mempool *mp = get_mempool_by_socket(0); /* fix me */
+	struct sockaddr_storage addr;
+	uint32_t slen, left_m, left_b, copy_len, left;
+	uint16_t i, rc, nb_mbufs;
+	char *dst, *src;
+	uint64_t ufo;
+	size_t total;
+	int j;
+
+	if (udp_addr_prepare(sk, &dst_addr, &addr) != 0)
+		return -1;
+
+	for (j = 0, total = 0; j < iovcnt; ++j)
+		total += iov[j].iov_len;
+
+	ufo = tx_offload & DEV_TX_OFFLOAD_UDP_TSO;
+	if (ufo)
+		slen = RTE_MBUF_DEFAULT_DATAROOM;
+	else
+		slen = 1500 - 20; /* mtu - ip_hdr_len */
+
+	nb_mbufs = (total + 8 + slen - 1) / slen;
+	struct rte_mbuf *mbufs[nb_mbufs];
+	if (unlikely(rte_pktmbuf_alloc_bulk(mp, mbufs, nb_mbufs) != 0)) {
+		errno = ENOMEM;
+		return -1;
+	}
+
+	left_b = iov[0].iov_len;
+	for (i = 0, j = 0; i < nb_mbufs && j < iovcnt; ++i) {
+		/* first frag has udp hdr, its payload is 8 bytes less */
+		if (i == 0)
+			slen -= 8;
+		else if (i == 1)
+			slen += 8;
+		left_m = slen;
+		while (left_m > 0 && j < iovcnt) {
+			copy_len = RTE_MIN(left_m, left_b);
+			dst = rte_pktmbuf_mtod_offset(mbufs[i], char *,
+						      slen - left_m);
+			src = (char *)iov[j].iov_base + iov[j].iov_len - left_b;
+			rte_memcpy(dst, src, copy_len);
+
+			left_m -= copy_len;
+			left_b -= copy_len;
+			if (left_b == 0) {
+				j++;
+				/* fixed: only refill from the next iovec
+				 * if one exists — the unconditional read
+				 * of iov[j] touched iov[iovcnt], one past
+				 * the array (out-of-bounds read).
+				 */
+				if (j < iovcnt)
+					left_b = iov[j].iov_len;
+			}
+		}
+		mbufs[i]->data_len = slen;
+		mbufs[i]->pkt_len = slen;
+	}
+
+	/* last seg */
+	if (nb_mbufs == 1) {
+		mbufs[nb_mbufs - 1]->data_len = total;
+		mbufs[nb_mbufs - 1]->pkt_len = total;
+	} else {
+		mbufs[nb_mbufs - 1]->data_len = total - (nb_mbufs - 1) * slen + 8;
+		mbufs[nb_mbufs - 1]->pkt_len = total - (nb_mbufs - 1) * slen + 8;
+	}
+
+	/* chain as *one* message */
+	for (i = 1; i < nb_mbufs; ++i)
+		mbufs[i-1]->next = mbufs[i];
+	mbufs[0]->nb_segs = nb_mbufs;
+	mbufs[0]->pkt_len = total;
+	nb_mbufs = 1;
+
+	rc = tle_udp_stream_send(sk->s, mbufs, nb_mbufs, dst_addr);
+	/* free whatever the stream did not take */
+	for (i = rc, left = 0; i < nb_mbufs; ++i) {
+		left += mbufs[i]->pkt_len;
+		rte_pktmbuf_free(mbufs[i]);
+	}
+
+	if (rc == 0) {
+		errno = rte_errno;
+		return -1;
+	}
+
+	return total - left;
+}
+
+/*
+ * Receive up to @num mbufs from the UDP stream. When exactly one
+ * datagram is requested and received, also extract the sender into
+ * @addr (recvfrom semantics). Returns the mbuf count; 0 with errno
+ * set when nothing is available.
+ */
+static ssize_t
+udp_recv(struct tle_stream *s, struct rte_mbuf *pkt[], uint16_t num,
+	 struct sockaddr *addr)
+{
+	uint16_t rc;
+
+	rc = tle_udp_stream_recv(s, pkt, num);
+	if (addr && num == 1 && rc == 1)
+		udp_pkt_addr(pkt[0], addr, 0);
+
+	if (rc == 0)
+		errno = rte_errno;
+	return rc;
+}
+
+/*
+ * Wire the socket's rx/tx events into the UDP stream so backend
+ * processing can wake the fd.
+ */
+static void
+udp_update_cfg(struct sock *sk)
+{
+	struct tle_udp_stream_param prm;
+	memset(&prm, 0, sizeof(prm));
+
+	prm.recv_ev = &sk->rxev;
+	prm.send_ev = &sk->txev;
+
+	tle_udp_stream_update_cfg(&sk->s, &prm, 1);
+}
+
+/*
+ * shutdown() handler for UDP: forwarded to the stream; ENOTCONN when
+ * the socket has no stream yet.
+ * Returns 0 on success, -1 with errno set otherwise.
+ */
+static int
+udp_shutdown(struct sock *sk, int how)
+{
+	int rc;
+
+	if (sk->s == NULL) {
+		errno = ENOTCONN;
+		return -1;
+	}
+
+	rc = tle_udp_stream_shutdown(sk->s, how);
+	if (rc < 0) {
+		errno = -rc;
+		return -1;
+	}
+	return 0;
+}
+
+/* Protocol operations table for UDP sockets in the glue layer.
+ * No .listen/.accept: UDP is connectionless.
+ */
+struct proto udp_prot = {
+	.name = "UDP",
+	.setsockopt = udp_setsockopt,
+	.getsockopt = udp_getsockopt,
+	.getname = udp_getname,
+	.bind = udp_bind,
+	.connect = udp_connect,
+	.recv = udp_recv,
+	.send = udp_send,
+	.readv = udp_readv,
+	.writev = udp_writev,
+	.shutdown = udp_shutdown,
+	.close = tle_udp_stream_close,
+	.update_cfg = udp_update_cfg,
+};
diff --git a/lib/libtle_glue/util.c b/lib/libtle_glue/util.c
new file mode 100644
index 0000000..69fc555
--- /dev/null
+++ b/lib/libtle_glue/util.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <pthread.h>
+#include <sched.h>
+#include <unistd.h>
+
+#include "util.h"
+
+#define NUMA_NODE_PATH "/sys/devices/system/node"
+
+/*
+ * Find the NUMA node of @lcore_id by probing
+ * /sys/devices/system/node/nodeN/cpuM for existence.
+ * Falls back to node 0 when no match is found (or sysfs is absent).
+ */
+static unsigned
+eal_cpu_socket_id(unsigned lcore_id)
+{
+	unsigned socket;
+	char path[PATH_MAX];
+
+	for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) {
+		snprintf(path, sizeof(path), "%s/node%u/cpu%u", NUMA_NODE_PATH,
+			 socket, lcore_id);
+		if (access(path, F_OK) == 0)
+			return socket;
+	}
+	return 0;
+}
+
+/*
+ * Return the NUMA node of the calling thread, derived from the FIRST
+ * CPU in its affinity mask (threads spanning nodes get the first
+ * CPU's node). Returns 0 when the affinity cannot be read.
+ */
+uint32_t
+get_socket_id(void)
+{
+	int err;
+	uint32_t i;
+	cpu_set_t cpuset;
+
+	CPU_ZERO(&cpuset);
+	err = pthread_getaffinity_np(pthread_self(),
+				     sizeof(cpuset), &cpuset);
+	if (err)
+		return 0;
+
+	for (i = 0; i < CPU_SETSIZE; i++)
+		if (CPU_ISSET(i, &cpuset))
+			break;
+
+	return eal_cpu_socket_id(i);
+}
diff --git a/lib/libtle_glue/util.h b/lib/libtle_glue/util.h
new file mode 100644
index 0000000..ac67d8b
--- /dev/null
+++ b/lib/libtle_glue/util.h
@@ -0,0 +1,377 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _TLE_GLUE_UTIL_H_
+#define _TLE_GLUE_UTIL_H_
+
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <tle_tcp.h>
+#include <tle_udp.h>
+
+#include "../libtle_l4p/tcp_stream.h"
+
+#include "fd.h"
+#include "ctx.h"
+#include "sock.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* strdup() that never returns NULL: aborts the process on failure. */
+static inline void *
+xstrdup(const void *old)
+{
+ void *copy = strdup(old);
+
+ if (unlikely(copy == NULL))
+ rte_panic("Failed to strdup");
+ return copy;
+}
+
+/* malloc() that never returns NULL: aborts on failure and treats a
+ * zero-byte request as a one-byte allocation. */
+static inline void *
+xmalloc(size_t size)
+{
+ void *buf;
+
+ buf = malloc(size == 0 ? 1 : size);
+ if (buf == NULL)
+ rte_panic("Failed to malloc");
+ return buf;
+}
+
+/* vasprintf()-like helper: format into a freshly xmalloc()ed buffer
+ * that the caller must free. Panics on allocation failure or when
+ * vsnprintf() reports an output error. */
+static inline char *
+xvasprintf(const char *format, va_list args)
+{
+ va_list args2;
+ int needed;
+ char *s;
+
+ va_copy(args2, args);
+ needed = vsnprintf(NULL, 0, format, args);
+ /* A negative return means a format/encoding error; the old code
+ * stored it into a size_t and returned an uninitialized buffer. */
+ if (unlikely(needed < 0)) {
+ va_end(args2);
+ rte_panic("Failed to format string");
+ }
+
+ s = xmalloc((size_t)needed + 1);
+
+ vsnprintf(s, (size_t)needed + 1, format, args2);
+ va_end(args2);
+
+ return s;
+}
+
+/* asprintf()-like wrapper over xvasprintf(); caller frees the result. */
+static inline char *
+xasprintf(const char *format, ...)
+{
+ char *result;
+ va_list ap;
+
+ va_start(ap, format);
+ result = xvasprintf(format, ap);
+ va_end(ap);
+
+ return result;
+}
+
+/* Grow an argv-style pointer vector to cur_siz + grow_by slots;
+ * aborts instead of returning NULL (old contents stay valid up to
+ * cur_siz entries). */
+static inline char **
+grow_argv(char **argv, size_t cur_siz, size_t grow_by)
+{
+ char **grown;
+
+ grown = realloc(argv, (cur_siz + grow_by) * sizeof(char *));
+ if (unlikely(grown == NULL))
+ rte_panic("Failed to grow argv");
+ return grown;
+}
+
+/* Free the first argc strings of argv_to_release, then both vectors
+ * themselves. */
+static inline void
+release_argv(int argc, char **argv_to_release, char **argv)
+{
+ int n;
+
+ for (n = 0; n < argc; n++)
+ free(argv_to_release[n]);
+
+ free(argv_to_release);
+ free(argv);
+}
+
+/* Bind an event to its queue and opaque user data without arming it. */
+static inline void
+tle_event_attach(struct tle_event *ev, struct tle_evq *evq, const void *data)
+{
+ ev->head = evq;
+ ev->data = data;
+}
+
+/* Attach the socket's error/rx/tx events to the per-context event
+ * queues and arm them. With LOOK_ASIDE_BACKEND only the error event is
+ * armed here -- presumably rx/tx are driven another way in that build;
+ * TODO confirm. */
+static inline void
+sock_alloc_events(struct sock *so)
+{
+ tle_event_attach(&so->erev, CTX(so)->ereq, so);
+ tle_event_attach(&so->rxev, CTX(so)->rxeq, so);
+ tle_event_attach(&so->txev, CTX(so)->txeq, so);
+ tle_event_active(&so->erev, TLE_SEV_DOWN);
+#ifndef LOOK_ASIDE_BACKEND
+ tle_event_active(&so->rxev, TLE_SEV_DOWN);
+ tle_event_active(&so->txev, TLE_SEV_DOWN);
+#endif
+}
+
+/* (Re-)arm all three socket events (error, rx, tx) in TLE_SEV_DOWN mode. */
+static inline void
+sock_active_events(struct sock *so)
+{
+ tle_event_active(&so->erev, TLE_SEV_DOWN);
+ tle_event_active(&so->rxev, TLE_SEV_DOWN);
+ tle_event_active(&so->txev, TLE_SEV_DOWN);
+}
+
+/* Pick the local IPv6 address to pair with @remote: loopback for a
+ * loopback peer, otherwise the context's configured address.
+ * TODO: replace with a real route lookup. */
+static inline const struct in6_addr*
+select_local_addr_v6(const struct sockaddr *remote, struct glue_ctx *ctx)
+{
+ const struct sockaddr_in6 *r6 = (const struct sockaddr_in6 *)remote;
+
+ return IN6_IS_ADDR_LOOPBACK(&r6->sin6_addr) ?
+ &in6addr_loopback : &ctx->ipv6;
+}
+
+/* Pick the local IPv4 address to pair with @remote: loopback for a
+ * loopback peer, otherwise the context's configured address.
+ * TODO: replace with a real route lookup. */
+static inline in_addr_t
+select_local_addr(const struct sockaddr *remote, struct glue_ctx *ctx)
+{
+ const struct sockaddr_in *r4 = (const struct sockaddr_in *)remote;
+
+ if (r4->sin_addr.s_addr == htonl(INADDR_LOOPBACK))
+ return htonl(INADDR_LOOPBACK);
+ return ctx->ipv4;
+}
+
+/* Return true when @addr holds the wildcard ("any") address for its
+ * family; unknown families are reported as non-wildcard. */
+static inline bool
+is_any_addr(const struct sockaddr *addr)
+{
+ const struct sockaddr_in *a4;
+ const struct sockaddr_in6 *a6;
+
+ switch (addr->sa_family) {
+ case AF_INET:
+ a4 = (const struct sockaddr_in *)addr;
+ return a4->sin_addr.s_addr == htonl(INADDR_ANY);
+ case AF_INET6:
+ a6 = (const struct sockaddr_in6 *)addr;
+ return IN6_IS_ADDR_UNSPECIFIED(&a6->sin6_addr) != 0;
+ default:
+ return false;
+ }
+}
+
+/* Overwrite the address part of @addr with the wildcard of its family;
+ * other families are left untouched. */
+static inline void
+set_any_addr(struct sockaddr *addr)
+{
+ switch (addr->sa_family) {
+ case AF_INET:
+ ((struct sockaddr_in *)addr)->sin_addr.s_addr =
+ htonl(INADDR_ANY);
+ break;
+ case AF_INET6:
+ ((struct sockaddr_in6 *)addr)->sin6_addr = in6addr_any;
+ break;
+ default:
+ break;
+ }
+}
+
+/* In-place transform of an IPv4 address (struct sockaddr_in) into the
+ * equivalent IPv4-mapped IPv6 address ::ffff:a.b.c.d
+ * (struct sockaddr_in6). No-op for non-AF_INET addresses. */
+static inline void
+trans_4mapped6_addr(struct sockaddr *addr)
+{
+ struct sockaddr_in6 *addr6;
+ in_addr_t ip4;
+
+ if (addr->sa_family != AF_INET)
+ return;
+
+ /* read the v4 address before rewriting the structure in place */
+ ip4 = ((struct sockaddr_in*)addr)->sin_addr.s_addr;
+ addr6 = (struct sockaddr_in6*)addr;
+ addr6->sin6_family = AF_INET6;
+ addr6->sin6_addr.s6_addr32[0] = 0;
+ addr6->sin6_addr.s6_addr32[1] = 0;
+ /* v4-mapped prefix ::ffff:0:0 in network byte order -- the old
+ * raw 0xffff0000 literal was only correct on little-endian. */
+ addr6->sin6_addr.s6_addr32[2] = htonl(0xffff);
+ addr6->sin6_addr.s6_addr32[3] = ip4;
+}
+
+/* In-place transform of an IPv4-mapped IPv6 address (::ffff:a.b.c.d in
+ * a struct sockaddr_in6) back into a plain IPv4 address
+ * (struct sockaddr_in). Any other address is left untouched. */
+static inline void
+retrans_4mapped6_addr(struct sockaddr_storage * addr)
+{
+ struct in6_addr* addr6;
+
+ if (addr->ss_family == AF_INET)
+ return;
+
+ addr6 = &((struct sockaddr_in6*)addr)->sin6_addr;
+ if (IN6_IS_ADDR_V4MAPPED(addr6)) {
+ addr->ss_family = AF_INET;
+ /* use the public s6_addr32 accessor (as trans_4mapped6_addr
+ * does) instead of glibc's private __in6_u field */
+ ((struct sockaddr_in*)addr)->sin_addr.s_addr =
+ addr6->s6_addr32[3];
+ }
+}
+
+/* Create (or, for UDP, rebind) the underlying TLE stream for socket
+ * @so, bound to @local and optionally connected to @remote; either
+ * address may be NULL. Returns the stream, or NULL with errno set
+ * (EINVAL/ENETUNREACH on family mismatch, EADDRNOTAVAIL on an
+ * unusable local address, or rte_errno from stream creation). */
+static inline struct tle_stream *
+open_bind(struct sock *so, const struct sockaddr *local,
+ const struct sockaddr *remote)
+{
+ struct tle_stream *s;
+ struct sockaddr_storage *l, *r;
+ struct sockaddr_in *addr4;
+ struct sockaddr_in6 *addr6;
+ struct tle_tcp_stream_param pt = {0};
+ struct tle_udp_stream_param pu = {0};
+
+ /* point l/r at the address slots of whichever param struct the
+ * socket's protocol uses, and wire up its events */
+ if (IS_TCP(so)) {
+ pt.option = so->option.raw;
+ l = &pt.addr.local;
+ r = &pt.addr.remote;
+ pt.cfg.err_ev = &so->erev;
+ pt.cfg.recv_ev = &so->rxev;
+ pt.cfg.send_ev = &so->txev;
+ } else {
+ pu.option = so->option.raw;
+ l = &pu.local_addr;
+ r = &pu.remote_addr;
+ pu.recv_ev = &so->rxev;
+ pu.send_ev = &so->txev;
+ }
+
+ /* normalize remote: strip the v4-mapped form and turn a wildcard
+ * peer into loopback (connect-to-0.0.0.0 semantics) */
+ if (remote) {
+ memcpy(r, remote, get_sockaddr_len(remote->sa_family));
+ retrans_4mapped6_addr(r);
+ if(r->ss_family == AF_INET) {
+ addr4 = (struct sockaddr_in*)r;
+ if (addr4->sin_addr.s_addr == 0)
+ addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+ } else {
+ addr6 = (struct sockaddr_in6*)r;
+ if (IN6_IS_ADDR_UNSPECIFIED(&addr6->sin6_addr))
+ rte_memcpy(&addr6->sin6_addr, &in6addr_loopback,
+ sizeof(struct in6_addr));
+ }
+ }
+
+ /* normalize local; when absent, inherit the family from the
+ * remote address or from the socket's domain */
+ if (local) {
+ memcpy(l, local, get_sockaddr_len(local->sa_family));
+ retrans_4mapped6_addr(l);
+ } else {
+ if (remote)
+ l->ss_family = r->ss_family;
+ else
+ l->ss_family = so->domain;
+ }
+
+ if (!remote)
+ r->ss_family = l->ss_family;
+
+ /* Endpoints of stream have different socket families */
+ if (r->ss_family != l->ss_family) {
+ if (l->ss_family == AF_INET) {
+ errno = EINVAL;
+ return NULL;
+ } else {
+ /* if local addr is unbound, convert into remote family */
+ if (IN6_IS_ADDR_UNSPECIFIED(&((struct sockaddr_in6*)l)->sin6_addr)) {
+ l->ss_family = AF_INET;
+ ((struct sockaddr_in*)l)->sin_addr.s_addr = 0;
+ } else {
+ errno = ENETUNREACH;
+ return NULL;
+ }
+ }
+ }
+
+ /* validate the local address, or pick one when it is the wildcard
+ * and a remote endpoint is known */
+ if (l->ss_family == AF_INET) {
+ addr4 = (struct sockaddr_in*)l;
+ if (addr4->sin_addr.s_addr == htonl(INADDR_ANY) && remote) {
+ addr4->sin_addr.s_addr =
+ select_local_addr((struct sockaddr*)r, CTX(so));
+ if (addr4->sin_addr.s_addr == htonl(INADDR_ANY)) {
+ errno = EADDRNOTAVAIL;
+ return NULL;
+ }
+ }
+ else if (addr4->sin_addr.s_addr != CTX(so)->ipv4 &&
+ addr4->sin_addr.s_addr != htonl(INADDR_LOOPBACK) &&
+ addr4->sin_addr.s_addr != htonl(INADDR_ANY)) {
+ errno = EADDRNOTAVAIL;
+ return NULL;
+ }
+ } else {
+ addr6 = (struct sockaddr_in6 *)l;
+ if (IN6_IS_ADDR_UNSPECIFIED(&addr6->sin6_addr) && remote) {
+ memcpy(&addr6->sin6_addr,
+ select_local_addr_v6((struct sockaddr*)r, CTX(so)),
+ sizeof(struct in6_addr));
+ if (IN6_IS_ADDR_UNSPECIFIED(&addr6->sin6_addr)) {
+ errno = EADDRNOTAVAIL;
+ return NULL;
+ }
+ }
+ else if (memcmp(&addr6->sin6_addr, &CTX(so)->ipv6,
+ sizeof(struct in6_addr)) != 0 &&
+ (!IN6_IS_ADDR_LOOPBACK(&addr6->sin6_addr)) &&
+ (!IN6_IS_ADDR_UNSPECIFIED(&addr6->sin6_addr))) {
+ errno = EADDRNOTAVAIL;
+ return NULL;
+ }
+ }
+
+ /* UDP streams may already exist and are rebound in place */
+ if (IS_TCP(so))
+ s = tle_tcp_stream_open(CTX(so)->tcp_ctx, &pt);
+ else {
+ if (so->s == NULL)
+ s = tle_udp_stream_open(CTX(so)->udp_ctx, &pu);
+ else
+ s = tle_udp_stream_set(so->s, CTX(so)->udp_ctx, &pu);
+ }
+
+ if (s == NULL)
+ errno = rte_errno;
+
+ return s;
+}
+
+/* Open and bind a stream on @local, then move it into listen state.
+ * On listen failure the freshly opened stream is closed again and
+ * NULL is returned. */
+static inline struct tle_stream *
+open_bind_listen(struct sock *so, const struct sockaddr *local)
+{
+ struct tle_stream *stream;
+
+ stream = open_bind(so, local, NULL);
+ if (stream != NULL && tle_tcp_stream_listen(stream) != 0) {
+ tle_tcp_stream_close(stream);
+ stream = NULL;
+ }
+
+ return stream;
+}
+
+uint32_t get_socket_id(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*_TLE_GLUE_UTIL_H_ */
diff --git a/lib/libtle_glue/zerocopy.h b/lib/libtle_glue/zerocopy.h
new file mode 100644
index 0000000..a37f8f5
--- /dev/null
+++ b/lib/libtle_glue/zerocopy.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _TLE_GLUE_ZEROCOPY_H_
+#define _TLE_GLUE_ZEROCOPY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * This API performs recv operation on specified socket, and it's
+ * optimized for zero copy, which means the caller does not need to
+ * prepare the buffer, instead, it will get a pointer on success.
+ * @param sockfd
+ * the file descriptor for the socket.
+ * @param buf
+ * after successfully receiving some payload, the pointer of the
+ * received buffer will be stored in *buf.
+ * @return
+ * the number of bytes received, or -1 if an error occurred, or 0
+ * if a stream socket peer has performed an orderly shutdown.
+ *
+ */
+ssize_t recv_zc(int sockfd, void **buf);
+
+/**
+ * This API performs send operation on specified socket, and it's
+ * optimized for zero copy, which means the caller does not need to
+ * free the buffer, not even touch that buffer even after calling this
+ * API; the buffer will be freed after an ack from the socket peer.
+ * @param sockfd
+ * the file descriptor for the socket.
+ * @param buf
+ * The pointer to the payload buffer to be sent.
+ * @param len
+ * The length of the payload buffer to be sent.
+ * @return
+ * the number of bytes sent, or -1 if an error occurred.
+ */
+ssize_t send_zc(int sockfd, const void *buf, size_t len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*_TLE_GLUE_ZEROCOPY_H_ */
diff --git a/lib/libtle_l4p/Makefile b/lib/libtle_l4p/Makefile
index e1357d1..ee81d4a 100644
--- a/lib/libtle_l4p/Makefile
+++ b/lib/libtle_l4p/Makefile
@@ -45,6 +45,7 @@ SYMLINK-y-include += tle_ctx.h
SYMLINK-y-include += tle_event.h
SYMLINK-y-include += tle_tcp.h
SYMLINK-y-include += tle_udp.h
+SYMLINK-y-include += tle_stats.h
# this lib dependencies
DEPDIRS-y += lib/libtle_misc
diff --git a/lib/libtle_l4p/ctx.c b/lib/libtle_l4p/ctx.c
index b8067f0..d6bde48 100644
--- a/lib/libtle_l4p/ctx.c
+++ b/lib/libtle_l4p/ctx.c
@@ -21,9 +21,14 @@
#include <rte_ip.h>
#include "stream.h"
+#include "stream_table.h"
#include "misc.h"
#include <halfsiphash.h>
+struct tle_mib default_mib;
+
+RTE_DEFINE_PER_LCORE(struct tle_mib *, mib) = &default_mib;
+
#define LPORT_START 0x8000
#define LPORT_END MAX_PORT_NUM
@@ -103,6 +108,16 @@ tle_ctx_create(const struct tle_ctx_param *ctx_prm)
ctx->prm = *ctx_prm;
+ rc = bhash_init(ctx);
+ if (rc != 0) {
+ UDP_LOG(ERR, "create bhash table (ctx=%p, proto=%u) failed "
+ "with error code: %d;\n",
+ ctx, ctx_prm->proto, rc);
+ tle_ctx_destroy(ctx);
+ rte_errno = -rc;
+ return NULL;
+ }
+
rc = tle_stream_ops[ctx_prm->proto].init_streams(ctx);
if (rc != 0) {
UDP_LOG(ERR, "init_streams(ctx=%p, proto=%u) failed "
@@ -114,9 +129,10 @@ tle_ctx_create(const struct tle_ctx_param *ctx_prm)
}
for (i = 0; i != RTE_DIM(ctx->use); i++)
- tle_pbm_init(ctx->use + i, LPORT_START_BLK);
+ tle_psm_init(ctx->use + i);
- ctx->streams.nb_free = ctx->prm.max_streams;
+ ctx->streams.nb_free = ctx->prm.min_streams;
+ ctx->streams.nb_cur = ctx->prm.min_streams;
/* Initialization of siphash state is done here to speed up the
* fastpath processing.
@@ -124,6 +140,11 @@ tle_ctx_create(const struct tle_ctx_param *ctx_prm)
if (ctx->prm.hash_alg == TLE_SIPHASH)
siphash_initialization(&ctx->prm.secret_key,
&ctx->prm.secret_key);
+
+ rte_spinlock_init(&ctx->dev_lock);
+ rte_spinlock_init(&ctx->bhash_lock[TLE_V4]);
+ rte_spinlock_init(&ctx->bhash_lock[TLE_V6]);
+
return ctx;
}
@@ -137,6 +158,8 @@ tle_ctx_destroy(struct tle_ctx *ctx)
return;
}
+ bhash_fini(ctx);
+
for (i = 0; i != RTE_DIM(ctx->dev); i++)
tle_del_dev(ctx->dev + i);
@@ -150,37 +173,6 @@ tle_ctx_invalidate(struct tle_ctx *ctx)
RTE_SET_USED(ctx);
}
-static void
-fill_pbm(struct tle_pbm *pbm, const struct tle_bl_port *blp)
-{
- uint32_t i;
-
- for (i = 0; i != blp->nb_port; i++)
- tle_pbm_set(pbm, blp->port[i]);
-}
-
-static int
-init_dev_proto(struct tle_dev *dev, uint32_t idx, int32_t socket_id,
- const struct tle_bl_port *blp)
-{
- size_t sz;
-
- sz = sizeof(*dev->dp[idx]);
- dev->dp[idx] = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE,
- socket_id);
-
- if (dev->dp[idx] == NULL) {
- UDP_LOG(ERR, "allocation of %zu bytes on "
- "socket %d for %u-th device failed\n",
- sz, socket_id, idx);
- return ENOMEM;
- }
-
- tle_pbm_init(&dev->dp[idx]->use, LPORT_START_BLK);
- fill_pbm(&dev->dp[idx]->use, blp);
- return 0;
-}
-
static struct tle_dev *
find_free_dev(struct tle_ctx *ctx)
{
@@ -214,27 +206,8 @@ tle_add_dev(struct tle_ctx *ctx, const struct tle_dev_param *dev_prm)
return NULL;
rc = 0;
- /* device can handle IPv4 traffic */
- if (dev_prm->local_addr4.s_addr != INADDR_ANY) {
- rc = init_dev_proto(dev, TLE_V4, ctx->prm.socket_id,
- &dev_prm->bl4);
- if (rc == 0)
- fill_pbm(&ctx->use[TLE_V4], &dev_prm->bl4);
- }
-
- /* device can handle IPv6 traffic */
- if (rc == 0 && memcmp(&dev_prm->local_addr6, &tle_ipv6_any,
- sizeof(tle_ipv6_any)) != 0) {
- rc = init_dev_proto(dev, TLE_V6, ctx->prm.socket_id,
- &dev_prm->bl6);
- if (rc == 0)
- fill_pbm(&ctx->use[TLE_V6], &dev_prm->bl6);
- }
-
if (rc != 0) {
/* cleanup and return an error. */
- rte_free(dev->dp[TLE_V4]);
- rte_free(dev->dp[TLE_V6]);
rte_errno = rc;
return NULL;
}
@@ -246,16 +219,19 @@ tle_add_dev(struct tle_ctx *ctx, const struct tle_dev_param *dev_prm)
if ((dev_prm->tx_offload & DEV_TX_OFFLOAD_UDP_CKSUM) != 0 &&
ctx->prm.proto == TLE_PROTO_UDP) {
- dev->tx.ol_flags[TLE_V4] |= PKT_TX_IPV4 | PKT_TX_UDP_CKSUM;
- dev->tx.ol_flags[TLE_V6] |= PKT_TX_IPV6 | PKT_TX_UDP_CKSUM;
+ dev->tx.ol_flags[TLE_V4] |= PKT_TX_UDP_CKSUM;
+ dev->tx.ol_flags[TLE_V6] |= PKT_TX_UDP_CKSUM;
} else if ((dev_prm->tx_offload & DEV_TX_OFFLOAD_TCP_CKSUM) != 0 &&
ctx->prm.proto == TLE_PROTO_TCP) {
- dev->tx.ol_flags[TLE_V4] |= PKT_TX_IPV4 | PKT_TX_TCP_CKSUM;
- dev->tx.ol_flags[TLE_V6] |= PKT_TX_IPV6 | PKT_TX_TCP_CKSUM;
+ dev->tx.ol_flags[TLE_V4] |= PKT_TX_TCP_CKSUM;
+ dev->tx.ol_flags[TLE_V6] |= PKT_TX_TCP_CKSUM;
}
if ((dev_prm->tx_offload & DEV_TX_OFFLOAD_IPV4_CKSUM) != 0)
- dev->tx.ol_flags[TLE_V4] |= PKT_TX_IPV4 | PKT_TX_IP_CKSUM;
+ dev->tx.ol_flags[TLE_V4] |= PKT_TX_IP_CKSUM;
+
+ dev->tx.ol_flags[TLE_V4] |= PKT_TX_IPV4;
+ dev->tx.ol_flags[TLE_V6] |= PKT_TX_IPV6;
dev->prm = *dev_prm;
dev->ctx = ctx;
@@ -300,220 +276,97 @@ tle_del_dev(struct tle_dev *dev)
ctx = dev->ctx;
p = dev - ctx->dev;
- if (p >= RTE_DIM(ctx->dev) ||
- (dev->dp[TLE_V4] == NULL &&
- dev->dp[TLE_V6] == NULL))
+ if (p >= RTE_DIM(ctx->dev))
return -EINVAL;
/* emtpy TX queues. */
empty_dring(&dev->tx.dr, ctx->prm.proto);
- rte_free(dev->dp[TLE_V4]);
- rte_free(dev->dp[TLE_V6]);
memset(dev, 0, sizeof(*dev));
ctx->nb_dev--;
return 0;
}
-static struct tle_dev *
-find_ipv4_dev(struct tle_ctx *ctx, const struct in_addr *addr)
-{
- uint32_t i;
-
- for (i = 0; i != RTE_DIM(ctx->dev); i++) {
- if (ctx->dev[i].prm.local_addr4.s_addr == addr->s_addr &&
- ctx->dev[i].dp[TLE_V4] != NULL)
- return ctx->dev + i;
- }
-
- return NULL;
-}
-
-static struct tle_dev *
-find_ipv6_dev(struct tle_ctx *ctx, const struct in6_addr *addr)
+int
+stream_fill_ctx(struct tle_ctx *ctx, struct tle_stream *s,
+ const struct sockaddr *laddr, const struct sockaddr *raddr)
{
- uint32_t i;
+ struct sockaddr_storage addr;
+ int32_t rc = 0;
- for (i = 0; i != RTE_DIM(ctx->dev); i++) {
- if (memcmp(&ctx->dev[i].prm.local_addr6, addr,
- sizeof(*addr)) == 0 &&
- ctx->dev[i].dp[TLE_V6] != NULL)
- return ctx->dev + i;
+ if (laddr->sa_family == AF_INET) {
+ s->type = TLE_V4;
+ } else if (laddr->sa_family == AF_INET6) {
+ s->type = TLE_V6;
}
- return NULL;
-}
-
-static int
-stream_fill_dev(struct tle_ctx *ctx, struct tle_stream *s,
- const struct sockaddr *addr)
-{
- struct tle_dev *dev;
- struct tle_pbm *pbm;
- const struct sockaddr_in *lin4;
- const struct sockaddr_in6 *lin6;
- uint32_t i, p, sp, t;
-
- if (addr->sa_family == AF_INET) {
- lin4 = (const struct sockaddr_in *)addr;
- t = TLE_V4;
- p = lin4->sin_port;
- } else if (addr->sa_family == AF_INET6) {
- lin6 = (const struct sockaddr_in6 *)addr;
- t = TLE_V6;
- p = lin6->sin6_port;
- } else
- return EINVAL;
-
+ uint16_t p = ((const struct sockaddr_in *)laddr)->sin_port;
p = ntohs(p);
-
- /* if local address is not wildcard, find device it belongs to. */
- if (t == TLE_V4 && lin4->sin_addr.s_addr != INADDR_ANY) {
- dev = find_ipv4_dev(ctx, &lin4->sin_addr);
- if (dev == NULL)
- return ENODEV;
- } else if (t == TLE_V6 && memcmp(&tle_ipv6_any, &lin6->sin6_addr,
- sizeof(tle_ipv6_any)) != 0) {
- dev = find_ipv6_dev(ctx, &lin6->sin6_addr);
- if (dev == NULL)
- return ENODEV;
- } else
- dev = NULL;
-
- if (dev != NULL)
- pbm = &dev->dp[t]->use;
- else
- pbm = &ctx->use[t];
-
+ struct tle_psm *psm = &ctx->use[s->type];
/* try to acquire local port number. */
+ rte_spinlock_lock(&ctx->dev_lock);
if (p == 0) {
- p = tle_pbm_find_range(pbm, pbm->blk, LPORT_END_BLK);
- if (p == 0 && pbm->blk > LPORT_START_BLK)
- p = tle_pbm_find_range(pbm, LPORT_START_BLK, pbm->blk);
- } else if (tle_pbm_check(pbm, p) != 0)
- return EEXIST;
-
- if (p == 0)
- return ENFILE;
-
- /* fill socket's dst port and type */
-
- sp = htons(p);
- s->type = t;
- s->port.dst = sp;
-
- /* mark port as in-use */
-
- tle_pbm_set(&ctx->use[t], p);
- if (dev != NULL) {
- tle_pbm_set(pbm, p);
- dev->dp[t]->streams[sp] = s;
- } else {
- for (i = 0; i != RTE_DIM(ctx->dev); i++) {
- if (ctx->dev[i].dp[t] != NULL) {
- tle_pbm_set(&ctx->dev[i].dp[t]->use, p);
- ctx->dev[i].dp[t]->streams[sp] = s;
- }
+ if (s->type == TLE_V6 && is_empty_addr(laddr) && !s->option.ipv6only)
+ p = tle_psm_alloc_dual_port(&ctx->use[TLE_V4], psm);
+ else
+ p = tle_psm_alloc_port(psm);
+ if (p == 0) {
+ rte_spinlock_unlock(&ctx->dev_lock);
+ return ENFILE;
}
+ rte_memcpy(&addr, laddr, sizeof(struct sockaddr_storage));
+ ((struct sockaddr_in *)&addr)->sin_port = htons(p);
+ laddr = (const struct sockaddr*)&addr;
}
- return 0;
-}
+ if (tle_psm_set(psm, p, s->option.reuseport) != 0) {
+ rte_spinlock_unlock(&ctx->dev_lock);
+ return EADDRINUSE;
+ }
-static int
-stream_clear_dev(struct tle_ctx *ctx, const struct tle_stream *s)
-{
- struct tle_dev *dev;
- uint32_t i, p, sp, t;
-
- t = s->type;
- sp = s->port.dst;
- p = ntohs(sp);
-
- /* if local address is not wildcard, find device it belongs to. */
- if (t == TLE_V4 && s->ipv4.addr.dst != INADDR_ANY) {
- dev = find_ipv4_dev(ctx,
- (const struct in_addr *)&s->ipv4.addr.dst);
- if (dev == NULL)
- return ENODEV;
- } else if (t == TLE_V6 && memcmp(&tle_ipv6_any, &s->ipv6.addr.dst,
- sizeof(tle_ipv6_any)) != 0) {
- dev = find_ipv6_dev(ctx,
- (const struct in6_addr *)&s->ipv6.addr.dst);
- if (dev == NULL)
- return ENODEV;
- } else
- dev = NULL;
-
- tle_pbm_clear(&ctx->use[t], p);
- if (dev != NULL) {
- if (dev->dp[t]->streams[sp] == s) {
- tle_pbm_clear(&dev->dp[t]->use, p);
- dev->dp[t]->streams[sp] = NULL;
- }
- } else {
- for (i = 0; i != RTE_DIM(ctx->dev); i++) {
- if (ctx->dev[i].dp[t] != NULL &&
- ctx->dev[i].dp[t]->streams[sp] == s) {
- tle_pbm_clear(&ctx->dev[i].dp[t]->use, p);
- ctx->dev[i].dp[t]->streams[sp] = NULL;
+ if (is_empty_addr(laddr)) {
+ if (s->type == TLE_V6 && !s->option.ipv6only) {
+ rc = tle_psm_set(&ctx->use[TLE_V4], p, s->option.reuseport);
+ if (rc != 0) {
+ tle_psm_clear(psm, p);
+ rte_spinlock_unlock(&ctx->dev_lock);
+ return EADDRINUSE;
}
}
}
- return 0;
-}
-
-static void
-fill_ipv4_am(const struct sockaddr_in *in, uint32_t *addr, uint32_t *mask)
-{
- *addr = in->sin_addr.s_addr;
- *mask = (*addr == INADDR_ANY) ? INADDR_ANY : INADDR_NONE;
-}
+ if (is_empty_addr(raddr))
+ rc = bhash_add_entry(ctx, laddr, s);
-static void
-fill_ipv6_am(const struct sockaddr_in6 *in, rte_xmm_t *addr, rte_xmm_t *mask)
-{
- const struct in6_addr *pm;
-
- memcpy(addr, &in->sin6_addr, sizeof(*addr));
- if (memcmp(&tle_ipv6_any, addr, sizeof(*addr)) == 0)
- pm = &tle_ipv6_any;
- else
- pm = &tle_ipv6_none;
-
- memcpy(mask, pm, sizeof(*mask));
-}
+ if (rc) {
+ tle_psm_clear(psm, p);
+ }
-int
-stream_fill_ctx(struct tle_ctx *ctx, struct tle_stream *s,
- const struct sockaddr *laddr, const struct sockaddr *raddr)
-{
- const struct sockaddr_in *rin;
- int32_t rc;
+ rte_spinlock_unlock(&ctx->dev_lock);
+ /* fill socket's dst (src actually) port */
+ s->port.dst = htons(p);
- /* setup ports and port mask fields (except dst port). */
- rin = (const struct sockaddr_in *)raddr;
- s->port.src = rin->sin_port;
- s->pmsk.src = (s->port.src == 0) ? 0 : UINT16_MAX;
- s->pmsk.dst = UINT16_MAX;
+ if (rc)
+ return rc;
- /* setup src and dst addresses. */
+ /* setup src, dst addresses, and src port. */
if (laddr->sa_family == AF_INET) {
fill_ipv4_am((const struct sockaddr_in *)laddr,
&s->ipv4.addr.dst, &s->ipv4.mask.dst);
fill_ipv4_am((const struct sockaddr_in *)raddr,
&s->ipv4.addr.src, &s->ipv4.mask.src);
+ s->port.src = ((const struct sockaddr_in *)raddr)->sin_port;
} else if (laddr->sa_family == AF_INET6) {
fill_ipv6_am((const struct sockaddr_in6 *)laddr,
&s->ipv6.addr.dst, &s->ipv6.mask.dst);
fill_ipv6_am((const struct sockaddr_in6 *)raddr,
&s->ipv6.addr.src, &s->ipv6.mask.src);
+ s->port.src = ((const struct sockaddr_in6 *)raddr)->sin6_port;
}
- rte_spinlock_lock(&ctx->dev_lock);
- rc = stream_fill_dev(ctx, s, laddr);
- rte_spinlock_unlock(&ctx->dev_lock);
+ /* setup port mask fields. */
+ s->pmsk.src = (s->port.src == 0) ? 0 : UINT16_MAX;
+ s->pmsk.dst = UINT16_MAX;
return rc;
}
@@ -522,11 +375,41 @@ stream_fill_ctx(struct tle_ctx *ctx, struct tle_stream *s,
int
stream_clear_ctx(struct tle_ctx *ctx, struct tle_stream *s)
{
- int32_t rc;
+ bool is_any = false;
+ struct sockaddr_storage addr;
+ struct sockaddr_in *addr4;
+ struct sockaddr_in6 *addr6;
+
+ if (s->type == TLE_V4) {
+ if (s->ipv4.addr.src == INADDR_ANY) {
+ is_any = true;
+ addr4 = (struct sockaddr_in *)&addr;
+ addr4->sin_addr.s_addr = s->ipv4.addr.dst;
+ addr4->sin_port = s->port.dst;
+ addr.ss_family = AF_INET;
+ bhash_del_entry(ctx, s, (struct sockaddr*)&addr);
+ }
+ } else {
+ if (IN6_IS_ADDR_UNSPECIFIED(&s->ipv6.addr.src)) {
+ is_any = true;
+ addr6 = (struct sockaddr_in6 *)&addr;
+ memcpy(&addr6->sin6_addr, &s->ipv6.addr.dst,
+ sizeof(tle_ipv6_any));
+ addr6->sin6_port = s->port.dst;
+ addr.ss_family = AF_INET6;
+ bhash_del_entry(ctx, s, (struct sockaddr*)&addr);
+ }
+ }
rte_spinlock_lock(&ctx->dev_lock);
- rc = stream_clear_dev(ctx, s);
+ /* strange behaviour to match linux stack */
+ if (is_any) {
+ if (s->type == TLE_V6 && !s->option.ipv6only)
+ tle_psm_clear(&ctx->use[TLE_V4], ntohs(s->port.dst));
+ }
+
+ tle_psm_clear(&ctx->use[s->type], ntohs(s->port.dst));
rte_spinlock_unlock(&ctx->dev_lock);
- return rc;
+ return 0;
}
diff --git a/lib/libtle_l4p/ctx.h b/lib/libtle_l4p/ctx.h
index f18060b..9483976 100644
--- a/lib/libtle_l4p/ctx.h
+++ b/lib/libtle_l4p/ctx.h
@@ -21,7 +21,7 @@
#include <tle_dring.h>
#include <tle_ctx.h>
-#include "port_bitmap.h"
+#include "port_statmap.h"
#include "osdep.h"
#include "net_misc.h"
@@ -29,11 +29,6 @@
extern "C" {
#endif
-struct tle_dport {
- struct tle_pbm use; /* ports in use. */
- struct tle_stream *streams[MAX_PORT_NUM]; /* port to stream. */
-};
-
struct tle_dev {
struct tle_ctx *ctx;
struct {
@@ -45,7 +40,6 @@ struct tle_dev {
struct tle_dring dr;
} tx;
struct tle_dev_param prm; /* copy of device parameters. */
- struct tle_dport *dp[TLE_VNUM]; /* device L4 ports */
};
struct tle_ctx {
@@ -54,18 +48,23 @@ struct tle_ctx {
struct {
rte_spinlock_t lock;
uint32_t nb_free; /* number of free streams. */
+ uint32_t nb_cur; /* number of allocated streams. */
STAILQ_HEAD(, tle_stream) free;
void *buf; /* space allocated for streams */
} streams;
- rte_spinlock_t dev_lock;
+ rte_spinlock_t bhash_lock[TLE_VNUM];
+ struct rte_hash *bhash[TLE_VNUM]; /* bind and listen hash table */
+
uint32_t nb_dev;
- struct tle_pbm use[TLE_VNUM]; /* all ports in use. */
+ rte_spinlock_t dev_lock;
+ struct tle_psm use[TLE_VNUM]; /* all ports in use. */
struct tle_dev dev[RTE_MAX_ETHPORTS];
};
struct stream_ops {
int (*init_streams)(struct tle_ctx *);
+ uint32_t (*more_streams)(struct tle_ctx *);
void (*fini_streams)(struct tle_ctx *);
void (*free_drbs)(struct tle_stream *, struct tle_drb *[], uint32_t);
};
@@ -77,6 +76,27 @@ int stream_fill_ctx(struct tle_ctx *ctx, struct tle_stream *s,
int stream_clear_ctx(struct tle_ctx *ctx, struct tle_stream *s);
+/* Copy the IPv4 address out of @in and derive its match mask:
+ * wildcard address -> all-zero mask, otherwise full mask. */
+static inline void
+fill_ipv4_am(const struct sockaddr_in *in, uint32_t *addr, uint32_t *mask)
+{
+ *addr = in->sin_addr.s_addr;
+ *mask = (*addr == INADDR_ANY) ? INADDR_ANY : INADDR_NONE;
+}
+
+/* Copy the IPv6 address out of @in and derive its match mask:
+ * unspecified (::) address -> all-zero mask, otherwise full mask. */
+static inline void
+fill_ipv6_am(const struct sockaddr_in6 *in, rte_xmm_t *addr, rte_xmm_t *mask)
+{
+ const struct in6_addr *pm;
+
+ memcpy(addr, &in->sin6_addr, sizeof(*addr));
+ if (IN6_IS_ADDR_UNSPECIFIED(addr))
+ pm = &tle_ipv6_any;
+ else
+ pm = &tle_ipv6_none;
+
+ memcpy(mask, pm, sizeof(*mask));
+}
+
#ifdef __cplusplus
}
#endif
diff --git a/lib/libtle_l4p/misc.h b/lib/libtle_l4p/misc.h
index 327296f..d39e5a1 100644
--- a/lib/libtle_l4p/misc.h
+++ b/lib/libtle_l4p/misc.h
@@ -16,12 +16,34 @@
#ifndef _MISC_H_
#define _MISC_H_
+#include <tle_stats.h>
#include <tle_dpdk_wrapper.h>
#ifdef __cplusplus
extern "C" {
#endif
+union typflg {
+ uint16_t raw;
+ struct {
+ uint8_t type; /* TLE_V4/TLE_V6 */
+ uint8_t flags; /* TCP header flags */
+ };
+};
+
+union pkt_info {
+ rte_xmm_t raw;
+ struct {
+ union typflg tf;
+ uint16_t csf; /* checksum flags */
+ union l4_ports port;
+ union {
+ union ipv4_addrs addr4;
+ const union ipv6_addrs *addr6;
+ };
+ };
+};
+
static inline int
xmm_cmp(const rte_xmm_t *da, const rte_xmm_t *sa)
{
@@ -286,43 +308,41 @@ _ipv4x_cksum(const void *iph, size_t len)
return (cksum == 0xffff) ? cksum : ~cksum;
}
-/*
- * helper function to check csum.
- */
static inline int
-check_pkt_csum(const struct rte_mbuf *m, uint64_t ol_flags, uint32_t type,
- uint32_t proto)
+check_pkt_csum(const struct rte_mbuf *m, uint32_t type, uint32_t proto)
{
const struct ipv4_hdr *l3h4;
const struct ipv6_hdr *l3h6;
const struct udp_hdr *l4h;
- uint64_t fl3, fl4;
- uint16_t csum;
int32_t ret;
-
- fl4 = ol_flags & PKT_RX_L4_CKSUM_MASK;
- fl3 = (type == TLE_V4) ?
- (ol_flags & PKT_RX_IP_CKSUM_MASK) : PKT_RX_IP_CKSUM_GOOD;
+ uint16_t csum;
+ uint64_t ol_flags = m->ol_flags;
/* case 0: both ip and l4 cksum is verified or data is valid */
- if ((fl3 | fl4) == (PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD))
+ if ((ol_flags & PKT_RX_IP_CKSUM_GOOD) &&
+ (ol_flags & PKT_RX_L4_CKSUM_GOOD))
return 0;
/* case 1: either ip or l4 cksum bad */
- if (fl3 == PKT_RX_IP_CKSUM_BAD || fl4 == PKT_RX_L4_CKSUM_BAD)
+ if ((ol_flags & PKT_RX_IP_CKSUM_MASK) == PKT_RX_IP_CKSUM_BAD)
+ return 1;
+
+ if ((ol_flags & PKT_RX_L4_CKSUM_MASK) == PKT_RX_L4_CKSUM_BAD)
return 1;
/* case 2: either ip or l4 or both cksum is unknown */
+ ret = 0;
l3h4 = rte_pktmbuf_mtod_offset(m, const struct ipv4_hdr *, m->l2_len);
l3h6 = rte_pktmbuf_mtod_offset(m, const struct ipv6_hdr *, m->l2_len);
- ret = 0;
- if (fl3 == PKT_RX_IP_CKSUM_UNKNOWN && l3h4->hdr_checksum != 0) {
+ if ((ol_flags & PKT_RX_IP_CKSUM_MASK) == PKT_RX_IP_CKSUM_UNKNOWN &&
+ l3h4->hdr_checksum != 0) {
csum = _ipv4x_cksum(l3h4, m->l3_len);
ret = (csum != UINT16_MAX);
}
- if (ret == 0 && fl4 == PKT_RX_L4_CKSUM_UNKNOWN) {
+ if (ret == 0 && (ol_flags & PKT_RX_L4_CKSUM_MASK) ==
+ PKT_RX_L4_CKSUM_UNKNOWN) {
/*
* for IPv4 it is allowed to have zero UDP cksum,
@@ -376,8 +396,20 @@ rwl_acquire(rte_atomic32_t *p)
static inline void
rwl_down(rte_atomic32_t *p)
{
- while (rte_atomic32_cmpset((volatile uint32_t *)p, 0, INT32_MIN) == 0)
+ while (rte_atomic32_cmpset((volatile uint32_t *)p, 0, INT32_MIN) == 0)
+ rte_pause();
+}
+
+/* Try to take the write side of the rw-latch by CAS-ing 0 -> INT32_MIN.
+ * Spins while readers hold it; returns -1 without waiting further if
+ * another writer already owns the latch, 0 on success. */
+static inline int
+rwl_try_down(rte_atomic32_t *p)
+{
+ while (rte_atomic32_cmpset((volatile uint32_t *)p, 0, INT32_MIN) == 0) {
+ /* Already down */
+ if (rte_atomic32_read(p) == INT32_MIN)
+ return -1;
+ rte_pause();
+ }
+ return 0;
+}
static inline void
diff --git a/lib/libtle_l4p/net_misc.h b/lib/libtle_l4p/net_misc.h
index 2d8dac2..c1d946b 100644
--- a/lib/libtle_l4p/net_misc.h
+++ b/lib/libtle_l4p/net_misc.h
@@ -16,6 +16,7 @@
#ifndef _NET_MISC_H_
#define _NET_MISC_H_
+#include <stdbool.h>
#include <rte_ip.h>
#include <rte_udp.h>
#include "osdep.h"
@@ -71,6 +72,26 @@ union ip_addrs {
union ipv6_addrs v6;
};
+/* Return true when @addr holds the wildcard ("any") address of its
+ * family; unknown families count as non-empty. */
+static inline bool
+is_empty_addr(const struct sockaddr *addr)
+{
+ const struct sockaddr_in *in4;
+ const struct sockaddr_in6 *in6;
+
+ switch (addr->sa_family) {
+ case AF_INET:
+ in4 = (const struct sockaddr_in *)addr;
+ return in4->sin_addr.s_addr == INADDR_ANY;
+ case AF_INET6:
+ in6 = (const struct sockaddr_in6 *)addr;
+ return IN6_IS_ADDR_UNSPECIFIED(&in6->sin6_addr) != 0;
+ default:
+ return false;
+ }
+}
+
#ifdef __cplusplus
}
#endif
diff --git a/lib/libtle_l4p/port_statmap.h b/lib/libtle_l4p/port_statmap.h
new file mode 100644
index 0000000..8bbb0ba
--- /dev/null
+++ b/lib/libtle_l4p/port_statmap.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2019 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _PORT_STATMAP_H_
+#define _PORT_STATMAP_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_PORT_NUM (UINT16_MAX + 1)
+#define ALLOC_PORT_START 0x8000
+
+struct tle_psm {
+ uint32_t nb_used; /* Number of ports already in use. */
+ uint32_t next_alloc; /* Next port to try allocate. */
+ uint8_t stat[MAX_PORT_NUM]; /* Status of the port:
+ * 1) the most significant bit indicates
+ * if SO_REUSEPORT is allowed;
+ * 2) lowest 7 bits indicate # of streams
+ * using the port.
+ */
+};
+
+static inline void
+tle_psm_init(struct tle_psm *psm)
+{
+ memset(psm, 0, sizeof(struct tle_psm));
+ psm->next_alloc = ALLOC_PORT_START;
+}
+
+static inline int
+tle_psm_set(struct tle_psm *psm, uint16_t port, uint8_t reuseport)
+{
+ if (psm->stat[port] == 0) {
+ /* port has not been used */
+ psm->stat[port]++;
+ if (reuseport)
+ psm->stat[port] |= 0x80;
+ } else {
+ /* port is used by some socket */
+ if (reuseport && (psm->stat[port] & 0x80)) {
+ /* all sockets set reuseport */
+ psm->stat[port]++;
+ } else
+ return -1;
+ }
+
+ return 0;
+}
+
+static inline void
+tle_psm_clear(struct tle_psm *psm, uint16_t port)
+{
+ psm->stat[port]--;
+ if ((psm->stat[port] & 0x7f) == 0)
+ psm->stat[port] = 0;
+}
+
+
+static inline uint8_t
+tle_psm_check(const struct tle_psm *psm, uint16_t port)
+{
+ return psm->stat[port];
+}
+
+static inline uint16_t
+tle_psm_alloc_port(struct tle_psm *psm)
+{
+ uint32_t i = psm->next_alloc;
+
+ for (; i < MAX_PORT_NUM; i++) {
+ if (psm->stat[i] == 0) {
+ psm->next_alloc = i + 1;
+ return (uint16_t)i;
+ }
+ }
+
+ for (i = ALLOC_PORT_START; i < psm->next_alloc; i++) {
+ if (psm->stat[i] == 0) {
+ psm->next_alloc = i + 1;
+ return (uint16_t)i;
+ }
+ }
+
+ return 0;
+}
+
+static inline uint16_t
+tle_psm_alloc_dual_port(struct tle_psm *psm4, struct tle_psm *psm6)
+{
+ uint32_t i = psm6->next_alloc;
+
+ for (; i < MAX_PORT_NUM; i++) {
+ if (psm6->stat[i] == 0 && psm4->stat[i] == 0) {
+ psm6->next_alloc = i + 1;
+ return (uint16_t)i;
+ }
+ }
+
+ for (i = ALLOC_PORT_START; i < psm6->next_alloc; i++) {
+ if (psm6->stat[i] == 0 && psm4->stat[i] == 0) {
+ psm6->next_alloc = i + 1;
+ return (uint16_t)i;
+ }
+ }
+
+ return 0;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _PORT_STATMAP_H_ */
diff --git a/lib/libtle_l4p/stream.h b/lib/libtle_l4p/stream.h
index 49a2809..9f2bbc1 100644
--- a/lib/libtle_l4p/stream.h
+++ b/lib/libtle_l4p/stream.h
@@ -31,7 +31,11 @@ struct tle_stream {
STAILQ_ENTRY(tle_stream) link;
struct tle_ctx *ctx;
- uint8_t type; /* TLE_V4 or TLE_V6 */
+ tle_stream_options_t option;
+ unsigned long timestamp;
+ uint16_t reuseport_seed;
+ uint8_t type; /* TLE_V4 or TLE_V6 */
+ uint8_t padding;
/* Stream address information. */
union l4_ports port;
@@ -53,15 +57,25 @@ static inline uint32_t
get_streams(struct tle_ctx *ctx, struct tle_stream *s[], uint32_t num)
{
struct tle_stream *p;
- uint32_t i, n;
+ uint32_t i, n, inc;
rte_spinlock_lock(&ctx->streams.lock);
- n = RTE_MIN(ctx->streams.nb_free, num);
- for (i = 0, p = STAILQ_FIRST(&ctx->streams.free);
- i != n;
- i++, p = STAILQ_NEXT(p, link))
+ n = ctx->streams.nb_free;
+ if (n < num) {
+ inc = tle_stream_ops[ctx->prm.proto].more_streams(ctx);
+ ctx->streams.nb_free += inc;
+ ctx->streams.nb_cur += inc;
+ n = ctx->streams.nb_free;
+ }
+ n = RTE_MIN(n, num);
+
+ for (i = 0, p = STAILQ_FIRST(&ctx->streams.free); i != n; ) {
s[i] = p;
+ p = STAILQ_NEXT(p, link);
+ s[i]->link.stqe_next = NULL;
+ i++;
+ }
if (p == NULL)
/* we retrieved all free entries */
@@ -80,9 +94,6 @@ get_stream(struct tle_ctx *ctx)
struct tle_stream *s;
s = NULL;
- if (ctx->streams.nb_free == 0)
- return s;
-
get_streams(ctx, &s, 1);
return s;
}
@@ -120,8 +131,8 @@ drb_nb_elem(const struct tle_ctx *ctx)
}
static inline int32_t
-stream_get_dest(struct tle_stream *s, const void *dst_addr,
- struct tle_dest *dst)
+stream_get_dest(uint8_t type, struct tle_stream *s, const void *src_addr,
+ const void *dst_addr, struct tle_dest *dst)
{
int32_t rc;
const struct in_addr *d4;
@@ -133,12 +144,13 @@ stream_get_dest(struct tle_stream *s, const void *dst_addr,
/* it is here just to keep gcc happy. */
d4 = NULL;
+ /* it is here just to keep gcc happy. */
d6 = NULL;
- if (s->type == TLE_V4) {
+ if (type == TLE_V4) {
d4 = dst_addr;
rc = ctx->prm.lookup4(ctx->prm.lookup4_data, d4, dst);
- } else if (s->type == TLE_V6) {
+ } else if (type == TLE_V6) {
d6 = dst_addr;
rc = ctx->prm.lookup6(ctx->prm.lookup6_data, d6, dst);
} else
@@ -148,18 +160,25 @@ stream_get_dest(struct tle_stream *s, const void *dst_addr,
return -ENOENT;
dev = dst->dev;
- dst->ol_flags = dev->tx.ol_flags[s->type];
+ dst->ol_flags = dev->tx.ol_flags[type];
- if (s->type == TLE_V4) {
+ if (type == TLE_V4) {
struct ipv4_hdr *l3h;
l3h = (struct ipv4_hdr *)(dst->hdr + dst->l2_len);
- l3h->src_addr = dev->prm.local_addr4.s_addr;
+ if (((const struct in_addr*)src_addr)->s_addr != INADDR_ANY)
+ l3h->src_addr = ((const struct in_addr*)src_addr)->s_addr;
+ else
+ l3h->src_addr = dev->prm.local_addr4.s_addr;
l3h->dst_addr = d4->s_addr;
} else {
struct ipv6_hdr *l3h;
l3h = (struct ipv6_hdr *)(dst->hdr + dst->l2_len);
- rte_memcpy(l3h->src_addr, &dev->prm.local_addr6,
- sizeof(l3h->src_addr));
+ if (!IN6_IS_ADDR_UNSPECIFIED(src_addr))
+ rte_memcpy(l3h->src_addr, src_addr,
+ sizeof(l3h->src_addr));
+ else
+ rte_memcpy(l3h->src_addr, &dev->prm.local_addr6,
+ sizeof(l3h->src_addr));
rte_memcpy(l3h->dst_addr, d6, sizeof(l3h->dst_addr));
}
diff --git a/lib/libtle_l4p/stream_table.c b/lib/libtle_l4p/stream_table.c
index 5a89553..e029306 100644
--- a/lib/libtle_l4p/stream_table.c
+++ b/lib/libtle_l4p/stream_table.c
@@ -13,68 +13,47 @@
* limitations under the License.
*/
#include <string.h>
-#include <rte_malloc.h>
#include <rte_errno.h>
#include "stream_table.h"
void
-stbl_fini(struct stbl *st)
+bhash_fini(struct tle_ctx *ctx)
{
uint32_t i;
- for (i = 0; i != RTE_DIM(st->ht); i++) {
- rte_hash_free(st->ht[i].t);
- rte_free(st->ht[i].ent);
- }
-
- memset(st, 0, sizeof(*st));
+ for (i = 0; i != RTE_DIM(ctx->bhash); i++)
+ rte_hash_free(ctx->bhash[i]);
}
int
-stbl_init(struct stbl *st, uint32_t num, int32_t socket)
+bhash_init(struct tle_ctx *ctx)
{
- int32_t rc;
- size_t i, sz;
- struct rte_hash_parameters hprm;
+ int rc = 0;
+ struct rte_hash_parameters hprm = {0};
+ bool ipv6 = ctx->prm.lookup6 != NULL;
char buf[RTE_HASH_NAMESIZE];
- num = RTE_MAX(5 * num / 4, 0x10U);
-
- memset(&hprm, 0, sizeof(hprm));
hprm.name = buf;
- hprm.entries = num;
- hprm.socket_id = socket;
-
- rc = 0;
-
- snprintf(buf, sizeof(buf), "stbl4@%p", st);
- hprm.key_len = sizeof(struct stbl4_key);
- st->ht[TLE_V4].t = rte_hash_create(&hprm);
- if (st->ht[TLE_V4].t == NULL)
+ hprm.entries = 4096;
+ hprm.extra_flag = RTE_HASH_EXTRA_FLAGS_EXT_TABLE;
+ hprm.socket_id = ctx->prm.socket_id;
+
+ snprintf(buf, sizeof(buf), "bhash4@%p", ctx);
+ hprm.key_len = sizeof(struct bhash4_key);
+ ctx->bhash[TLE_V4] = rte_hash_create(&hprm);
+ if (ctx->bhash[TLE_V4] == NULL)
rc = (rte_errno != 0) ? -rte_errno : -ENOMEM;
- if (rc == 0) {
- snprintf(buf, sizeof(buf), "stbl6@%p", st);
- hprm.key_len = sizeof(struct stbl6_key);
- st->ht[TLE_V6].t = rte_hash_create(&hprm);
- if (st->ht[TLE_V6].t == NULL)
+ if (rc == 0 && ipv6) {
+ snprintf(buf, sizeof(buf), "bhash6@%p", ctx);
+ hprm.key_len = sizeof(struct bhash6_key);
+ ctx->bhash[TLE_V6] = rte_hash_create(&hprm);
+ if (ctx->bhash[TLE_V6] == NULL) {
+ rte_hash_free(ctx->bhash[TLE_V4]);
rc = (rte_errno != 0) ? -rte_errno : -ENOMEM;
+ }
}
- for (i = 0; i != RTE_DIM(st->ht) && rc == 0; i++) {
-
- sz = sizeof(*st->ht[i].ent) * num;
- st->ht[i].ent = rte_zmalloc_socket(NULL, sz,
- RTE_CACHE_LINE_SIZE, socket);
- if (st->ht[i].ent == NULL)
- rc = -ENOMEM;
- else
- st->ht[i].nb_ent = num;
- }
-
- if (rc != 0)
- stbl_fini(st);
-
return rc;
}
diff --git a/lib/libtle_l4p/stream_table.h b/lib/libtle_l4p/stream_table.h
index 033c306..ba8d165 100644
--- a/lib/libtle_l4p/stream_table.h
+++ b/lib/libtle_l4p/stream_table.h
@@ -16,199 +16,415 @@
#ifndef _STREAM_TABLE_H_
#define _STREAM_TABLE_H_
+#include <string.h>
#include <rte_hash.h>
-#include "tcp_misc.h"
+#include "stream.h"
+#include "misc.h"
#ifdef __cplusplus
extern "C" {
#endif
+#define HASH_SIZE_32K 32771
+#define HASH_SIZE_64K 65537
+#define HASH_SIZE_128K 131071
+
+#define HASH_SIZE HASH_SIZE_64K
+
struct stbl_entry {
void *data;
};
-struct shtbl {
- uint32_t nb_ent; /* max number of entries in the table. */
- rte_spinlock_t l; /* lock to protect the hash table */
- struct rte_hash *t;
- struct stbl_entry *ent;
+struct stbl {
+ rte_spinlock_t l;
+ uint32_t need_lock;
+ struct stbl_entry head[HASH_SIZE];
} __rte_cache_aligned;
-struct stbl {
- struct shtbl ht[TLE_VNUM];
-};
+static inline int
+stbl_init(struct stbl *st, uint32_t lock)
+{
+ st->need_lock = lock;
+ return 0;
+}
-struct stbl4_key {
- union l4_ports port;
- union ipv4_addrs addr;
-} __attribute__((__packed__));
+static inline int
+stbl_fini(struct stbl *st)
+{
+ st->need_lock = 0;
+ return 0;
+}
-struct stbl6_key {
- union l4_ports port;
- union ipv6_addrs addr;
-} __attribute__((__packed__));
+static inline uint8_t
+compare_pkt(const struct tle_stream *s, const union pkt_info *pi)
+{
+ if (s->type != pi->tf.type)
+ return -1;
-struct stbl_key {
- union l4_ports port;
- union {
- union ipv4_addrs addr4;
- union ipv6_addrs addr6;
- };
-} __attribute__((__packed__));
+ if (s->port.raw != pi->port.raw)
+ return -1;
-extern void stbl_fini(struct stbl *st);
+ if (s->type == TLE_V4) {
+ if (s->ipv4.addr.raw != pi->addr4.raw)
+ return -1;
+ } else {
+ if (memcmp(&s->ipv6.addr, pi->addr6, sizeof(union ipv6_addrs)))
+ return -1;
+ }
-extern int stbl_init(struct stbl *st, uint32_t num, int32_t socket);
+ return 0;
+}
-static inline void
-stbl_pkt_fill_key(struct stbl_key *k, const union pkt_info *pi, uint32_t type)
+static inline uint32_t
+stbl_hash_stream(const struct tle_stream *s)
{
- static const struct stbl_key zero = {
- .port.raw = 0,
- };
-
- k->port = pi->port;
- if (type == TLE_V4)
- k->addr4 = pi->addr4;
- else if (type == TLE_V6)
- k->addr6 = *pi->addr6;
- else
- *k = zero;
+ int i;
+ unsigned int hash;
+
+ if (s->type == TLE_V4) {
+ hash = s->ipv4.addr.src ^ s->ipv4.addr.dst
+ ^ s->port.src ^ s->port.dst;
+ } else {
+ hash = s->port.src ^ s->port.dst;
+ for (i = 0; i < 4; i++) {
+ hash ^= s->ipv6.addr.src.u32[i];
+ hash ^= s->ipv6.addr.dst.u32[i];
+ }
+ }
+
+ return hash % HASH_SIZE;
}
-static inline void
-stbl_lock(struct stbl *st, uint32_t type)
+static inline uint32_t
+stbl_hash_pkt(const union pkt_info* pi)
{
- rte_spinlock_lock(&st->ht[type].l);
+ int i;
+ unsigned int hash;
+
+ if (pi->tf.type == TLE_V4) {
+ hash = pi->addr4.src ^ pi->addr4.dst ^ pi->port.src ^ pi->port.dst;
+ } else {
+ hash = pi->port.src ^ pi->port.dst;
+ for (i = 0; i < 4; i++) {
+ hash ^= pi->addr6->src.u32[i];
+ hash ^= pi->addr6->dst.u32[i];
+ }
+ }
+
+ return hash % HASH_SIZE;
}
-static inline void
-stbl_unlock(struct stbl *st, uint32_t type)
+static inline struct stbl_entry*
+stbl_add_stream(struct stbl *st, struct tle_stream *s)
{
- rte_spinlock_unlock(&st->ht[type].l);
+ struct stbl_entry* entry;
+
+ if (st->need_lock)
+ rte_spinlock_lock(&st->l);
+ entry = &st->head[stbl_hash_stream(s)];
+ s->link.stqe_next = (struct tle_stream*)entry->data;
+ entry->data = s;
+ if (st->need_lock)
+ rte_spinlock_unlock(&st->l);
+
+ return entry;
}
-static inline struct stbl_entry *
-stbl_add_entry(struct stbl *st, const union pkt_info *pi)
+static inline struct tle_stream *
+stbl_find_stream(struct stbl *st, const union pkt_info *pi)
{
- int32_t rc;
- uint32_t type;
- struct shtbl *ht;
- struct stbl_key k;
-
- type = pi->tf.type;
- stbl_pkt_fill_key(&k, pi, type);
- ht = st->ht + type;
-
- rc = rte_hash_add_key(ht->t, &k);
- if ((uint32_t)rc >= ht->nb_ent)
- return NULL;
- return ht->ent + rc;
+ struct tle_stream* head;
+
+ if (st->need_lock)
+ rte_spinlock_lock(&st->l);
+ head = (struct tle_stream*)st->head[stbl_hash_pkt(pi)].data;
+ while (head != NULL) {
+ if (compare_pkt(head, pi) == 0)
+ break;
+
+ head = head->link.stqe_next;
+ }
+ if (st->need_lock)
+ rte_spinlock_unlock(&st->l);
+ return head;
}
-static inline struct stbl_entry *
-stbl_add_stream(struct stbl *st, const union pkt_info *pi, const void *s)
+static inline void
+stbl_del_stream(struct stbl *st, struct stbl_entry *se,
+ struct tle_stream *s)
{
- struct stbl_entry *se;
+ struct tle_stream *prev, *current;
- se = stbl_add_entry(st, pi);
- if (se != NULL)
- se->data = (void *)(uintptr_t)s;
- return se;
+ if (st->need_lock)
+ rte_spinlock_lock(&st->l);
+ if (se == NULL)
+ se = &st->head[stbl_hash_stream(s)];
+ prev = NULL;
+ current = (struct tle_stream*)se->data;
+ while (current != NULL) {
+ if (current != s) {
+ prev = current;
+ current = current->link.stqe_next;
+ continue;
+ }
+
+ if (prev)
+ prev->link.stqe_next = current->link.stqe_next;
+ else
+ se->data = current->link.stqe_next;
+ break;
+ }
+ if (st->need_lock)
+ rte_spinlock_unlock(&st->l);
+
+ s->link.stqe_next = NULL;
}
-static inline struct stbl_entry *
-stbl_find_entry(struct stbl *st, const union pkt_info *pi)
+struct bhash4_key {
+ uint16_t port;
+ uint32_t addr;
+} __attribute__((__packed__));
+
+struct bhash6_key {
+ uint16_t port;
+ rte_xmm_t addr;
+} __attribute__((__packed__));
+
+struct bhash_key {
+ uint16_t port;
+ union {
+ uint32_t addr4;
+ rte_xmm_t addr6;
+ };
+} __attribute__((__packed__));
+
+void bhash_fini(struct tle_ctx *ctx);
+
+int bhash_init(struct tle_ctx *ctx);
+
+static inline int
+bhash_sockaddr2key(const struct sockaddr *addr, struct bhash_key *key)
{
- int32_t rc;
- uint32_t type;
- struct shtbl *ht;
- struct stbl_key k;
-
- type = pi->tf.type;
- stbl_pkt_fill_key(&k, pi, type);
- ht = st->ht + type;
-
- rc = rte_hash_lookup(ht->t, &k);
- if ((uint32_t)rc >= ht->nb_ent)
- return NULL;
- return ht->ent + rc;
+ int t;
+ const struct sockaddr_in *lin4;
+ const struct sockaddr_in6 *lin6;
+
+ if (addr->sa_family == AF_INET) {
+ lin4 = (const struct sockaddr_in *)addr;
+ key->port = lin4->sin_port;
+ key->addr4 = lin4->sin_addr.s_addr;
+ t = TLE_V4;
+ } else {
+ lin6 = (const struct sockaddr_in6 *)addr;
+ memcpy(&key->addr6, &lin6->sin6_addr, sizeof(key->addr6));
+ key->port = lin6->sin6_port;
+ t = TLE_V6;
+ }
+
+ return t;
}
-static inline void *
-stbl_find_data(struct stbl *st, const union pkt_info *pi)
+/* Return 0 on success;
+ * Return errno on failure.
+ */
+static inline int
+bhash_add_entry(struct tle_ctx *ctx, const struct sockaddr *addr,
+ struct tle_stream *s)
{
- struct stbl_entry *ent;
-
- ent = stbl_find_entry(st, pi);
- return (ent == NULL) ? NULL : ent->data;
+ int t;
+ int rc;
+ int is_first;
+ struct bhash_key key;
+ struct rte_hash *bhash;
+ struct tle_stream *old, *tmp;
+
+ is_first = 0;
+ t = bhash_sockaddr2key(addr, &key);
+
+ rte_spinlock_lock(&ctx->bhash_lock[t]);
+ bhash = ctx->bhash[t];
+ rc = rte_hash_lookup_data(bhash, &key, (void **)&old);
+ if (rc == -ENOENT) {
+ is_first = 1;
+ s->link.stqe_next = NULL; /* just to avoid follow */
+ rc = rte_hash_add_key_data(bhash, &key, s);
+ } else if (rc >= 0) {
+ if (t == TLE_V4 && old->type == TLE_V6) {
+			/* A V6 stream may listen on a V4 address; ensure the
+			 * V4 stream is ahead of the V6 stream in the list.
+			 */
+ s->link.stqe_next = old;
+ rte_hash_add_key_data(bhash, &key, s);
+ } else {
+ tmp = old->link.stqe_next;
+ old->link.stqe_next = s;
+ s->link.stqe_next = tmp;
+ }
+ }
+ rte_spinlock_unlock(&ctx->bhash_lock[t]);
+
+	/* An IPv6 socket with an unspecified address could receive IPv4
+	 * packets, so the stream should also be recorded in the IPv4 table.
+	 * Only the first stream needs to be inserted into the V4 list;
+	 * otherwise the V6 list already follows the V4 list.
+	 */
+ if (t == TLE_V6 && !s->option.ipv6only && is_first &&
+ IN6_IS_ADDR_UNSPECIFIED(&key.addr6)) {
+ t = TLE_V4;
+ rte_spinlock_lock(&ctx->bhash_lock[t]);
+ bhash = ctx->bhash[t];
+ rc = rte_hash_lookup_data(bhash, &key, (void **)&old);
+ if (rc == -ENOENT)
+ rc = rte_hash_add_key_data(bhash, &key, s);
+ else if (rc >= 0) {
+ while(old->link.stqe_next != NULL)
+ old = old->link.stqe_next;
+ old->link.stqe_next = s;
+ s->link.stqe_next = NULL;
+ }
+ rte_spinlock_unlock(&ctx->bhash_lock[t]);
+ }
+
+ return (rc >= 0) ? 0 : (-rc);
}
-#include "tcp_stream.h"
-
static inline void
-stbl_stream_fill_key(struct stbl_key *k, const struct tle_stream *s,
- uint32_t type)
+bhash_del_entry(struct tle_ctx *ctx, struct tle_stream *s,
+ const struct sockaddr *addr)
{
- static const struct stbl_key zero = {
- .port.raw = 0,
- };
+ int t;
+ int rc;
+ struct bhash_key key;
+ struct tle_stream *f, *cur, *pre = NULL;
+
+ t = bhash_sockaddr2key(addr, &key);
+
+ rte_spinlock_lock(&ctx->bhash_lock[t]);
+ rc = rte_hash_lookup_data(ctx->bhash[t], &key, (void **)&f);
+ if (rc >= 0) {
+ cur = f;
+ pre = NULL;
+ while (cur != s) {
+ pre = cur;
+ cur = cur->link.stqe_next;
+ }
+
+ if (pre == NULL) {
+ cur = cur->link.stqe_next;
+ if (cur == NULL)
+ rte_hash_del_key(ctx->bhash[t], &key);
+ else /* change data */
+ rte_hash_add_key_data(ctx->bhash[t], &key, cur);
+ } else
+ pre->link.stqe_next = cur->link.stqe_next;
+ }
+
+ rte_spinlock_unlock(&ctx->bhash_lock[t]);
+
+ if (rc < 0)
+ return;
+
+ s->link.stqe_next = NULL;
+
+	/* An IPv6 socket with an unspecified address could receive IPv4
+	 * packets, so the stream is also recorded in the IPv4 table. */
+ if (t == TLE_V6 && !s->option.ipv6only && pre == NULL &&
+ IN6_IS_ADDR_UNSPECIFIED(&key.addr6)) {
+ t = TLE_V4;
+ rte_spinlock_lock(&ctx->bhash_lock[t]);
+ rc = rte_hash_lookup_data(ctx->bhash[t], &key, (void **)&f);
+ if (rc >= 0) {
+ cur = f;
+ pre = NULL;
+ while (cur != s) {
+ pre = cur;
+ cur = cur->link.stqe_next;
+ }
+
+ if (pre == NULL) {
+ cur = cur->link.stqe_next;
+ if (cur == NULL)
+ rte_hash_del_key(ctx->bhash[t], &key);
+ else /* change data */
+ rte_hash_add_key_data(ctx->bhash[t], &key, cur);
+ } else
+ pre->link.stqe_next = cur->link.stqe_next;
+ }
+
+ rte_spinlock_unlock(&ctx->bhash_lock[t]);
+ }
- k->port = s->port;
- if (type == TLE_V4)
- k->addr4 = s->ipv4.addr;
- else if (type == TLE_V6)
- k->addr6 = s->ipv6.addr;
- else
- *k = zero;
}
-static inline struct stbl_entry *
-stbl_add_stream_lock(struct stbl *st, const struct tle_tcp_stream *s)
+static inline void *
+bhash_reuseport_get_stream(struct tle_stream *s)
{
- uint32_t type;
- struct stbl_key k;
- struct stbl_entry *se;
- struct shtbl *ht;
- int32_t rc;
-
- type = s->s.type;
- stbl_stream_fill_key(&k, &s->s, type);
- ht = st->ht + type;
+ int n = 0;
+ struct tle_stream *e, *all[32];
+
+ e = s;
+ while(e && n < 32) {
+ all[n++] = e;
+ e = e->link.stqe_next;
+ }
+
+	/* For each connection, this function is called twice:
+	 * 1st time on the first step of the handshake (SYN),
+	 * 2nd time on the third step of the handshake (ACK).
+	 */
+ return all[(s->reuseport_seed++) % n];
+}
- stbl_lock(st, type);
- rc = rte_hash_add_key(ht->t, &k);
- stbl_unlock(st, type);
+static inline void *
+bhash_lookup4(struct rte_hash *t, uint32_t addr, uint16_t port, uint8_t reuse)
+{
+ int rc;
+ void *s = NULL;
+ struct bhash_key key = {
+ .port = port,
+ .addr4 = addr,
+ };
- if ((uint32_t)rc >= ht->nb_ent)
- return NULL;
+ rc = rte_hash_lookup_data(t, &key, &s);
+ if (rc == -ENOENT) {
+ key.addr4 = INADDR_ANY;
+ rc = rte_hash_lookup_data(t, &key, &s);
+ }
- se = ht->ent + rc;
- if (se != NULL)
- se->data = (void *)(uintptr_t)s;
+ if (rc >= 0) {
+ if (reuse)
+ return bhash_reuseport_get_stream(s);
+ else
+ return s;
+ }
- return se;
+ return NULL;
}
-static inline void
-stbl_del_stream(struct stbl *st, struct stbl_entry *se,
- const struct tle_tcp_stream *s, uint32_t lock)
+static inline void *
+bhash_lookup6(struct rte_hash *t, rte_xmm_t addr, uint16_t port, uint8_t reuse)
{
- uint32_t type;
- struct stbl_key k;
+ int rc;
+ void *s = NULL;
+ struct bhash_key key = {
+ .port = port,
+ .addr6 = addr,
+ };
- if (se == NULL)
- return;
+ rc = rte_hash_lookup_data(t, &key, &s);
+ if (rc == -ENOENT) {
+ memcpy(&key.addr6, &tle_ipv6_any, sizeof(key.addr6));
+ rc = rte_hash_lookup_data(t, &key, &s);
+ }
- se->data = NULL;
+ if (rc >= 0) {
+ if (reuse)
+ return bhash_reuseport_get_stream(s);
+ else
+ return s;
+ }
- type = s->s.type;
- stbl_stream_fill_key(&k, &s->s, type);
- if (lock != 0)
- stbl_lock(st, type);
- rte_hash_del_key(st->ht[type].t, &k);
- if (lock != 0)
- stbl_unlock(st, type);
+ return NULL;
}
#ifdef __cplusplus
diff --git a/lib/libtle_l4p/syncookie.h b/lib/libtle_l4p/syncookie.h
index 61bfce4..bf01e78 100644
--- a/lib/libtle_l4p/syncookie.h
+++ b/lib/libtle_l4p/syncookie.h
@@ -182,9 +182,12 @@ sync_fill_tcb(struct tcb *tcb, const union seg_info *si, const union tsopt *to)
{
uint32_t ack, mss, seq, wscale;
+ tcb->err = 0;
+
seq = si->seq;
tcb->rcv.nxt = seq;
+ tcb->rcv.cpy = seq;
tcb->rcv.irs = seq - 1;
tcb->snd.wu.wl1 = seq;
@@ -202,6 +205,7 @@ sync_fill_tcb(struct tcb *tcb, const union seg_info *si, const union tsopt *to)
tcb->so.mss = mss;
tcb->snd.ts = to->ecr;
+ tcb->snd.cork_ts = 0;
tcb->rcv.ts = to->val;
tcb->so.ts.raw = to->raw;
diff --git a/lib/libtle_l4p/tcp_ctl.h b/lib/libtle_l4p/tcp_ctl.h
index bec1e76..3196470 100644
--- a/lib/libtle_l4p/tcp_ctl.h
+++ b/lib/libtle_l4p/tcp_ctl.h
@@ -22,6 +22,7 @@
#include "tcp_stream.h"
#include "tcp_ofo.h"
+#include "tcp_timer.h"
#ifdef __cplusplus
extern "C" {
@@ -97,10 +98,10 @@ calc_rx_wnd(const struct tle_tcp_stream *s, uint32_t scale)
/* peer doesn't support WSCALE option, wnd size is limited to 64K */
if (scale == TCP_WSCALE_NONE) {
- wnd = _rte_ring_get_mask(s->rx.q) << TCP_WSCALE_DEFAULT;
+ wnd = rte_ring_free_count(s->rx.q) << TCP_WSCALE_DEFAULT;
return RTE_MIN(wnd, (uint32_t)UINT16_MAX);
} else
- return _rte_ring_get_mask(s->rx.q) << scale;
+ return rte_ring_free_count(s->rx.q) << scale;
}
/* empty stream's send queue */
@@ -144,31 +145,34 @@ static inline void
tcp_stream_reset(struct tle_ctx *ctx, struct tle_tcp_stream *s)
{
struct stbl *st;
- uint16_t uop;
+ uint16_t state;
+ uint8_t i;
st = CTX_TCP_STLB(ctx);
- /* reset TX armed */
- rte_atomic32_set(&s->tx.arm, 0);
+ for (i = 0; i < TIMER_NUM; i++)
+ timer_stop(s, i);
/* reset TCB */
- uop = s->tcb.uop & ~TCP_OP_CLOSE;
+ state = s->tcb.state;
memset(&s->tcb, 0, sizeof(s->tcb));
/* reset cached destination */
memset(&s->tx.dst, 0, sizeof(s->tx.dst));
- if (uop != TCP_OP_ACCEPT) {
+	/* state could be ESTABLISHED, CLOSED or LISTEN;
+	 * a stream in CLOSED state has already been cleared by stream_term;
+	 * a stream in ESTABLISHED state is an accepted stream and needs no
+	 * clearing. */
+ if (state == TCP_ST_LISTEN) {
/* free stream's destination port */
stream_clear_ctx(ctx, &s->s);
- if (uop == TCP_OP_LISTEN)
- empty_lq(s);
+ empty_lq(s);
}
if (s->ste != NULL) {
/* remove entry from RX streams table */
- stbl_del_stream(st, s->ste, s,
- (s->flags & TLE_CTX_FLAG_ST) == 0);
+ stbl_del_stream(st, s->ste, &s->s);
s->ste = NULL;
empty_rq(s);
}
@@ -184,6 +188,48 @@ tcp_stream_reset(struct tle_ctx *ctx, struct tle_tcp_stream *s)
put_stream(ctx, &s->s, TCP_STREAM_TX_FINISHED(s));
}
+static inline void
+stream_term(struct tle_tcp_stream *s)
+{
+ struct sdr *dr;
+
+ /* 1) recv a RST packet; 2) keepalive timeout */
+ if (s->tcb.state == TCP_ST_ESTABLISHED) {
+ TCP_DEC_STATS_ATOMIC(TCP_MIB_CURRESTAB);
+ TCP_INC_STATS(TCP_MIB_ESTABRESETS);
+ }
+
+ s->tcb.state = TCP_ST_CLOSED;
+ rte_smp_wmb();
+
+ /* close() was already invoked, schedule final cleanup */
+ if ((s->tcb.uop & TCP_OP_CLOSE) != 0) {
+ if ((s->tcb.uop & TCP_OP_ACCEPT) == 0) {
+ /* free stream's destination port */
+ stream_clear_ctx(s->s.ctx, &s->s);
+ if ((s->tcb.uop & TCP_OP_LISTEN) != 0)
+ empty_lq(s);
+ }
+
+ if (s->ste != NULL) {
+ /* remove entry from RX streams table */
+ stbl_del_stream(CTX_TCP_STLB(s->s.ctx), s->ste, &s->s);
+ s->ste = NULL;
+ empty_rq(s);
+ }
+
+ dr = CTX_TCP_SDR(s->s.ctx);
+ rte_spinlock_lock(&dr->lock);
+ STAILQ_INSERT_TAIL(&dr->be, &s->s, link);
+ rte_spinlock_unlock(&dr->lock);
+
+ /* notify user that stream need to be closed */
+ } else if (s->err.ev != NULL)
+ tle_event_raise(s->err.ev);
+ else if (s->err.cb.func != NULL)
+ s->err.cb.func(s->err.cb.data, &s->s);
+}
+
#ifdef __cplusplus
}
#endif
diff --git a/lib/libtle_l4p/tcp_misc.h b/lib/libtle_l4p/tcp_misc.h
index 0cef8b2..1f7974e 100644
--- a/lib/libtle_l4p/tcp_misc.h
+++ b/lib/libtle_l4p/tcp_misc.h
@@ -30,7 +30,7 @@ extern "C" {
* of protocol related data.
*/
-#define TCP_WSCALE_DEFAULT 7
+#define TCP_WSCALE_DEFAULT 10
#define TCP_WSCALE_NONE 0
#define TCP_TX_HDR_MAX (sizeof(struct tcp_hdr) + TCP_TX_OPT_LEN_MAX)
@@ -71,27 +71,6 @@ extern "C" {
/* TCP flags mask. */
#define TCP_FLAG_MASK UINT8_MAX
-union typflg {
- uint16_t raw;
- struct {
- uint8_t type; /* TLE_V4/TLE_V6 */
- uint8_t flags; /* TCP header flags */
- };
-};
-
-union pkt_info {
- rte_xmm_t raw;
- struct {
- union typflg tf;
- uint16_t csf; /* checksum flags */
- union l4_ports port;
- union {
- union ipv4_addrs addr4;
- const union ipv6_addrs *addr6;
- };
- };
-};
-
union seg_info {
rte_xmm_t raw;
struct {
@@ -226,7 +205,7 @@ struct dack_info {
};
/* get current timestamp in ms */
-static inline uint32_t
+static inline uint64_t
tcp_get_tms(uint32_t mshift)
{
uint64_t ts;
@@ -344,7 +323,9 @@ fill_syn_opts(void *p, const struct syn_opts *so)
opt = (struct tcpopt *)to;
}
- to[0] = TCP_OPT_KIND_EOL;
+ to[0] = TCP_OPT_KIND_NOP;
+ to[1] = TCP_OPT_KIND_NOP;
+ to[2] = TCP_OPT_KIND_NOP;
}
/*
@@ -390,6 +371,8 @@ get_tms_opts(uintptr_t p, uint32_t len)
else if (kind == TCP_OPT_KIND_NOP)
i += sizeof(to->kl.kind);
else {
+ if (to->kl.len == 0)
+ break;
i += to->kl.len;
if (i <= len && to->kl.raw == TCP_OPT_KL_TMS) {
ts.val = rte_be_to_cpu_32(to->ts.val);
@@ -449,7 +432,6 @@ get_pkt_info(const struct rte_mbuf *m, union pkt_info *pi, union seg_info *si)
((uintptr_t)tcph + offsetof(struct tcp_hdr, src_port));
pi->tf.flags = tcph->tcp_flags;
pi->tf.type = type;
- pi->csf = m->ol_flags & (PKT_RX_IP_CKSUM_MASK | PKT_RX_L4_CKSUM_MASK);
pi->port.raw = prt->raw;
get_seg_info(tcph, si);
@@ -462,7 +444,7 @@ tcp_mbuf_seq_free(struct rte_mbuf *mb[], uint32_t num)
len = 0;
for (i = 0; i != num; i++) {
- len += mb[i]->pkt_len;
+ len += PKT_L4_PLEN(mb[i]);
rte_pktmbuf_free(mb[i]);
}
diff --git a/lib/libtle_l4p/tcp_ofo.c b/lib/libtle_l4p/tcp_ofo.c
index 1565445..b31f2b5 100644
--- a/lib/libtle_l4p/tcp_ofo.c
+++ b/lib/libtle_l4p/tcp_ofo.c
@@ -12,7 +12,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-#include <rte_malloc.h>
#include <rte_errno.h>
#include "tcp_stream.h"
@@ -28,12 +27,6 @@
#define OFO_OBJ_MAX (OFODB_OBJ_MAX * OFO_DB_MAX)
void
-tcp_ofo_free(struct ofo *ofo)
-{
- rte_free(ofo);
-}
-
-static void
calc_ofo_elems(uint32_t nbufs, uint32_t *nobj, uint32_t *ndb)
{
uint32_t n, nd, no;
@@ -51,35 +44,3 @@ calc_ofo_elems(uint32_t nbufs, uint32_t *nobj, uint32_t *ndb)
*nobj = no;
*ndb = nd;
}
-
-struct ofo *
-tcp_ofo_alloc(uint32_t nbufs, int32_t socket)
-{
- uint32_t i, ndb, nobj;
- size_t dsz, osz, sz;
- struct ofo *ofo;
- struct rte_mbuf **obj;
-
- calc_ofo_elems(nbufs, &nobj, &ndb);
- osz = sizeof(*ofo) + sizeof(ofo->db[0]) * ndb;
- dsz = sizeof(ofo->db[0].obj[0]) * nobj * ndb;
- sz = osz + dsz;
-
- ofo = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE, socket);
- if (ofo == NULL) {
- TCP_LOG(ERR, "%s: allocation of %zu bytes on socket %d "
- "failed with error code: %d\n",
- __func__, sz, socket, rte_errno);
- return NULL;
- }
-
- obj = (struct rte_mbuf **)&ofo->db[ndb];
- for (i = 0; i != ndb; i++) {
- ofo->db[i].nb_max = nobj;
- ofo->db[i].obj = obj + i * nobj;
- }
-
- ofo->nb_max = ndb;
- return ofo;
-}
-
diff --git a/lib/libtle_l4p/tcp_ofo.h b/lib/libtle_l4p/tcp_ofo.h
index 9d88266..0857f17 100644
--- a/lib/libtle_l4p/tcp_ofo.h
+++ b/lib/libtle_l4p/tcp_ofo.h
@@ -20,8 +20,6 @@
extern "C" {
#endif
-#include <stdbool.h>
-
struct ofodb {
uint32_t nb_elem;
uint32_t nb_max;
@@ -103,7 +101,7 @@ _ofo_insert_mbuf(struct ofo* ofo, uint32_t pos, union seqlen* sl,
db->obj[k + i] = mb[i];
}
if (tcp_seq_lt(end, seq))
- rte_pktmbuf_trim(mb[i - 1], seq - end);
+ _rte_pktmbuf_trim(mb[i - 1], seq - end);
db->nb_elem += i;
db->sl.len += tcp_seq_min(seq, end) - sl->seq;
@@ -157,7 +155,7 @@ _ofo_insert_right(struct ofo *ofo, uint32_t pos, union seqlen *sl,
plen = mb[i]->pkt_len;
if (n < plen) {
/* adjust partially overlapped packet. */
- rte_pktmbuf_adj(mb[i], n);
+ mb[i] = _rte_pktmbuf_adj(mb[i], n);
break;
}
}
@@ -258,7 +256,7 @@ static inline uint32_t
_ofodb_enqueue(struct rte_ring *r, const struct ofodb *db, uint32_t *seq)
{
uint32_t i, n, num, begin, end;
- struct rte_mbuf *pkt;
+ struct rte_mbuf* pkt;
n = 0;
num = db->nb_elem;
@@ -289,11 +287,7 @@ _ofodb_enqueue(struct rte_ring *r, const struct ofodb *db, uint32_t *seq)
return num - n;
}
-struct ofo *
-tcp_ofo_alloc(uint32_t nbufs, int32_t socket);
-
-void
-tcp_ofo_free(struct ofo *ofo);
+void calc_ofo_elems(uint32_t nbufs, uint32_t *nobj, uint32_t *ndb);
#ifdef __cplusplus
}
diff --git a/lib/libtle_l4p/tcp_rxq.h b/lib/libtle_l4p/tcp_rxq.h
index 2351ee6..be092f9 100644
--- a/lib/libtle_l4p/tcp_rxq.h
+++ b/lib/libtle_l4p/tcp_rxq.h
@@ -17,6 +17,7 @@
#define _TCP_RXQ_H_
#include "tcp_ofo.h"
+#include "tcp_ctl.h"
#ifdef __cplusplus
extern "C" {
@@ -74,6 +75,7 @@ rx_ofo_reduce(struct tle_tcp_stream *s)
s->tcb.rcv.nxt = seq;
_ofo_remove(ofo, 0, i);
+
return n;
}
@@ -133,6 +135,8 @@ rx_data_enqueue(struct tle_tcp_stream *s, uint32_t seq, uint32_t len,
}
n = rte_ring_count(s->rx.q);
+ /* update receive window with left recv buffer*/
+ s->tcb.rcv.wnd = calc_rx_wnd(s, s->tcb.rcv.wscale);
if (r != n) {
/* raise RX event */
if (s->rx.ev != NULL)
diff --git a/lib/libtle_l4p/tcp_rxtx.c b/lib/libtle_l4p/tcp_rxtx.c
index a519645..5d7e0d1 100644
--- a/lib/libtle_l4p/tcp_rxtx.c
+++ b/lib/libtle_l4p/tcp_rxtx.c
@@ -28,8 +28,30 @@
#include "tcp_rxq.h"
#include "tcp_txq.h"
#include "tcp_tx_seg.h"
+#include "tcp_rxtx.h"
-#define TCP_MAX_PKT_SEG 0x20
+/* Uncomment below line to debug cwnd */
+// #define DEBUG_CWND
+
+#ifdef DEBUG_CWND
+#define CWND_INFO(msg, value) printf("CWND: %s: %d\n", msg, value)
+#else
+#define CWND_INFO(msg, value) do {} while (0)
+#endif
+
+#define TCP_MAX_PKT_SEG 0x20
+#define DELAY_ACK_CHECK_INTERVAL 100
+
+/* must larger than l2_len(14)+l3_len(20)+l4_len(20)+tms_option(12) */
+#define RESERVE_HEADER_LEN 128
+
+/* If we encounter exhaustion of recv win, we set this thresh to
+ * update recv win to the remote. It's not set to 1 or some smaller
+ * value to avoid too-frequent update.
+ */
+#define RECV_WIN_NOTIFY_THRESH 64
+
+static inline int stream_fill_dest(struct tle_tcp_stream *s);
/*
* checks if input TCP ports and IP addresses match given stream.
@@ -54,11 +76,17 @@ rx_check_stream(const struct tle_tcp_stream *s, const union pkt_info *pi)
static inline struct tle_tcp_stream *
rx_obtain_listen_stream(const struct tle_dev *dev, const union pkt_info *pi,
- uint32_t type)
+ uint32_t type, uint8_t reuse)
{
struct tle_tcp_stream *s;
- s = (struct tle_tcp_stream *)dev->dp[type]->streams[pi->port.dst];
+ if (type == TLE_V4)
+ s = bhash_lookup4(dev->ctx->bhash[type],
+ pi->addr4.dst, pi->port.dst, reuse);
+ else
+ s = bhash_lookup6(dev->ctx->bhash[type],
+ pi->addr6->dst, pi->port.dst, reuse);
+
if (s == NULL || tcp_stream_acquire(s) < 0)
return NULL;
@@ -77,10 +105,10 @@ rx_obtain_stream(const struct tle_dev *dev, struct stbl *st,
{
struct tle_tcp_stream *s;
- s = stbl_find_data(st, pi);
+ s = TCP_STREAM(stbl_find_stream(st, pi));
if (s == NULL) {
- if (pi->tf.flags == TCP_FLAG_ACK)
- return rx_obtain_listen_stream(dev, pi, type);
+ if (pi->tf.flags & TCP_FLAG_ACK)
+ return rx_obtain_listen_stream(dev, pi, type, 1);
return NULL;
}
@@ -150,131 +178,6 @@ pkt_info_bulk_syneq(const union pkt_info pi[], uint32_t num)
return i;
}
-static inline void
-stream_drb_free(struct tle_tcp_stream *s, struct tle_drb *drbs[],
- uint32_t nb_drb)
-{
- _rte_ring_enqueue_burst(s->tx.drb.r, (void **)drbs, nb_drb);
-}
-
-static inline uint32_t
-stream_drb_alloc(struct tle_tcp_stream *s, struct tle_drb *drbs[],
- uint32_t nb_drb)
-{
- return _rte_ring_dequeue_burst(s->tx.drb.r, (void **)drbs, nb_drb);
-}
-
-static inline uint32_t
-get_ip_pid(struct tle_dev *dev, uint32_t num, uint32_t type, uint32_t st)
-{
- uint32_t pid;
- rte_atomic32_t *pa;
-
- pa = &dev->tx.packet_id[type];
-
- if (st == 0) {
- pid = rte_atomic32_add_return(pa, num);
- return pid - num;
- } else {
- pid = rte_atomic32_read(pa);
- rte_atomic32_set(pa, pid + num);
- return pid;
- }
-}
-
-static inline void
-fill_tcph(struct tcp_hdr *l4h, const struct tcb *tcb, union l4_ports port,
- uint32_t seq, uint8_t hlen, uint8_t flags)
-{
- uint16_t wnd;
-
- l4h->src_port = port.dst;
- l4h->dst_port = port.src;
-
- wnd = (flags & TCP_FLAG_SYN) ?
- RTE_MIN(tcb->rcv.wnd, (uint32_t)UINT16_MAX) :
- tcb->rcv.wnd >> tcb->rcv.wscale;
-
- /* ??? use sse shuffle to hton all remaining 16 bytes at once. ??? */
- l4h->sent_seq = rte_cpu_to_be_32(seq);
- l4h->recv_ack = rte_cpu_to_be_32(tcb->rcv.nxt);
- l4h->data_off = hlen / TCP_DATA_ALIGN << TCP_DATA_OFFSET;
- l4h->tcp_flags = flags;
- l4h->rx_win = rte_cpu_to_be_16(wnd);
- l4h->cksum = 0;
- l4h->tcp_urp = 0;
-
- if (flags & TCP_FLAG_SYN)
- fill_syn_opts(l4h + 1, &tcb->so);
- else if ((flags & TCP_FLAG_RST) == 0 && tcb->so.ts.raw != 0)
- fill_tms_opts(l4h + 1, tcb->snd.ts, tcb->rcv.ts);
-}
-
-static inline int
-tcp_fill_mbuf(struct rte_mbuf *m, const struct tle_tcp_stream *s,
- const struct tle_dest *dst, uint64_t ol_flags,
- union l4_ports port, uint32_t seq, uint32_t flags,
- uint32_t pid, uint32_t swcsm)
-{
- uint32_t l4, len, plen;
- struct tcp_hdr *l4h;
- char *l2h;
-
- len = dst->l2_len + dst->l3_len;
- plen = m->pkt_len;
-
- if (flags & TCP_FLAG_SYN)
- l4 = sizeof(*l4h) + TCP_TX_OPT_LEN_MAX;
- else if ((flags & TCP_FLAG_RST) == 0 && s->tcb.rcv.ts != 0)
- l4 = sizeof(*l4h) + TCP_TX_OPT_LEN_TMS;
- else
- l4 = sizeof(*l4h);
-
- /* adjust mbuf to put L2/L3/L4 headers into it. */
- l2h = rte_pktmbuf_prepend(m, len + l4);
- if (l2h == NULL)
- return -EINVAL;
-
- /* copy L2/L3 header */
- rte_memcpy(l2h, dst->hdr, len);
-
- /* setup TCP header & options */
- l4h = (struct tcp_hdr *)(l2h + len);
- fill_tcph(l4h, &s->tcb, port, seq, l4, flags);
-
- /* setup mbuf TX offload related fields. */
- m->tx_offload = _mbuf_tx_offload(dst->l2_len, dst->l3_len, l4, 0, 0, 0);
- m->ol_flags |= ol_flags;
-
- /* update proto specific fields. */
-
- if (s->s.type == TLE_V4) {
- struct ipv4_hdr *l3h;
- l3h = (struct ipv4_hdr *)(l2h + dst->l2_len);
- l3h->packet_id = rte_cpu_to_be_16(pid);
- l3h->total_length = rte_cpu_to_be_16(plen + dst->l3_len + l4);
-
- if ((ol_flags & PKT_TX_TCP_CKSUM) != 0)
- l4h->cksum = _ipv4x_phdr_cksum(l3h, m->l3_len,
- ol_flags);
- else if (swcsm != 0)
- l4h->cksum = _ipv4_udptcp_mbuf_cksum(m, len, l3h);
-
- if ((ol_flags & PKT_TX_IP_CKSUM) == 0 && swcsm != 0)
- l3h->hdr_checksum = _ipv4x_cksum(l3h, m->l3_len);
- } else {
- struct ipv6_hdr *l3h;
- l3h = (struct ipv6_hdr *)(l2h + dst->l2_len);
- l3h->payload_len = rte_cpu_to_be_16(plen + l4);
- if ((ol_flags & PKT_TX_TCP_CKSUM) != 0)
- l4h->cksum = rte_ipv6_phdr_cksum(l3h, ol_flags);
- else if (swcsm != 0)
- l4h->cksum = _ipv6_udptcp_mbuf_cksum(m, len, l3h);
- }
-
- return 0;
-}
-
/*
* That function supposed to be used only for data packets.
* Assumes that L2/L3/L4 headers and mbuf fields already setup properly.
@@ -355,6 +258,9 @@ tx_data_pkts(struct tle_tcp_stream *s, struct rte_mbuf *const m[], uint32_t num)
i = tle_dring_mp_enqueue(&dev->tx.dr, (const void * const*)m,
num, drb, &nb);
+ if (i > 0)
+ timer_stop(s, TIMER_DACK);
+
/* free unused drbs. */
if (nb != 0)
stream_drb_free(s, drb + nbm - nb, nb);
@@ -362,6 +268,113 @@ tx_data_pkts(struct tle_tcp_stream *s, struct rte_mbuf *const m[], uint32_t num)
return i;
}
+/*
+ * case 0: pkt is not split yet, (indicate plen > sl->len)
+ * case 1: pkt is split, but left packet > sl->len
+ * case 2: pkt is split, but left packet <= sl->len
+ */
+static inline struct rte_mbuf *
+get_indirect_mbuf(struct tle_tcp_stream *s,
+ struct rte_mbuf *m, uint32_t *p_plen,
+ union seqlen *sl, uint32_t type,
+ uint32_t mss)
+{
+ uint32_t hdr_len = PKT_L234_HLEN(m), plen, left;
+ struct rte_mbuf *f, *t;
+ uint16_t i, nb_segs, adj;
+ void *hdr;
+
+ if (s->tcb.snd.nxt_pkt) {
+ f = s->tcb.snd.nxt_pkt;
+ plen = f->data_len - s->tcb.snd.nxt_offset;
+ if (f == m) /* 1st segment contains net headers */
+ plen -= hdr_len;
+ } else {
+ f = m;
+ plen = f->data_len - hdr_len;
+ }
+
+ TCP_LOG(DEBUG, "m(%p): pkt_len=%u, nb_segs=%u, sl->len = %u\n",
+ m, m->pkt_len, m->nb_segs, sl->len);
+
+ nb_segs = 1;
+ if (sl->len < plen) {
+ /* Segment split needed: sometimes, cwnd will be reset to
+ * 1 or 2 mss. In this case, we send part of this seg, and
+ * record which segment we've sent, and the offset of sent
+ * data in tcb.
+ */
+ left = plen - sl->len;
+ plen = sl->len;
+ s->tcb.snd.nxt_pkt = f;
+ } else {
+ left = 0;
+ t = f->next;
+ while (t && plen + t->data_len <= sl->len) {
+ plen += t->data_len;
+ t = t->next;
+ nb_segs++;
+ }
+ s->tcb.snd.nxt_pkt = t;
+ }
+
+ struct rte_mbuf *pkts[1 + nb_segs];
+ if (rte_pktmbuf_alloc_bulk(s->tx.dst.head_mp, pkts, 1 + nb_segs) < 0)
+ return NULL;
+
+ rte_pktmbuf_attach(pkts[1], f);
+
+ /* remove bytes in the beginning */
+ adj = s->tcb.snd.nxt_offset;
+ if (f == m)
+ adj += hdr_len;
+ if (adj)
+ rte_pktmbuf_adj(pkts[1], adj);
+
+ /* remove bytes in the end */
+ if (left > 0) {
+ rte_pktmbuf_trim(pkts[1], left);
+ s->tcb.snd.nxt_offset += plen;
+ } else
+ s->tcb.snd.nxt_offset = 0;
+
+ /* attach chaining segment if we have */
+ for (i = 1, t = f->next; i < nb_segs; ++i) {
+ rte_pktmbuf_attach(pkts[i+1], t);
+ pkts[i]->next = pkts[i+1];
+ t = t->next;
+ }
+
+ /* prepare l2/l3/l4 header */
+ hdr = rte_pktmbuf_append(pkts[0], hdr_len);
+ rte_memcpy(hdr, rte_pktmbuf_mtod(m, void *), hdr_len);
+ pkts[0]->nb_segs = nb_segs + 1;
+ pkts[0]->pkt_len = plen + hdr_len;
+ pkts[0]->ol_flags = m->ol_flags;
+ pkts[0]->tx_offload = m->tx_offload;
+ if (type == TLE_V4) {
+ struct ipv4_hdr *l3h;
+
+ l3h = rte_pktmbuf_mtod_offset(pkts[0],
+ struct ipv4_hdr *, m->l2_len);
+ l3h->total_length =
+ rte_cpu_to_be_16(plen + m->l3_len + m->l4_len);
+ } else {
+ struct ipv6_hdr *l3h;
+
+ l3h = rte_pktmbuf_mtod_offset(pkts[0],
+ struct ipv6_hdr *, m->l2_len);
+ l3h->payload_len =
+ rte_cpu_to_be_16(plen + m->l4_len);
+ }
+ if (plen <= mss)
+ pkts[0]->ol_flags &= ~PKT_TX_TCP_SEG;
+ pkts[0]->next = pkts[1];
+
+ *p_plen = plen;
+ return pkts[0];
+}
+
static inline uint32_t
tx_data_bulk(struct tle_tcp_stream *s, union seqlen *sl, struct rte_mbuf *mi[],
uint32_t num)
@@ -371,11 +384,13 @@ tx_data_bulk(struct tle_tcp_stream *s, union seqlen *sl, struct rte_mbuf *mi[],
struct rte_mbuf *mb;
struct rte_mbuf *mo[MAX_PKT_BURST + TCP_MAX_PKT_SEG];
+ /* check stream has drb to send pkts */
+ if (stream_drb_empty(s))
+ return 0;
+
mss = s->tcb.snd.mss;
type = s->s.type;
-
dev = s->tx.dst.dev;
- pid = get_ip_pid(dev, num, type, (s->flags & TLE_CTX_FLAG_ST) != 0);
k = 0;
tn = 0;
@@ -383,26 +398,64 @@ tx_data_bulk(struct tle_tcp_stream *s, union seqlen *sl, struct rte_mbuf *mi[],
for (i = 0; i != num && sl->len != 0 && fail == 0; i++) {
mb = mi[i];
- sz = RTE_MIN(sl->len, mss);
plen = PKT_L4_PLEN(mb);
/*fast path, no need to use indirect mbufs. */
- if (plen <= sz) {
-
+ if (s->tcb.snd.nxt_pkt == NULL && plen <= sl->len) {
+ pid = get_ip_pid(dev, calc_seg_cnt(plen, s->tcb.snd.mss),
+ type, (s->flags & TLE_CTX_FLAG_ST) != 0);
/* update pkt TCP header */
- tcp_update_mbuf(mb, type, &s->tcb, sl->seq, pid + i);
+ tcp_update_mbuf(mb, type, &s->tcb, sl->seq, pid);
/* keep mbuf till ACK is received. */
rte_pktmbuf_refcnt_update(mb, 1);
sl->len -= plen;
sl->seq += plen;
mo[k++] = mb;
- /* remaining snd.wnd is less them MSS, send nothing */
- } else if (sz < mss)
+ if (sl->seq <= s->tcb.snd.rcvr)
+ TCP_INC_STATS(TCP_MIB_RETRANSSEGS);
+ /* remaining snd.wnd is less than MSS, send nothing */
+ } else if (sl->len < mss) {
+ break;
+ /* some data to send already */
+ } else if (k != 0 || tn != 0) {
break;
/* packet indirection needed */
- else
- RTE_VERIFY(0);
+ } else {
+ struct rte_mbuf *out;
+
+ out = get_indirect_mbuf(s, mb, &plen, sl, type, mss);
+ if (out == NULL)
+ return 0;
+
+ pid = get_ip_pid(dev, calc_seg_cnt(plen, s->tcb.snd.mss),
+ type, (s->flags & TLE_CTX_FLAG_ST) != 0);
+ /* update pkt TCP header */
+ tcp_update_mbuf(out, type, &s->tcb, sl->seq, pid);
+
+ /* no need to bump refcnt !!! */
+
+ sl->len -= plen;
+ sl->seq += plen;
+
+			if (tx_data_pkts(s, &out, 1) == 0) {
+				/* should not happen: we have already checked
+				 * that at least one drb is available to send
+				 * this mbuf
+				 */
+ rte_pktmbuf_free(out);
+ return 0;
+ }
+
+ if (sl->seq <= s->tcb.snd.rcvr)
+ TCP_INC_STATS(TCP_MIB_RETRANSSEGS);
+
+ if (s->tcb.snd.nxt_pkt)
+ return 0;
+ else {
+ tn = 1;
+ continue;
+ }
+ }
if (k >= MAX_PKT_BURST) {
n = tx_data_pkts(s, mo, k);
@@ -466,14 +519,17 @@ tx_nxt_data(struct tle_tcp_stream *s, uint32_t tms)
tcp_txq_set_nxt_head(s, n);
} while (n == num);
- s->tcb.snd.nxt += sl.seq - (uint32_t)s->tcb.snd.nxt;
+ if (sl.seq != (uint32_t)s->tcb.snd.nxt) {
+ s->tcb.snd.nxt += sl.seq - (uint32_t)s->tcb.snd.nxt;
+ s->tcb.snd.ack = s->tcb.rcv.nxt;
+ }
return tn;
}
static inline void
free_una_data(struct tle_tcp_stream *s, uint32_t len)
{
- uint32_t i, num, plen;
+ uint32_t i, num, plen, una_data;
struct rte_mbuf **mi;
plen = 0;
@@ -487,14 +543,18 @@ free_una_data(struct tle_tcp_stream *s, uint32_t len)
/* free acked data */
for (i = 0; i != num && plen != len; i++) {
- uint32_t next_pkt_len = PKT_L4_PLEN(mi[i]);
- if (plen + next_pkt_len > len) {
- /* keep SND.UNA at the start of the packet */
- len = plen;
+ una_data = PKT_L4_PLEN(mi[i]) - s->tcb.snd.una_offset;
+
+ /* partial ack */
+ if (plen + una_data > len) {
+ s->tcb.snd.una_offset += len - plen;
+ plen = len;
break;
- } else {
- plen += next_pkt_len;
}
+
+ /* monolithic ack */
+ s->tcb.snd.una_offset = 0;
+ plen += una_data;
rte_pktmbuf_free(mi[i]);
}
@@ -503,6 +563,7 @@ free_una_data(struct tle_tcp_stream *s, uint32_t len)
} while (plen < len);
s->tcb.snd.una += len;
+ s->tcb.snd.waitlen -= len;
/*
* that could happen in case of retransmit,
@@ -519,7 +580,7 @@ calc_smss(uint16_t mss, const struct tle_dest *dst)
{
uint16_t n;
- n = dst->mtu - dst->l2_len - dst->l3_len - TCP_TX_HDR_DACK;
+ n = dst->mtu - dst->l3_len - sizeof(struct tcp_hdr);
mss = RTE_MIN(n, mss);
return mss;
}
@@ -537,71 +598,53 @@ initial_cwnd(uint32_t smss, uint32_t icw)
return RTE_MIN(10 * smss, RTE_MAX(2 * smss, icw));
}
-/*
- * queue standalone packet to he particular output device
- * It assumes that:
- * - L2/L3/L4 headers should be already set.
- * - packet fits into one segment.
- */
-static inline int
-send_pkt(struct tle_tcp_stream *s, struct tle_dev *dev, struct rte_mbuf *m)
+void
+tle_tcp_stream_kill(struct tle_stream *ts)
{
- uint32_t n, nb;
- struct tle_drb *drb;
-
- if (stream_drb_alloc(s, &drb, 1) == 0)
- return -ENOBUFS;
-
- /* enqueue pkt for TX. */
- nb = 1;
- n = tle_dring_mp_enqueue(&dev->tx.dr, (const void * const*)&m, 1,
- &drb, &nb);
-
- /* free unused drbs. */
- if (nb != 0)
- stream_drb_free(s, &drb, 1);
-
- return (n == 1) ? 0 : -ENOBUFS;
-}
+ struct tle_tcp_stream *s;
-static inline int
-send_ctrl_pkt(struct tle_tcp_stream *s, struct rte_mbuf *m, uint32_t seq,
- uint32_t flags)
-{
- const struct tle_dest *dst;
- uint32_t pid, type;
- int32_t rc;
+ s = TCP_STREAM(ts);
+ if (ts == NULL || s->s.type >= TLE_VNUM)
+ return;
- dst = &s->tx.dst;
- type = s->s.type;
- pid = get_ip_pid(dst->dev, 1, type, (s->flags & TLE_CTX_FLAG_ST) != 0);
+ if (s->tcb.state > TCP_ST_LISTEN)
+ send_rst(s, s->tcb.snd.nxt);
- rc = tcp_fill_mbuf(m, s, dst, 0, s->s.port, seq, flags, pid, 1);
- if (rc == 0)
- rc = send_pkt(s, dst->dev, m);
+ if (s->tcb.state == TCP_ST_ESTABLISHED)
+ TCP_DEC_STATS_ATOMIC(TCP_MIB_CURRESTAB);
- return rc;
+ s->tcb.state = TCP_ST_CLOSED;
+ rte_smp_wmb();
+ timer_stop(s, TIMER_RTO);
}
static inline int
-send_rst(struct tle_tcp_stream *s, uint32_t seq)
+send_ack(struct tle_tcp_stream *s, uint32_t tms, uint32_t flags)
{
struct rte_mbuf *m;
+ uint32_t seq;
int32_t rc;
m = rte_pktmbuf_alloc(s->tx.dst.head_mp);
if (m == NULL)
return -ENOMEM;
- rc = send_ctrl_pkt(s, m, seq, TCP_FLAG_RST);
- if (rc != 0)
+ seq = s->tcb.snd.nxt - ((flags & (TCP_FLAG_FIN | TCP_FLAG_SYN)) != 0);
+ s->tcb.snd.ts = tms;
+
+ rc = send_ctrl_pkt(s, m, seq, flags);
+ if (rc != 0) {
rte_pktmbuf_free(m);
+ return rc;
+ }
- return rc;
+ timer_stop(s, TIMER_DACK);
+ s->tcb.snd.ack = s->tcb.rcv.nxt;
+ return 0;
}
static inline int
-send_ack(struct tle_tcp_stream *s, uint32_t tms, uint32_t flags)
+send_keepalive(struct tle_tcp_stream *s)
{
struct rte_mbuf *m;
uint32_t seq;
@@ -611,20 +654,16 @@ send_ack(struct tle_tcp_stream *s, uint32_t tms, uint32_t flags)
if (m == NULL)
return -ENOMEM;
- seq = s->tcb.snd.nxt - ((flags & (TCP_FLAG_FIN | TCP_FLAG_SYN)) != 0);
- s->tcb.snd.ts = tms;
+ seq = s->tcb.snd.una - 1;
- rc = send_ctrl_pkt(s, m, seq, flags);
+ rc = send_ctrl_pkt(s, m, seq, TCP_FLAG_ACK);
if (rc != 0) {
rte_pktmbuf_free(m);
return rc;
}
-
- s->tcb.snd.ack = s->tcb.rcv.nxt;
return 0;
}
-
static int
sync_ack(struct tle_tcp_stream *s, const union pkt_info *pi,
const union seg_info *si, uint32_t ts, struct rte_mbuf *m)
@@ -633,19 +672,23 @@ sync_ack(struct tle_tcp_stream *s, const union pkt_info *pi,
int32_t rc;
uint32_t pid, seq, type;
struct tle_dev *dev;
- const void *da;
+ const void *sa, *da;
struct tle_dest dst;
const struct tcp_hdr *th;
- type = s->s.type;
+ type = pi->tf.type;
/* get destination information. */
- if (type == TLE_V4)
+ if (type == TLE_V4) {
da = &pi->addr4.src;
- else
+ sa = &pi->addr4.dst;
+ }
+ else {
da = &pi->addr6->src;
+ sa = &pi->addr6->dst;
+ }
- rc = stream_get_dest(&s->s, da, &dst);
+ rc = stream_get_dest(type, &s->s, sa, da, &dst);
if (rc < 0)
return rc;
@@ -654,11 +697,16 @@ sync_ack(struct tle_tcp_stream *s, const union pkt_info *pi,
get_syn_opts(&s->tcb.so, (uintptr_t)(th + 1), m->l4_len - sizeof(*th));
s->tcb.rcv.nxt = si->seq + 1;
+ s->tcb.rcv.cpy = si->seq + 1;
seq = sync_gen_seq(pi, s->tcb.rcv.nxt, ts, s->tcb.so.mss,
s->s.ctx->prm.hash_alg,
&s->s.ctx->prm.secret_key);
- s->tcb.so.ts.ecr = s->tcb.so.ts.val;
- s->tcb.so.ts.val = sync_gen_ts(ts, s->tcb.so.wscale);
+
+ if (s->tcb.so.ts.raw) {
+ s->tcb.so.ts.ecr = s->tcb.so.ts.val;
+ s->tcb.so.ts.val = sync_gen_ts(ts, s->tcb.so.wscale);
+ }
+
s->tcb.so.wscale = (s->tcb.so.wscale == TCP_WSCALE_NONE) ?
TCP_WSCALE_NONE : TCP_WSCALE_DEFAULT;
s->tcb.so.mss = calc_smss(dst.mtu, &dst);
@@ -672,11 +720,13 @@ sync_ack(struct tle_tcp_stream *s, const union pkt_info *pi,
dev = dst.dev;
pid = get_ip_pid(dev, 1, type, (s->flags & TLE_CTX_FLAG_ST) != 0);
- rc = tcp_fill_mbuf(m, s, &dst, 0, pi->port, seq,
- TCP_FLAG_SYN | TCP_FLAG_ACK, pid, 1);
+ rc = tcp_fill_mbuf(m, s, &dst, TCP_OLFLAGS_CKSUM(dst.ol_flags),
+ pi->port, seq, TCP_FLAG_SYN | TCP_FLAG_ACK, pid, 1);
if (rc == 0)
rc = send_pkt(s, dev, m);
+ TCP_INC_STATS(TCP_MIB_PASSIVEOPENS);
+
return rc;
}
@@ -800,43 +850,24 @@ restore_syn_opt(union seg_info *si, union tsopt *to,
return 0;
}
-static inline void
-stream_term(struct tle_tcp_stream *s)
-{
- struct sdr *dr;
-
- s->tcb.state = TCP_ST_CLOSED;
- rte_smp_wmb();
-
- timer_stop(s);
-
- /* close() was already invoked, schedule final cleanup */
- if ((s->tcb.uop & TCP_OP_CLOSE) != 0) {
-
- dr = CTX_TCP_SDR(s->s.ctx);
- STAILQ_INSERT_TAIL(&dr->be, &s->s, link);
-
- /* notify user that stream need to be closed */
- } else if (s->err.ev != NULL)
- tle_event_raise(s->err.ev);
- else if (s->err.cb.func != NULL)
- s->err.cb.func(s->err.cb.data, &s->s);
-}
-
static inline int
stream_fill_dest(struct tle_tcp_stream *s)
{
int32_t rc;
uint32_t type;
- const void *da;
+ const void *sa, *da;
- type = s->s.type;
- if (type == TLE_V4)
+ type = s->s.type;
+ if (type == TLE_V4) {
+ sa = &s->s.ipv4.addr.dst;
da = &s->s.ipv4.addr.src;
- else
+ }
+ else {
+ sa = &s->s.ipv6.addr.dst;
da = &s->s.ipv6.addr.src;
+ }
- rc = stream_get_dest(&s->s, da, &s->tx.dst);
+ rc = stream_get_dest(type, &s->s, sa, da, &s->tx.dst);
return (rc < 0) ? rc : 0;
}
@@ -851,19 +882,17 @@ accept_prep_stream(struct tle_tcp_stream *ps, struct stbl *st,
int32_t rc;
uint32_t rtt;
- /* some TX still pending for that stream. */
- if (TCP_STREAM_TX_PENDING(cs))
- return -EAGAIN;
-
/* setup L4 ports and L3 addresses fields. */
cs->s.port.raw = pi->port.raw;
cs->s.pmsk.raw = UINT32_MAX;
if (pi->tf.type == TLE_V4) {
+ cs->s.type = TLE_V4;
cs->s.ipv4.addr = pi->addr4;
cs->s.ipv4.mask.src = INADDR_NONE;
cs->s.ipv4.mask.dst = INADDR_NONE;
} else if (pi->tf.type == TLE_V6) {
+ cs->s.type = TLE_V6;
cs->s.ipv6.addr = *pi->addr6;
rte_memcpy(&cs->s.ipv6.mask.src, &tle_ipv6_none,
sizeof(cs->s.ipv6.mask.src));
@@ -887,7 +916,7 @@ accept_prep_stream(struct tle_tcp_stream *ps, struct stbl *st,
cs->tcb.snd.rto = TCP_RTO_DEFAULT;
/* copy streams type & flags. */
- cs->s.type = ps->s.type;
+ cs->s.type = pi->tf.type;
cs->flags = ps->flags;
/* retrive and cache destination information. */
@@ -897,16 +926,23 @@ accept_prep_stream(struct tle_tcp_stream *ps, struct stbl *st,
/* update snd.mss with SMSS value */
cs->tcb.snd.mss = calc_smss(cs->tcb.snd.mss, &cs->tx.dst);
+ if (cs->tcb.so.ts.raw != 0) {
+ cs->tcb.snd.mss -= TCP_TX_OPT_LEN_TMS;
+ }
/* setup congestion variables */
cs->tcb.snd.cwnd = initial_cwnd(cs->tcb.snd.mss, ps->tcb.snd.cwnd);
+ CWND_INFO("accept", cs->tcb.snd.cwnd);
+
cs->tcb.snd.ssthresh = cs->tcb.snd.wnd;
cs->tcb.snd.rto_tw = ps->tcb.snd.rto_tw;
+ cs->tcb.snd.rto_fw = ps->tcb.snd.rto_fw;
cs->tcb.state = TCP_ST_ESTABLISHED;
+ TCP_INC_STATS_ATOMIC(TCP_MIB_CURRESTAB);
/* add stream to the table */
- cs->ste = stbl_add_stream(st, pi, cs);
+ cs->ste = stbl_add_stream(st, &cs->s);
if (cs->ste == NULL)
return -ENOBUFS;
@@ -937,7 +973,7 @@ rx_ack_listen(struct tle_tcp_stream *s, struct stbl *st,
*csp = NULL;
- if (pi->tf.flags != TCP_FLAG_ACK || rx_check_stream(s, pi) != 0)
+ if ((pi->tf.flags & TCP_FLAG_ACK) == 0|| rx_check_stream(s, pi) != 0)
return -EINVAL;
ctx = s->s.ctx;
@@ -964,7 +1000,8 @@ rx_ack_listen(struct tle_tcp_stream *s, struct stbl *st,
/* cleanup on failure */
tcp_stream_down(cs);
- stbl_del_stream(st, cs->ste, cs, 0);
+ TCP_DEC_STATS_ATOMIC(TCP_MIB_CURRESTAB);
+ stbl_del_stream(st, cs->ste, &cs->s);
cs->ste = NULL;
}
@@ -982,6 +1019,10 @@ data_pkt_adjust(const struct tcb *tcb, struct rte_mbuf **mb, uint32_t hlen,
len = *plen;
rte_pktmbuf_adj(*mb, hlen);
+ /* header is removed, so we clear tx_offload here to make sure
+ * we can get correct payload length with PKT_L4_PLEN.
+ */
+ (*mb)->tx_offload = 0;
if (len == 0)
return -ENODATA;
/* cut off the start of the packet */
@@ -1018,7 +1059,8 @@ rx_ackdata(struct tle_tcp_stream *s, uint32_t ack)
tle_event_raise(s->tx.ev);
else if (k == 0 && s->tx.cb.func != NULL)
s->tx.cb.func(s->tx.cb.data, &s->s);
- }
+ } else
+ txs_enqueue(s->s.ctx, s);
}
return n;
@@ -1029,8 +1071,7 @@ stream_timewait(struct tle_tcp_stream *s, uint32_t rto)
{
if (rto != 0) {
s->tcb.state = TCP_ST_TIME_WAIT;
- s->tcb.snd.rto = rto;
- timer_reset(s);
+ timer_reset(s, TIMER_RTO, rto);
} else
stream_term(s);
}
@@ -1041,20 +1082,30 @@ rx_fin_state(struct tle_tcp_stream *s, struct resp_info *rsp)
uint32_t state;
int32_t ackfin;
+ s->tcb.rcv.frs.on = 2;
s->tcb.rcv.nxt += 1;
ackfin = (s->tcb.snd.una == s->tcb.snd.fss);
state = s->tcb.state;
if (state == TCP_ST_ESTABLISHED) {
+ TCP_DEC_STATS_ATOMIC(TCP_MIB_CURRESTAB);
s->tcb.state = TCP_ST_CLOSE_WAIT;
/* raise err.ev & err.cb */
- if (s->err.ev != NULL)
+ /* raise error event only when recvbuf is empty, to inform
+ * that the stream will not receive data any more.
+ */
+ if (rte_ring_count(s->rx.q) == 0 && s->err.ev != NULL)
tle_event_raise(s->err.ev);
else if (s->err.cb.func != NULL)
s->err.cb.func(s->err.cb.data, &s->s);
} else if (state == TCP_ST_FIN_WAIT_1 || state == TCP_ST_CLOSING) {
rsp->flags |= TCP_FLAG_ACK;
+
+ /* shutdown instead of close happens */
+ if (rte_ring_count(s->rx.q) == 0 && s->err.ev != NULL)
+ tle_event_raise(s->err.ev);
+
if (ackfin != 0)
stream_timewait(s, s->tcb.snd.rto_tw);
else
@@ -1089,8 +1140,10 @@ rx_fin(struct tle_tcp_stream *s, uint32_t state,
ts = rx_tms_opt(&s->tcb, mb);
ret = rx_check_seqack(&s->tcb, seq, si->ack, plen, ts);
- if (ret != 0)
+ if (ret != 0) {
+ rsp->flags |= TCP_FLAG_ACK;
return ret;
+ }
if (state < TCP_ST_ESTABLISHED)
return -EINVAL;
@@ -1108,9 +1161,10 @@ rx_fin(struct tle_tcp_stream *s, uint32_t state,
* fast-path: all data & FIN was already sent out
* and now is acknowledged.
*/
- if (s->tcb.snd.fss == s->tcb.snd.nxt &&
- si->ack == (uint32_t)s->tcb.snd.nxt) {
+ if (s->tcb.snd.fss >= s->tcb.snd.nxt &&
+ si->ack == (uint32_t)s->tcb.snd.fss) {
s->tcb.snd.una = s->tcb.snd.fss;
+ s->tcb.snd.nxt = s->tcb.snd.una;
empty_tq(s);
/* conventional ACK processiing */
} else
@@ -1148,8 +1202,25 @@ rx_rst(struct tle_tcp_stream *s, uint32_t state, uint32_t flags,
else
rc = check_seqn(&s->tcb, si->seq, 0);
- if (rc == 0)
+ if (rc == 0) {
+ /* receive rst, connection is closed abnormal
+ * and should return errno in later operations.
+ */
+ switch (state) {
+ case TCP_ST_SYN_SENT:
+ TCP_INC_STATS(TCP_MIB_ATTEMPTFAILS);
+ s->tcb.err = ECONNREFUSED;
+ break;
+ case TCP_ST_CLOSE_WAIT:
+ s->tcb.err = EPIPE;
+ break;
+ case TCP_ST_CLOSED:
+ return rc;
+ default:
+ s->tcb.err = ECONNRESET;
+ }
stream_term(s);
+ }
return rc;
}
@@ -1222,6 +1293,7 @@ rto_cwnd_update(struct tcb *tcb)
* no more than 1 full-sized segment.
*/
tcb->snd.cwnd = tcb->snd.mss;
+ CWND_INFO("update", tcb->snd.cwnd);
}
static inline void
@@ -1330,13 +1402,17 @@ rx_data_ack(struct tle_tcp_stream *s, struct dack_info *tack,
ret = rx_check_seqack(&s->tcb, si[j].seq, si[j].ack,
plen, ts);
- if (ret != 0)
- break;
-
/* account for segment received */
ack_info_update(tack, &si[j], ret != 0, plen, ts);
+ if (ret != 0)
+ break;
+
rte_pktmbuf_adj(mb[j], hlen);
+ /* header is removed, so we clear tx_offload here to make sure
+ * we can get correct payload length with PKT_L4_PLEN.
+ */
+ mb[j]->tx_offload = 0;
}
n = j - i;
@@ -1377,6 +1453,7 @@ start_fast_retransmit(struct tle_tcp_stream *s)
tcp_txq_rst_nxt_head(s);
tcb->snd.nxt = tcb->snd.una;
tcb->snd.cwnd = tcb->snd.ssthresh + 3 * tcb->snd.mss;
+ CWND_INFO("start fast retrans", tcb->snd.cwnd);
}
static inline void
@@ -1389,6 +1466,7 @@ stop_fast_retransmit(struct tle_tcp_stream *s)
n = tcb->snd.nxt - tcb->snd.una;
tcb->snd.cwnd = RTE_MIN(tcb->snd.ssthresh,
RTE_MAX(n, tcb->snd.mss) + tcb->snd.mss);
+ CWND_INFO("stop fast retrans", tcb->snd.cwnd);
tcb->snd.fastack = 0;
}
@@ -1415,8 +1493,10 @@ in_fast_retransmit(struct tle_tcp_stream *s, uint32_t ack_len, uint32_t ack_num,
* during fast recovery, also reset the
* retransmit timer.
*/
- if (tcb->snd.fastack == 1)
- timer_reset(s);
+ if (tcb->snd.fastack == 1) {
+ timer_reset(s, TIMER_RTO, s->tcb.snd.rto);
+ s->tcb.snd.nb_retx = 0;
+ }
tcb->snd.fastack += ack_num;
return 1;
@@ -1456,7 +1536,8 @@ process_ack(struct tle_tcp_stream *s, uint32_t acked,
/* remain in normal mode */
} else if (acked != 0) {
ack_cwnd_update(&s->tcb, acked, tack);
- timer_stop(s);
+ timer_stop(s, TIMER_RTO);
+ s->tcb.snd.nb_retx = 0;
}
/* fast retransmit mode */
@@ -1470,7 +1551,7 @@ process_ack(struct tle_tcp_stream *s, uint32_t acked,
} else {
/* RFC 5682 3.2.3 full ACK */
stop_fast_retransmit(s);
- timer_stop(s);
+ timer_stop(s, TIMER_RTO);
/* if we have another series of dup ACKs */
if (tack->dup3.seg != 0 &&
@@ -1501,17 +1582,22 @@ rx_ackfin(struct tle_tcp_stream *s)
uint32_t state;
s->tcb.snd.una = s->tcb.snd.fss;
+ s->tcb.snd.nxt = s->tcb.snd.una;
empty_tq(s);
state = s->tcb.state;
if (state == TCP_ST_LAST_ACK)
stream_term(s);
else if (state == TCP_ST_FIN_WAIT_1) {
- timer_stop(s);
+ timer_stop(s, TIMER_RTO);
s->tcb.state = TCP_ST_FIN_WAIT_2;
- } else if (state == TCP_ST_CLOSING) {
+		/* if the stream is closed, it should be released
+		 * before the timeout, even without a FIN from the peer
+		 */
+ if (s->tcb.uop & TCP_OP_CLOSE)
+ timer_start(s, TIMER_RTO, s->tcb.snd.rto_fw);
+ } else if (state == TCP_ST_CLOSING)
stream_timewait(s, s->tcb.snd.rto_tw);
- }
}
static inline void
@@ -1532,7 +1618,7 @@ rx_process_ack(struct tle_tcp_stream *s, uint32_t ts,
/* restart RTO timer. */
if (s->tcb.snd.nxt != s->tcb.snd.una)
- timer_start(s);
+ timer_start(s, TIMER_RTO, s->tcb.snd.rto);
/* update rto, if fresh packet is here then calculate rtt */
if (tack->ts.ecr != 0)
@@ -1554,15 +1640,9 @@ rx_synack(struct tle_tcp_stream *s, uint32_t ts, uint32_t state,
if (state != TCP_ST_SYN_SENT)
return -EINVAL;
- /*
- * RFC 793 3.9: in the SYN-SENT state
- * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send a reset
- * <SEQ=SEG.ACK><CTL=RST>
- * and discard the segment.
- * The connection remains in the same state.
- */
+ /* invalid SEG.SEQ */
if (si->ack != (uint32_t)s->tcb.snd.nxt) {
- send_rst(s, si->ack);
+ rsp->flags = TCP_FLAG_RST;
return 0;
}
@@ -1574,18 +1654,25 @@ rx_synack(struct tle_tcp_stream *s, uint32_t ts, uint32_t state,
s->tcb.snd.una = s->tcb.snd.nxt;
s->tcb.snd.mss = calc_smss(so.mss, &s->tx.dst);
+ if (s->tcb.so.ts.raw != 0) {
+ s->tcb.snd.mss -= TCP_TX_OPT_LEN_TMS;
+ }
s->tcb.snd.wnd = si->wnd << so.wscale;
s->tcb.snd.wu.wl1 = si->seq;
s->tcb.snd.wu.wl2 = si->ack;
s->tcb.snd.wscale = so.wscale;
+ s->tcb.snd.cork_ts = 0;
/* setup congestion variables */
s->tcb.snd.cwnd = initial_cwnd(s->tcb.snd.mss, s->tcb.snd.cwnd);
+ CWND_INFO("synack", s->tcb.snd.cwnd);
+
s->tcb.snd.ssthresh = s->tcb.snd.wnd;
s->tcb.rcv.ts = so.ts.val;
s->tcb.rcv.irs = si->seq;
s->tcb.rcv.nxt = si->seq + 1;
+ s->tcb.rcv.cpy = si->seq + 1;
/* if peer doesn't support WSCALE opt, recalculate RCV.WND */
s->tcb.rcv.wscale = (so.wscale == TCP_WSCALE_NONE) ?
@@ -1597,9 +1684,14 @@ rx_synack(struct tle_tcp_stream *s, uint32_t ts, uint32_t state,
rsp->flags |= TCP_FLAG_ACK;
- timer_stop(s);
+ timer_stop(s, TIMER_RTO);
+ s->tcb.snd.nb_retx = 0;
s->tcb.state = TCP_ST_ESTABLISHED;
rte_smp_wmb();
+ TCP_INC_STATS_ATOMIC(TCP_MIB_CURRESTAB);
+
+ if (s->s.option.keepalive)
+ timer_start(s, TIMER_KEEPALIVE, s->s.option.keepidle * MS_PER_S);
if (s->tx.ev != NULL)
tle_event_raise(s->tx.ev);
@@ -1689,8 +1781,8 @@ rx_stream(struct tle_tcp_stream *s, uint32_t ts,
* fast-path: all data & FIN was already sent out
* and now is acknowledged.
*/
- if (s->tcb.snd.fss == s->tcb.snd.nxt &&
- tack.ack == (uint32_t)s->tcb.snd.nxt)
+ if (s->tcb.snd.fss >= s->tcb.snd.nxt &&
+ tack.ack == (uint32_t)s->tcb.snd.fss)
rx_ackfin(s);
else
rx_process_ack(s, ts, &tack);
@@ -1702,27 +1794,44 @@ rx_stream(struct tle_tcp_stream *s, uint32_t ts,
* - received segment with INO data and no TX is scheduled
* for that stream.
*/
- if (tack.segs.badseq != 0 || tack.segs.ofo != 0 ||
- (tack.segs.data != 0 &&
- rte_atomic32_read(&s->tx.arm) == 0))
+ if (tack.segs.badseq != 0 || tack.segs.ofo != 0)
+ rsp.flags |= TCP_FLAG_ACK;
+ else if (tack.segs.data != 0 &&
+ rte_atomic32_read(&s->tx.arm) == 0 &&
+ (s->s.option.tcpquickack ||
+ s->tcb.rcv.nxt - s->tcb.snd.ack > 8 * s->tcb.so.mss)) {
rsp.flags |= TCP_FLAG_ACK;
+ if (s->s.option.tcpquickack > 0)
+ s->s.option.tcpquickack--;
+ }
+ else if (tack.segs.data && rsp.flags == 0)
+ timer_start(s, TIMER_DACK, DELAY_ACK_CHECK_INTERVAL);
rx_ofo_fin(s, &rsp);
k += num - n;
i = num;
+ if (s->s.option.keepalive) {
+ s->tcb.snd.nb_keepalive = 0;
+ timer_reset(s, TIMER_KEEPALIVE, s->s.option.keepidle * MS_PER_S);
+ }
/* unhandled state, drop all packets. */
} else
i = 0;
/* we have a response packet to send. */
- if (rsp.flags != 0) {
+ if (rsp.flags == TCP_FLAG_RST) {
+ send_rst(s, si[i].ack);
+ stream_term(s);
+ } else if (rsp.flags != 0) {
send_ack(s, ts, rsp.flags);
/* start the timer for FIN packet */
- if ((rsp.flags & TCP_FLAG_FIN) != 0)
- timer_reset(s);
+ if ((rsp.flags & TCP_FLAG_FIN) != 0) {
+ timer_reset(s, TIMER_RTO, s->tcb.snd.rto);
+ s->tcb.snd.nb_retx = 0;
+ }
}
/* unprocessed packets */
@@ -1778,7 +1887,6 @@ rx_postsyn(struct tle_dev *dev, struct stbl *st, uint32_t type, uint32_t ts,
state = s->tcb.state;
if (state == TCP_ST_LISTEN) {
-
/* one connection per flow */
cs = NULL;
ret = -EINVAL;
@@ -1835,6 +1943,74 @@ rx_postsyn(struct tle_dev *dev, struct stbl *st, uint32_t type, uint32_t ts,
return num - k;
}
+static inline void
+sync_refuse(struct tle_tcp_stream *s, struct tle_dev *dev,
+ const union pkt_info *pi, struct rte_mbuf *m)
+{
+ struct ether_hdr *eth_h;
+ struct ether_addr eth_addr;
+ struct ipv4_hdr *ip_h;
+ uint32_t ip_addr;
+ struct ipv6_hdr *ipv6_h;
+ struct in6_addr ipv6_addr;
+ struct tcp_hdr *th;
+ uint16_t port;
+
+ /* rst pkt should not contain options for syn */
+ rte_pktmbuf_trim(m, m->l4_len - sizeof(*th));
+
+ eth_h = rte_pktmbuf_mtod(m, struct ether_hdr*);
+ ether_addr_copy(&eth_h->s_addr, &eth_addr);
+ ether_addr_copy(&eth_h->d_addr, &eth_h->s_addr);
+ ether_addr_copy(&eth_addr, &eth_h->d_addr);
+
+ th = rte_pktmbuf_mtod_offset(m, struct tcp_hdr*,
+ m->l2_len + m->l3_len);
+ port = th->src_port;
+ th->src_port = th->dst_port;
+ th->dst_port = port;
+ th->tcp_flags = TCP_FLAG_RST | TCP_FLAG_ACK;
+ th->recv_ack = rte_cpu_to_be_32(rte_be_to_cpu_32(th->sent_seq) + 1);
+ th->sent_seq = 0;
+ th->data_off &= 0x0f;
+ th->data_off |= (sizeof(*th) / 4) << 4;
+ th->cksum = 0;
+
+ if (pi->tf.type == TLE_V4) {
+ ip_h = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr*,
+ m->l2_len);
+ ip_addr = ip_h->src_addr;
+ ip_h->src_addr = ip_h->dst_addr;
+ ip_h->dst_addr = ip_addr;
+ ip_h->total_length = rte_cpu_to_be_16(
+ rte_be_to_cpu_16(ip_h->total_length) -
+ (m->l4_len - sizeof(*th)));
+ ip_h->hdr_checksum = 0;
+ th->cksum = rte_ipv4_udptcp_cksum(ip_h, th);
+ ip_h->hdr_checksum = rte_ipv4_cksum(ip_h);
+ } else {
+ ipv6_h = rte_pktmbuf_mtod_offset(m, struct ipv6_hdr*,
+ m->l2_len);
+ rte_memcpy(&ipv6_addr, ipv6_h->src_addr,
+ sizeof(struct in6_addr));
+ rte_memcpy(ipv6_h->src_addr, ipv6_h->dst_addr,
+ sizeof(struct in6_addr));
+ rte_memcpy(ipv6_h->dst_addr, &ipv6_addr,
+ sizeof(struct in6_addr));
+ ipv6_h->payload_len = rte_cpu_to_be_16(
+ rte_be_to_cpu_16(ipv6_h->payload_len) -
+ (m->l4_len - sizeof(*th)));
+ th->cksum = rte_ipv6_udptcp_cksum(ipv6_h, th);
+ }
+
+ if (m->pkt_len < ETHER_MIN_LEN)
+ rte_pktmbuf_append(m, ETHER_MIN_LEN - m->pkt_len);
+
+ if (send_pkt(s, dev, m) != 0)
+ rte_pktmbuf_free(m);
+ else
+ TCP_INC_STATS(TCP_MIB_OUTRSTS);
+}
static inline uint32_t
rx_syn(struct tle_dev *dev, uint32_t type, uint32_t ts,
@@ -1846,20 +2022,35 @@ rx_syn(struct tle_dev *dev, uint32_t type, uint32_t ts,
uint32_t i, k;
int32_t ret;
- s = rx_obtain_listen_stream(dev, &pi[0], type);
+ s = rx_obtain_listen_stream(dev, &pi[0], type, 0);
if (s == NULL) {
- for (i = 0; i != num; i++) {
- rc[i] = ENOENT;
- rp[i] = mb[i];
+ /* no socket listening this syn, send rst to refuse connect */
+ s = TCP_STREAM(get_stream(dev->ctx));
+ if (s != NULL) {
+ sync_refuse(s, dev, &pi[0], mb[0]);
+ put_stream(dev->ctx, &s->s, 0);
+ i = 1;
+ } else {
+ i = 0;
}
- return 0;
+ k = 0;
+ for (; i != num; i++) {
+ rc[k] = ENOENT;
+ rp[k] = mb[i];
+ k++;
+ }
+ return num - k;
}
k = 0;
for (i = 0; i != num; i++) {
-
+ /* check if stream has space to maintain new connection */
+ if (rte_ring_free_count(s->rx.q) == 0 ||
+ (s->s.ctx->streams.nb_free == 0 &&
+ s->s.ctx->streams.nb_cur >= s->s.ctx->prm.max_streams - 1))
+ ret = -ENOSPC;
/* check that this remote is allowed to connect */
- if (rx_check_stream(s, &pi[i]) != 0)
+ else if (rx_check_stream(s, &pi[i]) != 0)
ret = -ENOENT;
else
/* syncokie: reply with <SYN,ACK> */
@@ -1882,43 +2073,34 @@ tle_tcp_rx_bulk(struct tle_dev *dev, struct rte_mbuf *pkt[],
{
struct stbl *st;
struct tle_ctx *ctx;
- uint32_t i, j, k, mt, n, t, ts;
+ uint32_t i, j, k, n, t;
+ uint64_t ts;
union pkt_info pi[num];
union seg_info si[num];
- union {
- uint8_t t[TLE_VNUM];
- uint32_t raw;
- } stu;
+
+ TCP_ADD_STATS(TCP_MIB_INSEGS, num);
ctx = dev->ctx;
ts = tcp_get_tms(ctx->cycles_ms_shift);
st = CTX_TCP_STLB(ctx);
- mt = ((ctx->prm.flags & TLE_CTX_FLAG_ST) == 0);
-
- stu.raw = 0;
/* extract packet info and check the L3/L4 csums */
for (i = 0; i != num; i++) {
get_pkt_info(pkt[i], &pi[i], &si[i]);
-
t = pi[i].tf.type;
- pi[i].csf = check_pkt_csum(pkt[i], pi[i].csf, t, IPPROTO_TCP);
- stu.t[t] = mt;
+ pi[i].csf = check_pkt_csum(pkt[i], t, IPPROTO_TCP);
}
- if (stu.t[TLE_V4] != 0)
- stbl_lock(st, TLE_V4);
- if (stu.t[TLE_V6] != 0)
- stbl_lock(st, TLE_V6);
-
k = 0;
for (i = 0; i != num; i += j) {
-
t = pi[i].tf.type;
/*basic checks for incoming packet */
- if (t >= TLE_VNUM || pi[i].csf != 0 || dev->dp[t] == NULL) {
+ if (t >= TLE_VNUM || pi[i].csf != 0) {
+ TCP_INC_STATS(TCP_MIB_INERRS);
+ if (t < TLE_VNUM)
+ TCP_INC_STATS(TCP_MIB_CSUMERRORS);
rc[k] = EINVAL;
rp[k] = pkt[i];
j = 1;
@@ -1937,11 +2119,6 @@ tle_tcp_rx_bulk(struct tle_dev *dev, struct rte_mbuf *pkt[],
}
}
- if (stu.t[TLE_V4] != 0)
- stbl_unlock(st, TLE_V4);
- if (stu.t[TLE_V6] != 0)
- stbl_unlock(st, TLE_V6);
-
return num - k;
}
@@ -1953,21 +2130,37 @@ tle_tcp_stream_accept(struct tle_stream *ts, struct tle_stream *rs[],
struct tle_tcp_stream *s;
s = TCP_STREAM(ts);
- n = _rte_ring_dequeue_burst(s->rx.q, (void **)rs, num);
- if (n == 0)
- return 0;
- /*
- * if we still have packets to read,
- * then rearm stream RX event.
- */
- if (n == num && rte_ring_count(s->rx.q) != 0) {
- if (tcp_stream_try_acquire(s) > 0 && s->rx.ev != NULL)
- tle_event_raise(s->rx.ev);
+ if (tcp_stream_try_acquire(s) > 0) {
+ if (s->tcb.state != TCP_ST_LISTEN) {
+ tcp_stream_release(s);
+ rte_errno = EINVAL;
+ return 0;
+ }
+
+ n = _rte_ring_dequeue_burst(s->rx.q, (void **)rs, num);
+ if (n == 0)
+ {
+ tcp_stream_release(s);
+ rte_errno = EAGAIN;
+ return 0;
+ }
+
+ /*
+ * if we still have packets to read,
+ * then rearm stream RX event.
+ */
+ if (n == num && rte_ring_count(s->rx.q) != 0) {
+ if (s->rx.ev != NULL)
+ tle_event_raise(s->rx.ev);
+ }
+ tcp_stream_release(s);
+ return n;
+ } else {
tcp_stream_release(s);
+ rte_errno = EINVAL;
+ return 0;
}
-
- return n;
}
uint16_t
@@ -1995,6 +2188,7 @@ tle_tcp_tx_bulk(struct tle_dev *dev, struct rte_mbuf *pkt[], uint16_t num)
stream_drb_free(s, drb + i, j - i);
}
+ TCP_ADD_STATS(TCP_MIB_OUTSEGS, n);
return n;
}
@@ -2010,73 +2204,17 @@ stream_fill_pkt_info(const struct tle_tcp_stream *s, union pkt_info *pi)
pi->tf.type = s->s.type;
}
-static int
-stream_fill_addr(struct tle_tcp_stream *s, const struct sockaddr *addr)
-{
- const struct sockaddr_in *in4;
- const struct sockaddr_in6 *in6;
- const struct tle_dev_param *prm;
- int32_t rc;
-
- rc = 0;
- s->s.pmsk.raw = UINT32_MAX;
-
- /* setup L4 src ports and src address fields. */
- if (s->s.type == TLE_V4) {
- in4 = (const struct sockaddr_in *)addr;
- if (in4->sin_addr.s_addr == INADDR_ANY || in4->sin_port == 0)
- return -EINVAL;
-
- s->s.port.src = in4->sin_port;
- s->s.ipv4.addr.src = in4->sin_addr.s_addr;
- s->s.ipv4.mask.src = INADDR_NONE;
- s->s.ipv4.mask.dst = INADDR_NONE;
-
- } else if (s->s.type == TLE_V6) {
- in6 = (const struct sockaddr_in6 *)addr;
- if (memcmp(&in6->sin6_addr, &tle_ipv6_any,
- sizeof(tle_ipv6_any)) == 0 ||
- in6->sin6_port == 0)
- return -EINVAL;
-
- s->s.port.src = in6->sin6_port;
- rte_memcpy(&s->s.ipv6.addr.src, &in6->sin6_addr,
- sizeof(s->s.ipv6.addr.src));
- rte_memcpy(&s->s.ipv6.mask.src, &tle_ipv6_none,
- sizeof(s->s.ipv6.mask.src));
- rte_memcpy(&s->s.ipv6.mask.dst, &tle_ipv6_none,
- sizeof(s->s.ipv6.mask.dst));
- }
-
- /* setup the destination device. */
- rc = stream_fill_dest(s);
- if (rc != 0)
- return rc;
-
- /* setup L4 dst address from device param */
- prm = &s->tx.dst.dev->prm;
- if (s->s.type == TLE_V4) {
- if (s->s.ipv4.addr.dst == INADDR_ANY)
- s->s.ipv4.addr.dst = prm->local_addr4.s_addr;
- } else if (memcmp(&s->s.ipv6.addr.dst, &tle_ipv6_any,
- sizeof(tle_ipv6_any)) == 0)
- memcpy(&s->s.ipv6.addr.dst, &prm->local_addr6,
- sizeof(s->s.ipv6.addr.dst));
-
- return rc;
-}
-
static inline int
-tx_syn(struct tle_tcp_stream *s, const struct sockaddr *addr)
+tx_syn(struct tle_tcp_stream *s)
{
int32_t rc;
- uint32_t tms, seq;
+ uint32_t seq;
+ uint64_t tms;
union pkt_info pi;
struct stbl *st;
struct stbl_entry *se;
- /* fill stream address */
- rc = stream_fill_addr(s, addr);
+ rc = stream_fill_dest(s);
if (rc != 0)
return rc;
@@ -2107,7 +2245,7 @@ tx_syn(struct tle_tcp_stream *s, const struct sockaddr *addr)
/* add the stream in stream table */
st = CTX_TCP_STLB(s->s.ctx);
- se = stbl_add_stream_lock(st, s);
+ se = stbl_add_stream(st, &s->s);
if (se == NULL)
return -ENOBUFS;
s->ste = se;
@@ -2115,6 +2253,7 @@ tx_syn(struct tle_tcp_stream *s, const struct sockaddr *addr)
/* put stream into the to-send queue */
txs_enqueue(s->s.ctx, s);
+ TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
return 0;
}
@@ -2147,7 +2286,7 @@ tle_tcp_stream_connect(struct tle_stream *ts, const struct sockaddr *addr)
/* fill stream, prepare and transmit syn pkt */
s->tcb.uop |= TCP_OP_CONNECT;
- rc = tx_syn(s, addr);
+ rc = tx_syn(s);
tcp_stream_release(s);
/* error happened, do a cleanup */
@@ -2160,13 +2299,29 @@ tle_tcp_stream_connect(struct tle_stream *ts, const struct sockaddr *addr)
uint16_t
tle_tcp_stream_recv(struct tle_stream *ts, struct rte_mbuf *pkt[], uint16_t num)
{
- uint32_t n;
+ uint32_t n, i;
+ uint32_t free_slots;
struct tle_tcp_stream *s;
s = TCP_STREAM(ts);
+
+ free_slots = rte_ring_free_count(s->rx.q);
+
n = _rte_ring_mcs_dequeue_burst(s->rx.q, (void **)pkt, num);
- if (n == 0)
+ if (n == 0) {
+ if (s->tcb.err != 0) {
+ rte_errno = s->tcb.err;
+ } else {
+ rte_errno = EAGAIN;
+ }
return 0;
+ }
+
+ for (i = 0; i < n; ++i)
+ s->tcb.rcv.cpy += rte_pktmbuf_pkt_len(pkt[i]);
+
+ /* update receive window with the remaining recv buffer space */
+ s->tcb.rcv.wnd = calc_rx_wnd(s, s->tcb.rcv.wscale);
/*
* if we still have packets to read,
@@ -2176,28 +2331,99 @@ tle_tcp_stream_recv(struct tle_stream *ts, struct rte_mbuf *pkt[], uint16_t num)
if (tcp_stream_try_acquire(s) > 0 && s->rx.ev != NULL)
tle_event_raise(s->rx.ev);
tcp_stream_release(s);
+ /* if we have received fin, no more data will come, raise err event. */
+ } else if (s->tcb.rcv.frs.on == 2) {
+ if (tcp_stream_try_acquire(s) > 0 && s->err.ev != NULL)
+ tle_event_raise(s->err.ev);
+ tcp_stream_release(s);
+ }
+
+ /* update recv win to the remote */
+ if (free_slots < RECV_WIN_NOTIFY_THRESH &&
+ rte_ring_free_count(s->rx.q) >= RECV_WIN_NOTIFY_THRESH) {
+ s->tcb.snd.update_rcv = true;
+ txs_enqueue(s->s.ctx, s);
}
return n;
}
+uint16_t
+tle_tcp_stream_inq(struct tle_stream *ts)
+{
+ struct tle_tcp_stream *s;
+
+ s = TCP_STREAM(ts);
+ return s->tcb.rcv.nxt - s->tcb.rcv.cpy;
+}
+
+#define DECONST(type, var) ((type)(uintptr_t)(const void *)(var))
+
+ssize_t
+tle_tcp_stream_readv(struct tle_stream *ts, const struct iovec *iov, int iovcnt)
+{
+ struct msghdr msg = {0};
+
+ msg.msg_iov = DECONST(struct iovec *, iov); /* Recover const later */
+ msg.msg_iovlen = iovcnt;
+ return tle_tcp_stream_recvmsg(ts, &msg);
+}
+
ssize_t
-tle_tcp_stream_readv(struct tle_stream *ts, const struct iovec *iov,
- int iovcnt)
+tle_tcp_stream_recvmsg(struct tle_stream *ts, struct msghdr *msg)
{
+ size_t sz;
int32_t i;
uint32_t mn, n, tn;
- size_t sz;
+ uint32_t free_slots;
struct tle_tcp_stream *s;
struct iovec iv;
struct rxq_objs mo[2];
+ struct sockaddr_in *addr;
+ struct sockaddr_in6 *addr6;
+ const struct iovec *iov = msg->msg_iov;
+ int iovcnt = msg->msg_iovlen;
s = TCP_STREAM(ts);
+ free_slots = rte_ring_free_count(s->rx.q);
+
/* get group of packets */
mn = tcp_rxq_get_objs(s, mo);
- if (mn == 0)
- return 0;
+ if (mn == 0) {
+ if (s->tcb.err != 0)
+ rte_errno = s->tcb.err;
+ else
+ rte_errno = EAGAIN;
+ return -1;
+ }
+
+ if (!ts->option.timestamp)
+ ts->timestamp = mo[0].mb[0]->timestamp;
+
+ if (msg->msg_control != NULL) {
+ if (ts->option.timestamp)
+ tle_set_timestamp(msg, mo[0].mb[0]);
+ else
+ msg->msg_controllen = 0;
+ }
+
+ if (msg->msg_name != NULL) {
+ if (s->s.type == TLE_V4) {
+ addr = (struct sockaddr_in*)msg->msg_name;
+ addr->sin_family = AF_INET;
+ addr->sin_addr.s_addr = s->s.ipv4.addr.src;
+ addr->sin_port = s->s.port.src;
+ msg->msg_namelen = sizeof(struct sockaddr_in);
+ } else {
+ addr6 = (struct sockaddr_in6*)msg->msg_name;
+ addr6->sin6_family = AF_INET6;
+ rte_memcpy(&addr6->sin6_addr, &s->s.ipv6.addr.src,
+ sizeof(struct sockaddr_in6));
+ addr6->sin6_port = s->s.port.src;
+ msg->msg_namelen = sizeof(struct sockaddr_in6);
+ }
+ }
sz = 0;
n = 0;
@@ -2229,6 +2455,8 @@ tle_tcp_stream_readv(struct tle_stream *ts, const struct iovec *iov,
}
tcp_rxq_consume(s, tn);
+ /* update receive window with the remaining recv buffer space */
+ s->tcb.rcv.wnd = calc_rx_wnd(s, s->tcb.rcv.wscale);
/*
* if we still have packets to read,
@@ -2238,6 +2466,20 @@ tle_tcp_stream_readv(struct tle_stream *ts, const struct iovec *iov,
if (tcp_stream_try_acquire(s) > 0 && s->rx.ev != NULL)
tle_event_raise(s->rx.ev);
tcp_stream_release(s);
+ /* if we have received fin, no more data will come, raise err event. */
+ } else if (s->tcb.rcv.frs.on == 2) {
+ if (tcp_stream_try_acquire(s) > 0 && s->err.ev != NULL)
+ tle_event_raise(s->err.ev);
+ tcp_stream_release(s);
+ }
+
+ s->tcb.rcv.cpy += sz;
+
+ /* update recv win to the remote */
+ if (free_slots < RECV_WIN_NOTIFY_THRESH &&
+ rte_ring_free_count(s->rx.q) >= RECV_WIN_NOTIFY_THRESH) {
+ s->tcb.snd.update_rcv = true;
+ txs_enqueue(s->s.ctx, s);
}
return sz;
@@ -2263,48 +2505,35 @@ tx_segments(struct tle_tcp_stream *s, uint64_t ol_flags,
if (i == num) {
/* queue packets for further transmission. */
rc = _rte_ring_enqueue_bulk(s->tx.q, (void **)segs, num);
- if (rc != 0)
+ if (rc != 0) {
+ rc = -EAGAIN;
free_mbufs(segs, num);
+ }
}
return rc;
}
-uint16_t
-tle_tcp_stream_send(struct tle_stream *ts, struct rte_mbuf *pkt[], uint16_t num)
+static inline uint16_t
+stream_send(struct tle_tcp_stream *s, struct rte_mbuf *pkt[],
+ uint16_t num, uint16_t mss, uint64_t ol_flags)
{
- uint32_t i, j, k, mss, n, state;
+ uint16_t i, j, k;
int32_t rc;
- uint64_t ol_flags;
- struct tle_tcp_stream *s;
+ uint32_t n, free_slots;
struct rte_mbuf *segs[TCP_MAX_PKT_SEG];
-
- s = TCP_STREAM(ts);
-
- /* mark stream as not closable. */
- if (tcp_stream_acquire(s) < 0) {
- rte_errno = EAGAIN;
- return 0;
- }
-
- state = s->tcb.state;
- if (state != TCP_ST_ESTABLISHED && state != TCP_ST_CLOSE_WAIT) {
- rte_errno = ENOTCONN;
- tcp_stream_release(s);
- return 0;
- }
-
- mss = s->tcb.snd.mss;
- ol_flags = s->tx.dst.ol_flags;
+ int32_t pkt_len;
k = 0;
rc = 0;
+ pkt_len = 0;
while (k != num) {
/* prepare and check for TX */
for (i = k; i != num; i++) {
if (pkt[i]->pkt_len > mss ||
pkt[i]->nb_segs > TCP_MAX_PKT_SEG)
break;
+ pkt_len += pkt[i]->pkt_len;
rc = tcp_fill_mbuf(pkt[i], s, &s->tx.dst, ol_flags,
s->s.port, 0, TCP_FLAG_ACK, 0, 0);
if (rc != 0)
@@ -2328,6 +2557,7 @@ tle_tcp_stream_send(struct tle_stream *ts, struct rte_mbuf *pkt[], uint16_t num)
pkt[j]->l3_len +
pkt[j]->l4_len);
pkt[j]->ol_flags &= ol_flags;
+ pkt_len -= pkt[j]->pkt_len;
}
break;
}
@@ -2339,8 +2569,10 @@ tle_tcp_stream_send(struct tle_stream *ts, struct rte_mbuf *pkt[], uint16_t num)
/* segment large packet and enqueue for sending */
} else if (i != num) {
+ free_slots = rte_ring_free_count(s->tx.q);
+ free_slots = RTE_MIN(free_slots, RTE_DIM(segs));
/* segment the packet. */
- rc = tcp_segmentation(pkt[i], segs, RTE_DIM(segs),
+ rc = tcp_segmentation(pkt[i], segs, free_slots,
&s->tx.dst, mss);
if (rc < 0) {
rte_errno = -rc;
@@ -2351,19 +2583,161 @@ tle_tcp_stream_send(struct tle_stream *ts, struct rte_mbuf *pkt[], uint16_t num)
if (rc == 0) {
/* free the large mbuf */
rte_pktmbuf_free(pkt[i]);
+ pkt_len += pkt[i]->pkt_len;
/* set the mbuf as consumed */
k++;
- } else
+ } else {
/* no space left in tx queue */
+ RTE_VERIFY(0);
break;
+ }
}
}
+ s->tcb.snd.waitlen += pkt_len;
+ return k;
+}
+
+static inline uint16_t
+stream_send_tso(struct tle_tcp_stream *s, struct rte_mbuf *pkt[],
+ uint16_t num, uint16_t mss, uint64_t ol_flags)
+{
+ uint16_t i, k, nb_segs;
+ int32_t rc, pkt_len;
+ uint64_t ol_flags1;
+ struct rte_mbuf *pre_tail;
+
+ k = 0;
+ rc = 0;
+ while (k != num) {
+ /* Make sure there is at least one slot available */
+ if (rte_ring_free_count(s->tx.q) == 0)
+ break;
+
+ /* prepare and check for TX */
+ nb_segs = 0;
+ pkt_len = 0;
+ pre_tail = NULL;
+ for (i = k; i != num; i++) {
+ if (pkt[i]->nb_segs != 1)
+ rte_panic("chained mbuf: %p\n", pkt[i]);
+ /* We shall consider cwnd and snd wnd when limit len */
+ if (nb_segs + pkt[i]->nb_segs <= TCP_MAX_PKT_SEG &&
+ pkt_len + pkt[i]->pkt_len <= 65535 - RESERVE_HEADER_LEN) {
+ nb_segs += pkt[i]->nb_segs;
+ pkt_len += pkt[i]->pkt_len;
+ if (pre_tail)
+ pre_tail->next = pkt[i];
+ pre_tail = rte_pktmbuf_lastseg(pkt[i]);
+ } else {
+ /* enqueue this one now */
+ break;
+ }
+ }
+
+ if (unlikely(i == k)) {
+ /* pkt[k] is a too big packet, now we fall back to
+ * non-tso send; we can optimize it later by
+ * splitting the mbuf.
+ */
+ if (stream_send(s, &pkt[k], 1, mss, ol_flags) == 1) {
+ k++;
+ continue;
+ } else
+ break;
+ }
+
+ pkt[k]->nb_segs = nb_segs;
+ pkt[k]->pkt_len = pkt_len;
+
+ ol_flags1 = ol_flags;
+ if (pkt_len > mss)
+ ol_flags1 |= PKT_TX_TCP_SEG;
+
+ rc = tcp_fill_mbuf(pkt[k], s, &s->tx.dst, ol_flags1,
+ s->s.port, 0, TCP_FLAG_ACK, 0, 0);
+ if (rc != 0) /* hard to recover */
+ rte_panic("failed to fill mbuf: %p\n", pkt[k]);
+
+ /* correct mss */
+ pkt[k]->tso_segsz = mss;
+
+ s->tcb.snd.waitlen += pkt_len;
+ /* We already make sure there is at least one slot */
+ if (_rte_ring_enqueue_burst(s->tx.q, (void **)pkt + k, 1) < 1)
+ RTE_VERIFY(0);
+
+ k = i;
+ }
+
+ return k;
+}
+
+uint16_t
+tle_tcp_stream_send(struct tle_stream *ts, struct rte_mbuf *pkt[], uint16_t num)
+{
+ uint16_t k, mss, state;
+ uint64_t ol_flags;
+ struct tle_tcp_stream *s;
+
+ s = TCP_STREAM(ts);
+
+ if (s->tcb.err != 0) {
+ rte_errno = s->tcb.err;
+ return 0;
+ }
+
+ /* mark stream as not closable. */
+ if (tcp_stream_acquire(s) < 0) {
+ rte_errno = EAGAIN;
+ return 0;
+ }
+
+ state = s->tcb.state;
+ switch (state) {
+ case TCP_ST_ESTABLISHED:
+ case TCP_ST_CLOSE_WAIT:
+ break;
+ case TCP_ST_FIN_WAIT_1:
+ case TCP_ST_FIN_WAIT_2:
+ case TCP_ST_CLOSING:
+ case TCP_ST_LAST_ACK:
+ rte_errno = EPIPE;
+ tcp_stream_release(s);
+ return 0;
+ default:
+ rte_errno = ENOTCONN;
+ tcp_stream_release(s);
+ return 0;
+ }
+
+ mss = s->tcb.snd.mss;
+
+ ol_flags = s->tx.dst.ol_flags;
+
+ /* Some reference number on the case:
+ * "<netperf with uss> - tap - <kernel stack> - <netserver>"
+ * ~2Gbps with tso disabled;
+ * ~16Gbps with tso enabled.
+ */
+ if (rte_ring_free_count(s->tx.q) == 0) {
+ /* Block send may try without waiting for tx event (raised by acked
+ * data), so here we will still put this stream for further process
+ */
+ txs_enqueue(s->s.ctx, s);
+ rte_errno = EAGAIN;
+ k = 0;
+ } else if (s->tx.dst.dev->prm.tx_offload & DEV_TX_OFFLOAD_TCP_TSO)
+ k = stream_send_tso(s, pkt, num, mss, ol_flags);
+ else
+ k = stream_send(s, pkt, num, mss, ol_flags);
+
/* notify BE about more data to send */
if (k != 0)
txs_enqueue(s->s.ctx, s);
+
/* if possible, re-arm stream write event. */
- if (rte_ring_free_count(s->tx.q) != 0 && s->tx.ev != NULL)
+ if (rte_ring_free_count(s->tx.q) && s->tx.ev != NULL && k == num)
tle_event_raise(s->tx.ev);
tcp_stream_release(s);
@@ -2382,9 +2756,15 @@ tle_tcp_stream_writev(struct tle_stream *ts, struct rte_mempool *mp,
struct tle_tcp_stream *s;
struct iovec iv;
struct rte_mbuf *mb[2 * MAX_PKT_BURST];
+ uint16_t mss;
s = TCP_STREAM(ts);
+ if (s->tcb.err != 0) {
+ rte_errno = s->tcb.err;
+ return -1;
+ }
+
/* mark stream as not closable. */
if (tcp_stream_acquire(s) < 0) {
rte_errno = EAGAIN;
@@ -2392,7 +2772,18 @@ tle_tcp_stream_writev(struct tle_stream *ts, struct rte_mempool *mp,
}
state = s->tcb.state;
- if (state != TCP_ST_ESTABLISHED && state != TCP_ST_CLOSE_WAIT) {
+ switch (state) {
+ case TCP_ST_ESTABLISHED:
+ case TCP_ST_CLOSE_WAIT:
+ break;
+ case TCP_ST_FIN_WAIT_1:
+ case TCP_ST_FIN_WAIT_2:
+ case TCP_ST_CLOSING:
+ case TCP_ST_LAST_ACK:
+ rte_errno = EPIPE;
+ tcp_stream_release(s);
+ return -1;
+ default:
rte_errno = ENOTCONN;
tcp_stream_release(s);
return -1;
@@ -2403,11 +2794,24 @@ tle_tcp_stream_writev(struct tle_stream *ts, struct rte_mempool *mp,
for (i = 0; i != iovcnt; i++)
tsz += iov[i].iov_len;
+ if (tsz == 0) {
+ tcp_stream_release(s);
+ return 0;
+ }
+
slen = rte_pktmbuf_data_room_size(mp);
- slen = RTE_MIN(slen, s->tcb.snd.mss);
+ mss = s->tcb.snd.mss;
+
+ slen = RTE_MIN(slen, mss);
num = (tsz + slen - 1) / slen;
n = rte_ring_free_count(s->tx.q);
+
+ if (n == 0) {
+ tcp_stream_release(s);
+ return 0;
+ }
+
num = RTE_MIN(num, n);
n = RTE_MIN(num, RTE_DIM(mb));
@@ -2451,7 +2855,6 @@ tle_tcp_stream_writev(struct tle_stream *ts, struct rte_mempool *mp,
k = 0;
if (k != j) {
-
/* free pkts that were not enqueued */
free_mbufs(mb + k, j - k);
@@ -2466,14 +2869,16 @@ tle_tcp_stream_writev(struct tle_stream *ts, struct rte_mempool *mp,
}
}
- if (k != 0) {
-
+ if (k != 0) {
/* notify BE about more data to send */
txs_enqueue(s->s.ctx, s);
/* if possible, re-arm stream write event. */
if (rte_ring_free_count(s->tx.q) != 0 && s->tx.ev != NULL)
tle_event_raise(s->tx.ev);
+ } else {
+ rte_errno = EAGAIN;
+ sz = -1;
}
tcp_stream_release(s);
@@ -2485,7 +2890,7 @@ static inline void
tx_data_fin(struct tle_tcp_stream *s, uint32_t tms, uint32_t state)
{
/* try to send some data */
- tx_nxt_data(s, tms);
+ uint32_t tn = tx_nxt_data(s, tms);
/* we also have to send a FIN */
if (state != TCP_ST_ESTABLISHED &&
@@ -2495,6 +2900,13 @@ tx_data_fin(struct tle_tcp_stream *s, uint32_t tms, uint32_t state)
s->tcb.snd.fss = ++s->tcb.snd.nxt;
send_ack(s, tms, TCP_FLAG_FIN | TCP_FLAG_ACK);
}
+
+ if (s->tcb.snd.update_rcv) {
+ if (tn == 0)
+ send_ack(s, tms, TCP_FLAG_ACK); /* update recv window */
+
+ s->tcb.snd.update_rcv = false;
+ }
}
static inline void
@@ -2507,7 +2919,7 @@ tx_stream(struct tle_tcp_stream *s, uint32_t tms)
if (state == TCP_ST_SYN_SENT) {
/* send the SYN, start the rto timer */
send_ack(s, tms, TCP_FLAG_SYN);
- timer_start(s);
+ timer_start(s, TIMER_RTO, s->tcb.snd.rto);
} else if (state >= TCP_ST_ESTABLISHED && state <= TCP_ST_LAST_ACK) {
@@ -2515,7 +2927,7 @@ tx_stream(struct tle_tcp_stream *s, uint32_t tms)
/* start RTO timer. */
if (s->tcb.snd.nxt != s->tcb.snd.una)
- timer_start(s);
+ timer_start(s, TIMER_RTO, s->tcb.snd.rto);
}
}
@@ -2544,7 +2956,6 @@ rto_stream(struct tle_tcp_stream *s, uint32_t tms)
if (s->tcb.snd.nb_retx < s->tcb.snd.nb_retm) {
if (state >= TCP_ST_ESTABLISHED && state <= TCP_ST_LAST_ACK) {
-
/* update SND.CWD and SND.SSTHRESH */
rto_cwnd_update(&s->tcb);
@@ -2570,50 +2981,131 @@ rto_stream(struct tle_tcp_stream *s, uint32_t tms)
* than one SYN or SYN/ACK retransmissions or true loss
* detection has been made.
*/
- if (s->tcb.snd.nb_retx != 0)
+ if (s->tcb.snd.nb_retx != 0) {
s->tcb.snd.cwnd = s->tcb.snd.mss;
+ CWND_INFO("synsent", s->tcb.snd.cwnd);
+ }
send_ack(s, tms, TCP_FLAG_SYN);
-
- } else if (state == TCP_ST_TIME_WAIT) {
- stream_term(s);
+ TCP_INC_STATS(TCP_MIB_RETRANSSEGS);
}
/* RFC6298:5.5 back off the timer */
s->tcb.snd.rto = rto_roundup(2 * s->tcb.snd.rto);
s->tcb.snd.nb_retx++;
- timer_restart(s);
+ timer_restart(s, TIMER_RTO, s->tcb.snd.rto);
} else {
- send_rst(s, s->tcb.snd.nxt);
+ if (state == TCP_ST_SYN_SENT) {
+ if (stream_fill_dest(s) != 0 ||
+ is_broadcast_ether_addr((struct ether_addr *)s->tx.dst.hdr))
+ s->tcb.err = EHOSTUNREACH;
+ else
+ /* TODO: do we send rst on this */
+ s->tcb.err = ENOTCONN;
+ } else
+ send_rst(s, s->tcb.snd.una);
stream_term(s);
}
}
+static inline void
+set_keepalive_timer(struct tle_tcp_stream *s)
+{
+ if (s->s.option.keepalive) {
+ if (s->tcb.state == TCP_ST_ESTABLISHED) {
+ if (s->tcb.snd.nb_keepalive == 0)
+ timer_reset(s, TIMER_KEEPALIVE,
+ s->s.option.keepidle * MS_PER_S);
+ else
+ timer_reset(s, TIMER_KEEPALIVE,
+ s->s.option.keepintvl * MS_PER_S);
+ }
+ } else {
+ timer_stop(s, TIMER_KEEPALIVE);
+ s->tcb.snd.nb_keepalive = 0;
+ }
+}
+
int
tle_tcp_process(struct tle_ctx *ctx, uint32_t num)
{
- uint32_t i, k, tms;
+ uint8_t type;
+ uint32_t i, k;
+ uint64_t tms;
struct sdr *dr;
struct tle_timer_wheel *tw;
struct tle_stream *p;
struct tle_tcp_stream *s, *rs[num];
- /* process streams with RTO exipred */
+ tms = tcp_get_tms(ctx->cycles_ms_shift);
+ /* process streams with RTO expired */
tw = CTX_TCP_TMWHL(ctx);
- tms = tcp_get_tms(ctx->cycles_ms_shift);
tle_timer_expire(tw, tms);
k = tle_timer_get_expired_bulk(tw, (void **)rs, RTE_DIM(rs));
for (i = 0; i != k; i++) {
-
- s = rs[i];
- s->timer.handle = NULL;
- if (tcp_stream_try_acquire(s) > 0)
- rto_stream(s, tms);
- tcp_stream_release(s);
+ s = timer_stream(rs[i]);
+ type = timer_type(rs[i]);
+ s->timer.handle[type] = NULL;
+
+ switch (type) {
+ case TIMER_RTO:
+ /* FE cannot change stream into below states,
+ * that's why we don't put it into lock
+ */
+ if (s->tcb.state == TCP_ST_TIME_WAIT ||
+ s->tcb.state == TCP_ST_FIN_WAIT_2) {
+ tcp_stream_down(s);
+ stream_term(s);
+ tcp_stream_up(s);
+ } else if (tcp_stream_acquire(s) > 0) {
+ /*
+ * stream may be closed in frontend concurrently.
+ * if stream has already been closed, it need not
+ * to retransmit anymore.
+ */
+ if (s->tcb.state != TCP_ST_CLOSED)
+ rto_stream(s, tms);
+ tcp_stream_release(s);
+ }
+ /* Fail to acquire lock? FE is shutdown or close this
+ * stream, either FIN or RST needs to be sent, which
+ * means it's in tsq, will be processed later.
+ */
+ break;
+ case TIMER_DACK:
+ if (rte_atomic32_read(&s->tx.arm) == 0 &&
+ s->tcb.rcv.nxt != s->tcb.snd.ack &&
+ tcp_stream_acquire(s) > 0) {
+ s->s.option.tcpquickack = 8;
+ send_ack(s, tms, TCP_FLAG_ACK);
+ tcp_stream_release(s);
+ }
+ break;
+ case TIMER_KEEPALIVE:
+ if (s->tcb.snd.nb_keepalive < s->s.option.keepcnt) {
+ if (tcp_stream_try_acquire(s) > 0 &&
+ s->tcb.state == TCP_ST_ESTABLISHED) {
+ send_keepalive(s);
+ s->tcb.snd.nb_keepalive++;
+ timer_start(s, TIMER_KEEPALIVE,
+ s->s.option.keepintvl * MS_PER_S);
+ }
+ tcp_stream_release(s);
+ } else {
+ tcp_stream_down(s);
+ send_rst(s, s->tcb.snd.nxt);
+ s->tcb.err = ETIMEDOUT;
+ stream_term(s);
+ tcp_stream_up(s);
+ }
+ break;
+ default:
+ rte_panic("Invalid timer type: %d\n", type);
+ }
}
/* process streams from to-send queue */
@@ -2621,20 +3113,63 @@ tle_tcp_process(struct tle_ctx *ctx, uint32_t num)
k = txs_dequeue_bulk(ctx, rs, RTE_DIM(rs));
for (i = 0; i != k; i++) {
-
s = rs[i];
- rte_atomic32_set(&s->tx.arm, 0);
- if (tcp_stream_try_acquire(s) > 0)
+ if (s->tcb.uop & TCP_OP_RESET) {
+ /* already put into death row in close() */
+ send_rst(s, s->tcb.snd.nxt);
+ continue;
+ }
+
+ if (tcp_stream_acquire(s) > 0) {
+ if (s->tcb.uop & TCP_OP_KEEPALIVE) {
+ s->tcb.uop &= ~TCP_OP_KEEPALIVE;
+ set_keepalive_timer(s);
+ }
+
+ if (s->tcb.state == TCP_ST_FIN_WAIT_2 &&
+ s->tcb.uop & TCP_OP_CLOSE) {
+ /* This could happen after:
+ * 1) shutdown;
+ * 2) FIN sent;
+ * 3) ack received;
+ * 4) close;
+ */
+ timer_start(s, TIMER_RTO, s->tcb.snd.rto_fw);
+ tcp_stream_release(s);
+ continue;
+ }
+
+ if (s->tcb.state == TCP_ST_ESTABLISHED &&
+ s->s.option.tcpcork) {
+ if (s->tcb.snd.cork_ts == 0)
+ s->tcb.snd.cork_ts = (uint32_t)tms;
+
+ if (s->tcb.snd.waitlen < s->tcb.snd.mss &&
+ (uint32_t)tms - s->tcb.snd.cork_ts < 200) {
+ txs_enqueue(s->s.ctx, s);
+ tcp_stream_release(s);
+ continue;
+ }
+
+ s->tcb.snd.cork_ts = 0;
+ }
+
tx_stream(s, tms);
- else
+ tcp_stream_release(s);
+ continue;
+ }
+
+ if (s->tcb.state != TCP_ST_CLOSED)
txs_enqueue(s->s.ctx, s);
- tcp_stream_release(s);
+
+ /* TCP_ST_CLOSED? See close with TCP_ST_CLOSED state */
}
/* collect streams to close from the death row */
dr = CTX_TCP_SDR(ctx);
+ rte_spinlock_lock(&dr->lock);
for (k = 0, p = STAILQ_FIRST(&dr->be);
k != num && p != NULL;
k++, p = STAILQ_NEXT(p, link))
@@ -2645,9 +3180,21 @@ tle_tcp_process(struct tle_ctx *ctx, uint32_t num)
else
STAILQ_FIRST(&dr->be) = p;
+ /* if stream still in tsq, wait one more round */
+ for (i = 0; i != k; i++) {
+ if (rte_atomic32_read(&rs[i]->tx.arm) > 0) {
+ STAILQ_INSERT_TAIL(&dr->be, &rs[i]->s, link);
+ rs[i] = NULL;
+ }
+ }
+
+ rte_spinlock_unlock(&dr->lock);
+
/* cleanup closed streams */
for (i = 0; i != k; i++) {
s = rs[i];
+ if (s == NULL)
+ continue;
tcp_stream_down(s);
tcp_stream_reset(ctx, s);
}
diff --git a/lib/libtle_l4p/tcp_rxtx.h b/lib/libtle_l4p/tcp_rxtx.h
new file mode 100644
index 0000000..e7f8e3e
--- /dev/null
+++ b/lib/libtle_l4p/tcp_rxtx.h
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2016-2017 Intel Corporation.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _TCP_RXTX_H_
+#define _TCP_RXTX_H_
+
+#include "tcp_stream.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static inline uint32_t
+calc_seg_cnt(uint32_t plen, uint32_t mss)
+{
+ if (plen > mss)
+ return (plen + mss - 1) / mss;
+ else
+ return 1;
+}
+
+static inline uint32_t
+get_ip_pid(struct tle_dev *dev, uint32_t num, uint32_t type, uint32_t st)
+{
+ uint32_t pid;
+ rte_atomic32_t *pa;
+
+ pa = &dev->tx.packet_id[type];
+
+ if (st == 0) {
+ pid = rte_atomic32_add_return(pa, num);
+ return pid - num;
+ } else {
+ pid = rte_atomic32_read(pa);
+ rte_atomic32_set(pa, pid + num);
+ return pid;
+ }
+}
+
+static inline void
+fill_tcph(struct tcp_hdr *l4h, const struct tcb *tcb, union l4_ports port,
+ uint32_t seq, uint8_t hlen, uint8_t flags)
+{
+ uint16_t wnd;
+
+ l4h->src_port = port.dst;
+ l4h->dst_port = port.src;
+
+ wnd = (flags & TCP_FLAG_SYN) ?
+ RTE_MIN(tcb->rcv.wnd, (uint32_t)UINT16_MAX) :
+ tcb->rcv.wnd >> tcb->rcv.wscale;
+
+ /* ??? use sse shuffle to hton all remaining 16 bytes at once. ??? */
+ l4h->sent_seq = rte_cpu_to_be_32(seq);
+ l4h->recv_ack = rte_cpu_to_be_32(tcb->rcv.nxt);
+ l4h->data_off = hlen / TCP_DATA_ALIGN << TCP_DATA_OFFSET;
+ l4h->tcp_flags = flags;
+ l4h->rx_win = rte_cpu_to_be_16(wnd);
+ l4h->cksum = 0;
+ l4h->tcp_urp = 0;
+
+ if (flags & TCP_FLAG_SYN)
+ fill_syn_opts(l4h + 1, &tcb->so);
+ else if ((flags & TCP_FLAG_RST) == 0 && tcb->so.ts.raw != 0)
+ fill_tms_opts(l4h + 1, tcb->snd.ts, tcb->rcv.ts);
+}
+
+static inline int
+tcp_fill_mbuf(struct rte_mbuf *m, const struct tle_tcp_stream *s,
+ const struct tle_dest *dst, uint64_t ol_flags,
+ union l4_ports port, uint32_t seq, uint32_t flags,
+ uint32_t pid, uint32_t swcsm)
+{
+ uint32_t l4, len, plen;
+ struct tcp_hdr *l4h;
+ char *l2h, *l3;
+
+ len = dst->l2_len + dst->l3_len;
+ plen = m->pkt_len;
+
+ if (flags & TCP_FLAG_SYN) {
+ /* basic length */
+ l4 = sizeof(*l4h) + TCP_OPT_LEN_MSS;
+
+ /* add wscale space and nop */
+ if (s->tcb.so.wscale) {
+ l4 += TCP_OPT_LEN_WSC + TCP_OPT_LEN_NOP;
+ }
+
+ /* add timestamp space and nop */
+ if (s->tcb.so.ts.raw) {
+ l4 += TCP_TX_OPT_LEN_TMS;
+ }
+ } else if ((flags & TCP_FLAG_RST) == 0 && s->tcb.rcv.ts != 0) {
+ l4 = sizeof(*l4h) + TCP_TX_OPT_LEN_TMS;
+ } else {
+ l4 = sizeof(*l4h);
+ }
+
+ /* adjust mbuf to put L2/L3/L4 headers into it. */
+ l2h = rte_pktmbuf_prepend(m, len + l4);
+ if (l2h == NULL)
+ return -EINVAL;
+
+ /* copy L2/L3 header */
+ rte_memcpy(l2h, dst->hdr, len);
+
+ /* setup TCP header & options */
+ l4h = (struct tcp_hdr *)(l2h + len);
+ fill_tcph(l4h, &s->tcb, port, seq, l4, flags);
+
+ /* setup mbuf TX offload related fields. */
+ m->tx_offload = _mbuf_tx_offload(dst->l2_len, dst->l3_len, l4, 0, 0, 0);
+ m->ol_flags |= ol_flags;
+
+ /* update proto specific fields. */
+
+ l3 = l2h + dst->l2_len;
+ if (((struct ipv4_hdr*)l3)->version_ihl>>4 == 4) {
+ struct ipv4_hdr *l3h;
+ l3h = (struct ipv4_hdr *)l3;
+ l3h->packet_id = rte_cpu_to_be_16(pid);
+ l3h->total_length = rte_cpu_to_be_16(plen + dst->l3_len + l4);
+
+ if ((ol_flags & PKT_TX_TCP_CKSUM) != 0)
+ l4h->cksum = _ipv4x_phdr_cksum(l3h, m->l3_len,
+ ol_flags);
+ else if (swcsm != 0)
+ l4h->cksum = _ipv4_udptcp_mbuf_cksum(m, len, l3h);
+
+ if ((ol_flags & PKT_TX_IP_CKSUM) == 0 && swcsm != 0)
+ l3h->hdr_checksum = _ipv4x_cksum(l3h, m->l3_len);
+ } else {
+ struct ipv6_hdr *l3h;
+ l3h = (struct ipv6_hdr *)l3;
+ l3h->payload_len = rte_cpu_to_be_16(plen + l4);
+ if ((ol_flags & PKT_TX_TCP_CKSUM) != 0)
+ l4h->cksum = rte_ipv6_phdr_cksum(l3h, ol_flags);
+ else if (swcsm != 0)
+ l4h->cksum = _ipv6_udptcp_mbuf_cksum(m, len, l3h);
+ }
+
+ return 0;
+}
+
+static inline int
+stream_drb_empty(struct tle_tcp_stream *s)
+{
+ return rte_ring_empty(s->tx.drb.r);
+}
+
+static inline void
+stream_drb_free(struct tle_tcp_stream *s, struct tle_drb *drbs[],
+ uint32_t nb_drb)
+{
+ _rte_ring_enqueue_burst(s->tx.drb.r, (void **)drbs, nb_drb);
+}
+
+static inline uint32_t
+stream_drb_alloc(struct tle_tcp_stream *s, struct tle_drb *drbs[],
+ uint32_t nb_drb)
+{
+ return _rte_ring_dequeue_burst(s->tx.drb.r, (void **)drbs, nb_drb);
+}
+
+/*
+ * queue standalone packet to he particular output device
+ * It assumes that:
+ * - L2/L3/L4 headers should be already set.
+ * - packet fits into one segment.
+ */
+static inline int
+send_pkt(struct tle_tcp_stream *s, struct tle_dev *dev, struct rte_mbuf *m)
+{
+ uint32_t n, nb;
+ struct tle_drb *drb;
+
+ if (stream_drb_alloc(s, &drb, 1) == 0)
+ return -ENOBUFS;
+
+ /* enqueue pkt for TX. */
+ nb = 1;
+ n = tle_dring_mp_enqueue(&dev->tx.dr, (const void * const*)&m, 1,
+ &drb, &nb);
+
+ /* free unused drbs. */
+ if (nb != 0)
+ stream_drb_free(s, &drb, 1);
+
+ return (n == 1) ? 0 : -ENOBUFS;
+}
+
+#define TCP_OLFLAGS_CKSUM(flags) (flags & (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM))
+
+static inline int
+send_ctrl_pkt(struct tle_tcp_stream *s, struct rte_mbuf *m, uint32_t seq,
+ uint32_t flags)
+{
+ const struct tle_dest *dst;
+ uint32_t pid, type;
+ int32_t rc;
+
+ dst = &s->tx.dst;
+ type = s->s.type;
+ pid = get_ip_pid(dst->dev, 1, type, (s->flags & TLE_CTX_FLAG_ST) != 0);
+
+ rc = tcp_fill_mbuf(m, s, dst, TCP_OLFLAGS_CKSUM(dst->ol_flags),
+ s->s.port, seq, flags, pid, 1);
+ if (rc == 0)
+ rc = send_pkt(s, dst->dev, m);
+
+ return rc;
+}
+
+static inline int
+send_rst(struct tle_tcp_stream *s, uint32_t seq)
+{
+ struct rte_mbuf *m;
+ int32_t rc;
+
+ m = rte_pktmbuf_alloc(s->tx.dst.head_mp);
+ if (m == NULL)
+ return -ENOMEM;
+
+ rc = send_ctrl_pkt(s, m, seq, TCP_FLAG_RST | TCP_FLAG_ACK);
+ if (rc != 0)
+ rte_pktmbuf_free(m);
+ else
+ TCP_INC_STATS(TCP_MIB_OUTRSTS);
+
+ return rc;
+}
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _TCP_RXTX_H_ */
diff --git a/lib/libtle_l4p/tcp_stream.c b/lib/libtle_l4p/tcp_stream.c
index 676521b..4a65053 100644
--- a/lib/libtle_l4p/tcp_stream.c
+++ b/lib/libtle_l4p/tcp_stream.c
@@ -20,6 +20,8 @@
#include <rte_ip.h>
#include <rte_tcp.h>
+#include <netinet/tcp.h>
+
#include "tcp_stream.h"
#include "tcp_timer.h"
#include "stream_table.h"
@@ -27,6 +29,7 @@
#include "tcp_ctl.h"
#include "tcp_ofo.h"
#include "tcp_txq.h"
+#include "tcp_rxtx.h"
static void
unuse_stream(struct tle_tcp_stream *s)
@@ -38,25 +41,27 @@ unuse_stream(struct tle_tcp_stream *s)
static void
fini_stream(struct tle_tcp_stream *s)
{
- if (s != NULL) {
- rte_free(s->rx.q);
- tcp_ofo_free(s->rx.ofo);
- rte_free(s->tx.q);
- rte_free(s->tx.drb.r);
- }
+ rte_free(s);
}
static void
tcp_fini_streams(struct tle_ctx *ctx)
{
- uint32_t i;
struct tcp_streams *ts;
+ struct tle_stream *s;
ts = CTX_TCP_STREAMS(ctx);
if (ts != NULL) {
stbl_fini(&ts->st);
- for (i = 0; i != ctx->prm.max_streams; i++)
- fini_stream(&ts->s[i]);
+
+ /* TODO: free those in use? maybe not necessary, as we assume
+ * all streams have been closed and are free.
+ */
+ while (ctx->streams.nb_free--) {
+ s = STAILQ_FIRST(&ctx->streams.free);
+ STAILQ_FIRST(&ctx->streams.free) = STAILQ_NEXT(s, link);
+ fini_stream(TCP_STREAM(s));
+ }
/* free the timer wheel */
tle_timer_free(ts->tmr);
@@ -94,61 +99,100 @@ alloc_ring(uint32_t n, uint32_t flags, int32_t socket)
return r;
}
+/* stream memory layout:
+ * [tle_tcp_stream] [rx.q] [rx.ofo] [tx.q] [tx.drb.r]
+ */
static int
-init_stream(struct tle_ctx *ctx, struct tle_tcp_stream *s)
+add_stream(struct tle_ctx *ctx)
{
- size_t bsz, rsz, sz;
- uint32_t f, i, k, n, nb;
+ size_t sz_s, sz_rxq, sz_ofo, sz_txq, sz_drb_r, sz;
+ /* for rx.q */
+ uint32_t n_rxq;
+ /* for rx.ofo */
+ struct ofo *ofo;
+ struct rte_mbuf **obj;
+ uint32_t ndb, nobj;
+ size_t dsz, osz;
+ /* for tx.q */
+ uint32_t n_txq;
+ /* for tx.drb.r */
+ size_t bsz, rsz;
struct tle_drb *drb;
- char name[RTE_RING_NAMESIZE];
-
- f = ((ctx->prm.flags & TLE_CTX_FLAG_ST) == 0) ? 0 :
- (RING_F_SP_ENQ | RING_F_SC_DEQ);
-
- /* init RX part. */
-
- n = RTE_MAX(ctx->prm.max_stream_rbufs, 1U);
- s->rx.q = alloc_ring(n, f | RING_F_SP_ENQ, ctx->prm.socket_id);
- if (s->rx.q == NULL)
- return -ENOMEM;
-
- s->rx.ofo = tcp_ofo_alloc(n, ctx->prm.socket_id);
- if (s->rx.ofo == NULL)
- return -ENOMEM;
-
- /* init TX part. */
+ uint32_t k, nb, n_drb;
- n = RTE_MAX(ctx->prm.max_stream_sbufs, 1U);
- s->tx.q = alloc_ring(n, f | RING_F_SC_DEQ, ctx->prm.socket_id);
- if (s->tx.q == NULL)
- return -ENOMEM;
+ uint32_t f, i;
+ char name[RTE_RING_NAMESIZE];
+ struct tle_tcp_stream *s;
+ // stream
+ sz_s = RTE_ALIGN_CEIL(sizeof(*s), RTE_CACHE_LINE_SIZE);
+
+ // rx.q
+ n_rxq = RTE_MAX(ctx->prm.max_stream_rbufs, 1U);
+ n_rxq = rte_align32pow2(n_rxq);
+ sz_rxq = rte_ring_get_memsize(n_rxq);
+ sz_rxq = RTE_ALIGN_CEIL(sz_rxq, RTE_CACHE_LINE_SIZE);
+
+ // rx.ofo
+ calc_ofo_elems(n_rxq, &nobj, &ndb);
+ osz = sizeof(*ofo) + sizeof(ofo->db[0]) * ndb;
+ dsz = sizeof(ofo->db[0].obj[0]) * nobj * ndb;
+ sz_ofo = osz + dsz;
+ sz_ofo = RTE_ALIGN_CEIL(sz_ofo, RTE_CACHE_LINE_SIZE);
+
+ // tx.q
+ n_txq = RTE_MAX(ctx->prm.max_stream_sbufs, 1U);
+ n_txq = rte_align32pow2(n_txq);
+ sz_txq = rte_ring_get_memsize(n_txq);
+ sz_txq = RTE_ALIGN_CEIL(sz_txq, RTE_CACHE_LINE_SIZE);
+
+ // tx.drb.r
nb = drb_nb_elem(ctx);
k = calc_stream_drb_num(ctx, nb);
- n = rte_align32pow2(k);
-
- /* size of the drbs ring */
- rsz = rte_ring_get_memsize(n);
+ n_drb = rte_align32pow2(k);
+ rsz = rte_ring_get_memsize(n_drb); /* size of the drbs ring */
rsz = RTE_ALIGN_CEIL(rsz, RTE_CACHE_LINE_SIZE);
+ bsz = tle_drb_calc_size(nb); /* size of the drb. */
+ sz_drb_r = rsz + bsz * k; /* total stream drbs size. */
+ sz_drb_r = RTE_ALIGN_CEIL(sz_drb_r, RTE_CACHE_LINE_SIZE);
- /* size of the drb. */
- bsz = tle_drb_calc_size(nb);
-
- /* total stream drbs size. */
- sz = rsz + bsz * k;
-
- s->tx.drb.r = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE,
- ctx->prm.socket_id);
- if (s->tx.drb.r == NULL) {
- TCP_LOG(ERR, "%s(%p): allocation of %zu bytes on socket %d "
+ sz = sz_s + sz_rxq + sz_ofo + sz_txq + sz_drb_r;
+ s = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE,
+ ctx->prm.socket_id);
+ if (s == NULL) {
+ TCP_LOG(ERR, "%s: allocation of %zu bytes on socket %d "
"failed with error code: %d\n",
- __func__, s, sz, ctx->prm.socket_id, rte_errno);
+ __func__, sz, ctx->prm.socket_id, rte_errno);
return -ENOMEM;
}
- snprintf(name, sizeof(name), "%p@%zu", s, sz);
- rte_ring_init(s->tx.drb.r, name, n, f);
+ s->rx.q = (struct rte_ring *)((uintptr_t)s + sz_s);
+ s->rx.ofo = (struct ofo *)((uintptr_t)s->rx.q + sz_rxq);
+ ofo = s->rx.ofo;
+ s->tx.q = (struct rte_ring *)((uintptr_t)s->rx.ofo + sz_ofo);
+ s->tx.drb.r = (struct rte_ring *)((uintptr_t)s->tx.q + sz_txq);
+ // ring flags
+ f = ((ctx->prm.flags & TLE_CTX_FLAG_ST) == 0) ? 0 :
+ (RING_F_SP_ENQ | RING_F_SC_DEQ);
+
+ /* init RX part. */
+ snprintf(name, sizeof(name), "%p@%zu", s->rx.q, sz_rxq);
+ rte_ring_init(s->rx.q, name, n_rxq, f);
+
+ obj = (struct rte_mbuf **)&ofo->db[ndb];
+ for (i = 0; i != ndb; i++) {
+ ofo->db[i].nb_max = nobj;
+ ofo->db[i].obj = obj + i * nobj;
+ }
+ ofo->nb_max = ndb;
+
+ /* init TX part. */
+ snprintf(name, sizeof(name), "%p@%zu", s->tx.q, sz_txq);
+ rte_ring_init(s->tx.q, name, n_txq, f);
+
+ snprintf(name, sizeof(name), "%p@%zu", s->tx.drb.r, sz_drb_r);
+ rte_ring_init(s->tx.drb.r, name, n_drb, f);
for (i = 0; i != k; i++) {
drb = (struct tle_drb *)((uintptr_t)s->tx.drb.r +
rsz + bsz * i);
@@ -200,7 +244,7 @@ tcp_init_streams(struct tle_ctx *ctx)
f = ((ctx->prm.flags & TLE_CTX_FLAG_ST) == 0) ? 0 :
(RING_F_SP_ENQ | RING_F_SC_DEQ);
- sz = sizeof(*ts) + sizeof(ts->s[0]) * ctx->prm.max_streams;
+ sz = sizeof(*ts);
ts = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE,
ctx->prm.socket_id);
if (ts == NULL) {
@@ -210,6 +254,7 @@ tcp_init_streams(struct tle_ctx *ctx)
return -ENOMEM;
}
+ rte_spinlock_init(&ts->dr.lock);
STAILQ_INIT(&ts->dr.fe);
STAILQ_INIT(&ts->dr.be);
@@ -228,12 +273,11 @@ tcp_init_streams(struct tle_ctx *ctx)
if (ts->tsq == NULL)
rc = -ENOMEM;
else
- rc = stbl_init(&ts->st, ctx->prm.max_streams,
- ctx->prm.socket_id);
+ rc = stbl_init(&ts->st, (ctx->prm.flags & TLE_CTX_FLAG_ST) == 0);
}
- for (i = 0; rc == 0 && i != ctx->prm.max_streams; i++)
- rc = init_stream(ctx, &ts->s[i]);
+ for (i = 0; rc == 0 && i != ctx->prm.min_streams; i++)
+ rc = add_stream(ctx);
if (rc != 0) {
TCP_LOG(ERR, "initalisation of %u-th stream failed", i);
@@ -243,11 +287,30 @@ tcp_init_streams(struct tle_ctx *ctx)
return rc;
}
-static void __attribute__((constructor))
+/*
+ * Note this function is not thread-safe, and we did not lock here as we
+ * have the assumption that this ctx is dedicated to one thread.
+ */
+static uint32_t
+tcp_more_streams(struct tle_ctx *ctx)
+{
+ uint32_t i, nb;
+ uint32_t nb_max = ctx->prm.max_streams - 1;
+ uint32_t nb_cur = ctx->streams.nb_cur;
+
+ nb = RTE_MIN(ctx->prm.delta_streams, nb_max - nb_cur);
+ for (i = 0; i < nb; i++)
+ if (add_stream(ctx) != 0)
+ break;
+ return i;
+}
+
+static void __attribute__((constructor(101)))
tcp_stream_setup(void)
{
static const struct stream_ops tcp_ops = {
.init_streams = tcp_init_streams,
+ .more_streams = tcp_more_streams,
.fini_streams = tcp_fini_streams,
.free_drbs = tcp_free_drbs,
};
@@ -305,16 +368,12 @@ tle_tcp_stream_open(struct tle_ctx *ctx,
s = (struct tle_tcp_stream *)get_stream(ctx);
if (s == NULL) {
- rte_errno = ENFILE;
- return NULL;
-
- /* some TX still pending for that stream. */
- } else if (TCP_STREAM_TX_PENDING(s)) {
- put_stream(ctx, &s->s, 0);
rte_errno = EAGAIN;
return NULL;
}
+ s->s.option.raw = prm->option;
+
/* setup L4 ports and L3 addresses fields. */
rc = stream_fill_ctx(ctx, &s->s,
(const struct sockaddr *)&prm->addr.local,
@@ -336,12 +395,14 @@ tle_tcp_stream_open(struct tle_ctx *ctx,
/* store other params */
s->flags = ctx->prm.flags;
+ s->tcb.err = 0;
s->tcb.snd.nb_retm = (prm->cfg.nb_retries != 0) ? prm->cfg.nb_retries :
TLE_TCP_DEFAULT_RETRIES;
s->tcb.snd.cwnd = (ctx->prm.icw == 0) ? TCP_INITIAL_CWND_MAX :
ctx->prm.icw;
s->tcb.snd.rto_tw = (ctx->prm.timewait == TLE_TCP_TIMEWAIT_DEFAULT) ?
TCP_RTO_2MSL : ctx->prm.timewait;
+ s->tcb.snd.rto_fw = TLE_TCP_FINWAIT_TIMEOUT;
tcp_stream_up(s);
return &s->s;
@@ -354,9 +415,16 @@ static inline int
stream_close(struct tle_ctx *ctx, struct tle_tcp_stream *s)
{
uint16_t uop;
- uint32_t state;
static const struct tle_stream_cb zcb;
+ /* Perform the uop update under this write lock; otherwise this stream
+ * may be put into the death ring twice, for example:
+ * 1) FE sets OP_CLOSE;
+ * 2) BE stream_term sets state as TCP_ST_CLOSED, and put in queue;
+ * 3) FE down the stream, and calls stream_term again.
+ */
+ tcp_stream_down(s);
+
/* check was close() already invoked */
uop = s->tcb.uop;
if ((uop & TCP_OP_CLOSE) != 0)
@@ -366,47 +434,66 @@ stream_close(struct tle_ctx *ctx, struct tle_tcp_stream *s)
if (rte_atomic16_cmpset(&s->tcb.uop, uop, uop | TCP_OP_CLOSE) == 0)
return -EDEADLK;
- /* mark stream as unavaialbe for RX/TX. */
- tcp_stream_down(s);
-
/* reset events/callbacks */
- s->rx.ev = NULL;
s->tx.ev = NULL;
+ s->rx.ev = NULL;
s->err.ev = NULL;
s->rx.cb = zcb;
s->tx.cb = zcb;
s->err.cb = zcb;
- state = s->tcb.state;
-
- /* CLOSED, LISTEN, SYN_SENT - we can close the stream straighway */
- if (state <= TCP_ST_SYN_SENT) {
+ switch (s->tcb.state) {
+ case TCP_ST_LISTEN:
+ /* close the stream straightway */
tcp_stream_reset(ctx, s);
return 0;
- }
-
- /* generate FIN and proceed with normal connection termination */
- if (state == TCP_ST_ESTABLISHED || state == TCP_ST_CLOSE_WAIT) {
-
- /* change state */
- s->tcb.state = (state == TCP_ST_ESTABLISHED) ?
- TCP_ST_FIN_WAIT_1 : TCP_ST_LAST_ACK;
-
- /* mark stream as writable/readable again */
+ case TCP_ST_CLOSED:
+ /* it could be put into this state if a RST packet is
+ * received, but this stream could still be in the tsq trying
+ * to send something.
+ */
+ /* fallthrough */
+ case TCP_ST_SYN_SENT:
+ /* timer on and could be in tsq (SYN retrans) */
+ stream_term(s);
+ /* fallthrough */
+ case TCP_ST_FIN_WAIT_1:
+ /* fallthrough */
+ case TCP_ST_CLOSING:
+ /* fallthrough */
+ case TCP_ST_TIME_WAIT:
+ /* fallthrough */
+ case TCP_ST_LAST_ACK:
tcp_stream_up(s);
-
- /* queue stream into to-send queue */
- txs_enqueue(ctx, s);
return 0;
+ case TCP_ST_ESTABLISHED:
+ /* fallthrough */
+ case TCP_ST_CLOSE_WAIT:
+ if (s->tcb.state == TCP_ST_ESTABLISHED) {
+ s->tcb.state = TCP_ST_FIN_WAIT_1;
+ TCP_DEC_STATS_ATOMIC(TCP_MIB_CURRESTAB);
+ } else
+ s->tcb.state = TCP_ST_LAST_ACK;
+
+ if (!rte_ring_empty(s->rx.q)) {
+ TCP_INC_STATS(TCP_MIB_ESTABRESETS);
+ s->tcb.uop |= TCP_OP_RESET;
+ stream_term(s);
+ }
+ break;
+ case TCP_ST_FIN_WAIT_2:
+ /* Can reach this state if shutdown was called, but the timer
+ * shall be set after this close.
+ */
+ break;
+ default:
+ rte_panic("Invalid state when close: %d\n", s->tcb.state);
}
- /*
- * accroding to the state, close() was already invoked,
- * should never that point.
- */
- RTE_ASSERT(0);
- return -EINVAL;
+ tcp_stream_up(s);
+ txs_enqueue(ctx, s);
+ return 0;
}
uint32_t
@@ -453,6 +540,64 @@ tle_tcp_stream_close(struct tle_stream *ts)
}
int
+tle_tcp_stream_shutdown(struct tle_stream *ts, int how)
+{
+ int ret;
+ bool wakeup;
+ uint32_t state;
+ struct tle_tcp_stream *s;
+
+ s = TCP_STREAM(ts);
+ if (ts == NULL || s->s.type >= TLE_VNUM)
+ return -EINVAL;
+
+ /* Refer to linux/net/ipv4/tcp.c:tcp_shutdown() */
+ if (how == SHUT_RD)
+ return 0;
+
+ tcp_stream_down(s);
+
+ state = s->tcb.state;
+
+ switch (state) {
+ case TCP_ST_LISTEN:
+ /* fallthrough */
+ case TCP_ST_SYN_SENT:
+ s->tcb.state = TCP_ST_CLOSED;
+ wakeup = true;
+ ret = 0;
+ break;
+ case TCP_ST_ESTABLISHED:
+ /* fallthrough */
+ case TCP_ST_CLOSE_WAIT:
+ if (state == TCP_ST_ESTABLISHED) {
+ TCP_DEC_STATS_ATOMIC(TCP_MIB_CURRESTAB);
+ s->tcb.state = TCP_ST_FIN_WAIT_1;
+ } else
+ s->tcb.state = TCP_ST_LAST_ACK;
+ txs_enqueue(ts->ctx, s);
+ wakeup = true;
+ ret = 0;
+ break;
+ default:
+ wakeup = false;
+ rte_errno = ENOTCONN;
+ ret = -1;
+ }
+
+ if (wakeup) {
+ /* Notify other threads which may wait on the event */
+ if (s->tx.ev)
+ tle_event_raise(s->tx.ev);
+ if (how == SHUT_RDWR && s->err.ev)
+ tle_event_raise(s->err.ev);
+ }
+
+ tcp_stream_up(s);
+ return ret;
+}
+
+int
tle_tcp_stream_get_addr(const struct tle_stream *ts,
struct tle_tcp_stream_addr *addr)
{
@@ -617,3 +762,73 @@ tle_tcp_stream_get_mss(const struct tle_stream * ts)
s = TCP_STREAM(ts);
return s->tcb.snd.mss;
}
+
+int
+tle_tcp_stream_get_info(const struct tle_stream * ts, void *info, socklen_t *optlen)
+{
+ struct tle_tcp_stream *s;
+ struct tcp_info i;
+
+ if (ts == NULL)
+ return -EINVAL;
+
+ s = TCP_STREAM(ts);
+
+ memset(&i, 0, sizeof(struct tcp_info));
+
+ /* transform from tldk state into linux kernel state */
+ switch (s->tcb.state) {
+ case TCP_ST_CLOSED:
+ i.tcpi_state = TCP_CLOSE;
+ break;
+ case TCP_ST_LISTEN:
+ i.tcpi_state = TCP_LISTEN;
+ break;
+ case TCP_ST_SYN_SENT:
+ i.tcpi_state = TCP_SYN_SENT;
+ break;
+ case TCP_ST_SYN_RCVD:
+ i.tcpi_state = TCP_SYN_RECV;
+ break;
+ case TCP_ST_ESTABLISHED:
+ i.tcpi_state = TCP_ESTABLISHED;
+ break;
+ case TCP_ST_FIN_WAIT_1:
+ i.tcpi_state = TCP_FIN_WAIT1;
+ break;
+ case TCP_ST_FIN_WAIT_2:
+ i.tcpi_state = TCP_FIN_WAIT2;
+ break;
+ case TCP_ST_CLOSE_WAIT:
+ i.tcpi_state = TCP_CLOSE_WAIT;
+ break;
+ case TCP_ST_CLOSING:
+ i.tcpi_state = TCP_CLOSING;
+ break;
+ case TCP_ST_LAST_ACK:
+ i.tcpi_state = TCP_LAST_ACK;
+ break;
+ case TCP_ST_TIME_WAIT:
+ i.tcpi_state = TCP_TIME_WAIT;
+ break;
+ }
+
+ /* FIXME: is nb_retx the right value to report as total retransmissions? */
+ i.tcpi_total_retrans = s->tcb.snd.nb_retx;
+
+ if (*optlen > sizeof(struct tcp_info))
+ *optlen = sizeof(struct tcp_info);
+ rte_memcpy(info, &i, *optlen);
+ return 0;
+}
+
+void
+tle_tcp_stream_set_keepalive(struct tle_stream *ts)
+{
+ struct tle_tcp_stream *s;
+
+ s = TCP_STREAM(ts);
+
+ s->tcb.uop |= TCP_OP_KEEPALIVE;
+ txs_enqueue(ts->ctx, s);
+}
diff --git a/lib/libtle_l4p/tcp_stream.h b/lib/libtle_l4p/tcp_stream.h
index 4629fe6..1202574 100644
--- a/lib/libtle_l4p/tcp_stream.h
+++ b/lib/libtle_l4p/tcp_stream.h
@@ -17,6 +17,8 @@
#define _TCP_STREAM_H_
#include <rte_vect.h>
+#include <rte_mbuf.h>
+
#include <tle_dring.h>
#include <tle_tcp.h>
#include <tle_event.h>
@@ -45,23 +47,28 @@ enum {
};
enum {
- TCP_OP_LISTEN = 0x1,
- TCP_OP_ACCEPT = 0x2,
- TCP_OP_CONNECT = 0x4,
- TCP_OP_CLOSE = 0x8,
+ TCP_OP_LISTEN = 0x1,
+ TCP_OP_ACCEPT = 0x2,
+ TCP_OP_CONNECT = 0x4,
+ TCP_OP_CLOSE = 0x8,
+ TCP_OP_RESET = 0x10,
+ TCP_OP_KEEPALIVE = 0x20
};
struct tcb {
+ int err;
volatile uint16_t state;
volatile uint16_t uop; /* operations by user performed */
struct {
uint32_t nxt;
+ uint32_t cpy; /* head of yet unread data */
uint32_t irs; /* initial received sequence */
uint32_t wnd;
uint32_t ts;
struct {
uint32_t seq;
- uint32_t on;
+ uint32_t on; /* on == 1: received an out-of-order fin
+ * on == 2: received an in order fin */
} frs;
uint32_t srtt; /* smoothed round trip time (scaled by >> 3) */
uint32_t rttvar; /* rtt variance */
@@ -83,15 +90,32 @@ struct tcb {
uint32_t ssthresh; /* slow start threshold */
uint32_t rto; /* retransmission timeout */
uint32_t rto_tw; /* TIME_WAIT retransmission timeout */
+ uint32_t rto_fw; /* FIN_WAIT_2 waiting timeout */
uint32_t iss; /* initial send sequence */
+ uint32_t waitlen; /* total length of unacknowledged pkt */
+ uint32_t cork_ts;
uint16_t mss;
uint8_t wscale;
uint8_t nb_retx; /* number of retransmission */
uint8_t nb_retm; /**< max number of retx attempts. */
+ uint8_t nb_keepalive;/* number of keepalive probes sent */
+ bool update_rcv; /* Flag for updating recv wind