aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--.gitreview1
-rw-r--r--Makefile39
-rw-r--r--README200
-rw-r--r--afl/lower_constructor_priority.diff16
-rwxr-xr-xafl/run.sh1
-rw-r--r--afl/seeds/seed.txtbin0 -> 90 bytes
-rw-r--r--angora/dpdk_abilist.txt1756
-rw-r--r--angora/run.sh1
-rw-r--r--angora/seeds/seed.txt1
-rw-r--r--dpdk/Makefile101
-rw-r--r--dpdk/dpdk-v18.11_patches/0001-eal-don-t-start-the-interrupt-mp-thread.patch35
-rw-r--r--dpdk/dpdk-v18.11_patches/0002-eal-prioritize-constructor.patch25
-rw-r--r--dpdk/dpdk-v18.11_patches/0003-mbuf-add-single-linked-list.patch33
-rw-r--r--dpdk/dpdk-v18.11_patches/0004-net-virtio-user-add-rss-update-for-virtio-user.patch43
-rw-r--r--dpdk/dpdk-v18.11_patches/0005-net-virtio-user-support-raw-socket-as-backend.patch645
-rw-r--r--dpdk/dpdk-v18.11_patches/0006-mempool-add-dynamic-mempool-support.patch247
-rw-r--r--dpdk/dpdk-v18.11_patches/0007-mbuf-add-dynamic-mbuf-mempool-support.patch305
-rw-r--r--dpdk/dpdk-v18.11_patches/0008-mempool-prioritize-constructor.patch30
-rw-r--r--dpdk/dpdk-v18.11_patches/0009-net-virtio-fill-desc-limit.patch42
-rw-r--r--examples/Makefile2
-rw-r--r--examples/l4fwd/main.c1
-rw-r--r--examples/l4fwd/port.h11
-rw-r--r--lib/Makefile1
-rw-r--r--lib/libtle_glue/Makefile62
-rw-r--r--lib/libtle_glue/arp.c935
-rw-r--r--lib/libtle_glue/be.c256
-rw-r--r--lib/libtle_glue/config.h71
-rw-r--r--lib/libtle_glue/ctx.c535
-rw-r--r--lib/libtle_glue/ctx.h147
-rw-r--r--lib/libtle_glue/epoll.c577
-rw-r--r--lib/libtle_glue/fd.c122
-rw-r--r--lib/libtle_glue/fd.h113
-rw-r--r--lib/libtle_glue/gateway.h96
-rw-r--r--lib/libtle_glue/icmp.c297
-rw-r--r--lib/libtle_glue/init.c155
-rw-r--r--lib/libtle_glue/internal.h152
-rw-r--r--lib/libtle_glue/log.h77
-rw-r--r--lib/libtle_glue/ndp.h33
-rw-r--r--lib/libtle_glue/packetdrill.c544
-rw-r--r--lib/libtle_glue/packetdrill.h111
-rw-r--r--lib/libtle_glue/poll.c138
-rw-r--r--lib/libtle_glue/port.c246
-rw-r--r--lib/libtle_glue/rxcb.c834
-rw-r--r--lib/libtle_glue/rxtx.c573
-rw-r--r--lib/libtle_glue/select.c201
-rw-r--r--lib/libtle_glue/sock.h154
-rw-r--r--lib/libtle_glue/socket.c720
-rw-r--r--lib/libtle_glue/sym.c129
-rw-r--r--lib/libtle_glue/sym.h118
-rw-r--r--lib/libtle_glue/tcp.c558
-rw-r--r--lib/libtle_glue/tle_glue.h114
-rw-r--r--lib/libtle_glue/udp.c419
-rw-r--r--lib/libtle_glue/util.c60
-rw-r--r--lib/libtle_glue/util.h377
-rw-r--r--lib/libtle_glue/zerocopy.h59
-rw-r--r--lib/libtle_l4p/Makefile1
-rw-r--r--lib/libtle_l4p/ctx.c349
-rw-r--r--lib/libtle_l4p/ctx.h38
-rw-r--r--lib/libtle_l4p/misc.h66
-rw-r--r--lib/libtle_l4p/net_misc.h21
-rw-r--r--lib/libtle_l4p/port_statmap.h127
-rw-r--r--lib/libtle_l4p/stream.h55
-rw-r--r--lib/libtle_l4p/stream_table.c65
-rw-r--r--lib/libtle_l4p/stream_table.h490
-rw-r--r--lib/libtle_l4p/syncookie.h4
-rw-r--r--lib/libtle_l4p/tcp_ctl.h68
-rw-r--r--lib/libtle_l4p/tcp_misc.h34
-rw-r--r--lib/libtle_l4p/tcp_ofo.c39
-rw-r--r--lib/libtle_l4p/tcp_ofo.h14
-rw-r--r--lib/libtle_l4p/tcp_rxq.h4
-rw-r--r--lib/libtle_l4p/tcp_rxtx.c1445
-rw-r--r--lib/libtle_l4p/tcp_rxtx.h252
-rw-r--r--lib/libtle_l4p/tcp_stream.c395
-rw-r--r--lib/libtle_l4p/tcp_stream.h37
-rw-r--r--lib/libtle_l4p/tcp_timer.h40
-rw-r--r--lib/libtle_l4p/tcp_tx_seg.h12
-rw-r--r--lib/libtle_l4p/tcp_txq.h29
-rw-r--r--lib/libtle_l4p/tle_ctx.h41
-rw-r--r--lib/libtle_l4p/tle_event.h2
-rw-r--r--lib/libtle_l4p/tle_stats.h101
-rw-r--r--lib/libtle_l4p/tle_tcp.h60
-rw-r--r--lib/libtle_l4p/tle_udp.h49
-rw-r--r--lib/libtle_l4p/udp_rxtx.c186
-rw-r--r--lib/libtle_l4p/udp_stream.c347
-rw-r--r--lib/libtle_l4p/udp_stream.h9
-rw-r--r--lib/libtle_timer/timer.c43
-rw-r--r--mk/tle.app.mk4
-rw-r--r--mk/tle.lib.mk4
-rw-r--r--test/Makefile4
-rw-r--r--test/gtest/Makefile1
-rw-r--r--test/gtest/test_tle_ctx.cpp1
-rw-r--r--test/gtest/test_tle_tcp_stream.cpp4
-rw-r--r--test/gtest/test_tle_tcp_stream.h4
-rw-r--r--test/gtest/test_tle_udp_destroy.cpp1
-rw-r--r--test/gtest/test_tle_udp_stream_gen.cpp53
-rw-r--r--test/gtest/test_tle_udp_stream_gen.h2
-rw-r--r--test/packetdrill/COPYING339
-rw-r--r--test/packetdrill/Makefile2
-rw-r--r--test/packetdrill/Makefile.FreeBSD2
-rw-r--r--test/packetdrill/Makefile.Linux2
-rw-r--r--test/packetdrill/Makefile.NetBSD2
-rw-r--r--test/packetdrill/Makefile.OpenBSD2
-rw-r--r--test/packetdrill/Makefile.common63
-rw-r--r--test/packetdrill/README58
-rw-r--r--test/packetdrill/assert.h10
-rw-r--r--test/packetdrill/capability.h102
-rw-r--r--test/packetdrill/checksum.c239
-rw-r--r--test/packetdrill/checksum.h54
-rw-r--r--test/packetdrill/checksum_test.c140
-rw-r--r--test/packetdrill/code.c777
-rw-r--r--test/packetdrill/code.h122
-rw-r--r--test/packetdrill/config.c605
-rw-r--r--test/packetdrill/config.h204
-rwxr-xr-xtest/packetdrill/configure3
-rw-r--r--test/packetdrill/contrib/packetdrill.el45
-rw-r--r--test/packetdrill/contrib/packetdrill.vim125
-rw-r--r--test/packetdrill/epoll.c55
-rw-r--r--test/packetdrill/epoll.h62
-rw-r--r--test/packetdrill/ethernet.h75
-rw-r--r--test/packetdrill/fd_state.h64
-rw-r--r--test/packetdrill/file.c55
-rw-r--r--test/packetdrill/file.h52
-rw-r--r--test/packetdrill/fmemopen.c81
-rw-r--r--test/packetdrill/fmemopen.h37
-rw-r--r--test/packetdrill/gre.h102
-rw-r--r--test/packetdrill/gre_packet.c56
-rw-r--r--test/packetdrill/gre_packet.h45
-rw-r--r--test/packetdrill/hash.c430
-rw-r--r--test/packetdrill/hash.h43
-rw-r--r--test/packetdrill/hash_map.c162
-rw-r--r--test/packetdrill/hash_map.h56
-rw-r--r--test/packetdrill/header.h93
-rw-r--r--test/packetdrill/icmp.h97
-rw-r--r--test/packetdrill/icmp_packet.c406
-rw-r--r--test/packetdrill/icmp_packet.h55
-rw-r--r--test/packetdrill/icmpv6.h81
-rw-r--r--test/packetdrill/ip.h108
-rw-r--r--test/packetdrill/ip_address.c379
-rw-r--r--test/packetdrill/ip_address.h131
-rw-r--r--test/packetdrill/ip_packet.c221
-rw-r--r--test/packetdrill/ip_packet.h80
-rw-r--r--test/packetdrill/ip_prefix.c148
-rw-r--r--test/packetdrill/ip_prefix.h69
-rw-r--r--test/packetdrill/ipv6.h92
-rw-r--r--test/packetdrill/lexer.l280
-rw-r--r--test/packetdrill/link_layer.c104
-rw-r--r--test/packetdrill/link_layer.h38
-rw-r--r--test/packetdrill/logging.c51
-rw-r--r--test/packetdrill/logging.h46
-rw-r--r--test/packetdrill/mpls.h113
-rw-r--r--test/packetdrill/mpls_packet.c77
-rw-r--r--test/packetdrill/mpls_packet.h57
-rw-r--r--test/packetdrill/net_utils.c172
-rw-r--r--test/packetdrill/net_utils.h56
-rw-r--r--test/packetdrill/netdev.c502
-rw-r--r--test/packetdrill/netdev.h99
-rw-r--r--test/packetdrill/open_memstream.c142
-rw-r--r--test/packetdrill/open_memstream.h37
-rw-r--r--test/packetdrill/packet.c327
-rw-r--r--test/packetdrill/packet.h425
-rw-r--r--test/packetdrill/packet_checksum.c116
-rw-r--r--test/packetdrill/packet_checksum.h33
-rw-r--r--test/packetdrill/packet_parser.c625
-rw-r--r--test/packetdrill/packet_parser.h53
-rw-r--r--test/packetdrill/packet_parser_test.c484
-rw-r--r--test/packetdrill/packet_socket.h69
-rw-r--r--test/packetdrill/packet_socket_linux.c280
-rw-r--r--test/packetdrill/packet_socket_pcap.c290
-rw-r--r--test/packetdrill/packet_to_string.c303
-rw-r--r--test/packetdrill/packet_to_string.h44
-rw-r--r--test/packetdrill/packet_to_string_test.c301
-rw-r--r--test/packetdrill/packetdrill.c113
-rw-r--r--test/packetdrill/packetdrill.h108
-rw-r--r--test/packetdrill/parse.h62
-rw-r--r--test/packetdrill/parser.y1739
-rw-r--r--test/packetdrill/pipe.c55
-rw-r--r--test/packetdrill/pipe.h54
-rw-r--r--test/packetdrill/platforms.h121
-rw-r--r--test/packetdrill/run.c695
-rw-r--r--test/packetdrill/run.h197
-rw-r--r--test/packetdrill/run_command.c55
-rw-r--r--test/packetdrill/run_command.h38
-rw-r--r--test/packetdrill/run_packet.c1934
-rw-r--r--test/packetdrill/run_packet.h61
-rw-r--r--test/packetdrill/run_system_call.c3561
-rw-r--r--test/packetdrill/run_system_call.h104
-rw-r--r--test/packetdrill/script.c745
-rw-r--r--test/packetdrill/script.h308
-rw-r--r--test/packetdrill/sctp.h40
-rw-r--r--test/packetdrill/so_testing.c169
-rw-r--r--test/packetdrill/so_testing.h55
-rw-r--r--test/packetdrill/socket.c80
-rw-r--r--test/packetdrill/socket.h311
-rw-r--r--test/packetdrill/symbols.h42
-rw-r--r--test/packetdrill/symbols_freebsd.c310
-rw-r--r--test/packetdrill/symbols_linux.c502
-rw-r--r--test/packetdrill/symbols_netbsd.c320
-rw-r--r--test/packetdrill/symbols_openbsd.c281
-rw-r--r--test/packetdrill/system.c52
-rw-r--r--test/packetdrill/system.h35
-rw-r--r--test/packetdrill/tcp.h339
-rw-r--r--test/packetdrill/tcp_options.c70
-rw-r--r--test/packetdrill/tcp_options.h129
-rw-r--r--test/packetdrill/tcp_options_iterator.c169
-rw-r--r--test/packetdrill/tcp_options_iterator.h53
-rw-r--r--test/packetdrill/tcp_options_to_string.c167
-rw-r--r--test/packetdrill/tcp_options_to_string.h40
-rw-r--r--test/packetdrill/tcp_packet.c166
-rw-r--r--test/packetdrill/tcp_packet.h51
-rw-r--r--test/packetdrill/tests/bsd/fast_retransmit/fr-4pkt-sack-bsd.pkt38
-rw-r--r--test/packetdrill/tests/linux/README7
-rw-r--r--test/packetdrill/tests/linux/blocking/blocking-accept.pkt15
-rw-r--r--test/packetdrill/tests/linux/blocking/blocking-read.pkt25
-rw-r--r--test/packetdrill/tests/linux/close/close-read-data-fin.pkt38
-rw-r--r--test/packetdrill/tests/linux/close/close-so-linger-onoff-1-linger-0-rst.pkt28
-rw-r--r--test/packetdrill/tests/linux/close/close-unread-data-rst.pkt38
-rw-r--r--test/packetdrill/tests/linux/connect/http-get-nonblocking-ts.pkt34
-rw-r--r--test/packetdrill/tests/linux/early_retransmit/er-delayed-2pkt-sack.pkt27
-rw-r--r--test/packetdrill/tests/linux/early_retransmit/er-delayed-3pkt-sack.pkt28
-rw-r--r--test/packetdrill/tests/linux/early_retransmit/er-delayed-filled-3pkt-sack.pkt31
-rw-r--r--test/packetdrill/tests/linux/early_retransmit/er-delayed-get-ack-3pkt-sack.pkt35
-rw-r--r--test/packetdrill/tests/linux/early_retransmit/er-quick-2pkt-sack.pkt27
-rw-r--r--test/packetdrill/tests/linux/early_retransmit/er-quick-3pkt-sack.pkt28
-rw-r--r--test/packetdrill/tests/linux/fast_recovery/prr-ss-ack-below-snd_una-reno.pkt51
-rw-r--r--test/packetdrill/tests/linux/fast_retransmit/fr-4pkt-sack-linux.pkt35
-rw-r--r--test/packetdrill/tests/linux/icmp/icmp-all-types.pkt71
-rw-r--r--test/packetdrill/tests/linux/inet_diag/inet-diag-ipv4-mapped-ipv6.pkt29
-rw-r--r--test/packetdrill/tests/linux/inet_diag/inet-diag-ipv4.pkt28
-rw-r--r--test/packetdrill/tests/linux/inet_diag/inet-diag-ipv6.pkt29
-rw-r--r--test/packetdrill/tests/linux/init_rto/init_rto_passive_open.pkt17
-rwxr-xr-xtest/packetdrill/tests/linux/initial_window/iw10-base-case.pkt21
-rwxr-xr-xtest/packetdrill/tests/linux/initial_window/iw10-short-response.pkt21
-rw-r--r--test/packetdrill/tests/linux/ioctl/ioctl-siocinq-fin.pkt30
-rw-r--r--test/packetdrill/tests/linux/listen/listen-incoming-ack.pkt20
-rw-r--r--test/packetdrill/tests/linux/listen/listen-incoming-no-tcp-flags.pkt21
-rw-r--r--test/packetdrill/tests/linux/listen/listen-incoming-rst.pkt22
-rw-r--r--test/packetdrill/tests/linux/listen/listen-incoming-syn-ack.pkt20
-rw-r--r--test/packetdrill/tests/linux/listen/listen-incoming-syn-rst.pkt22
-rw-r--r--test/packetdrill/tests/linux/listen/listen-unbound.pkt5
-rw-r--r--test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-client-ts.pkt17
-rw-r--r--test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-client.pkt14
-rw-r--r--test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-server-advmss-ipv4.pkt29
-rw-r--r--test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-server-advmss-ts-ipv4.pkt30
-rw-r--r--test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-server-ts.pkt20
-rw-r--r--test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-server.pkt17
-rw-r--r--test/packetdrill/tests/linux/mss/mss-setsockopt-tcp_maxseg-client.pkt24
-rw-r--r--test/packetdrill/tests/linux/mss/mss-setsockopt-tcp_maxseg-server.pkt27
-rw-r--r--test/packetdrill/tests/linux/pmtu_discovery/pmtud-10pkt-1460-to-1160.pkt54
-rw-r--r--test/packetdrill/tests/linux/pmtu_discovery/pmtud-1pkt-1460-to-1160.pkt36
-rw-r--r--test/packetdrill/tests/linux/receiver_rtt/rcv-rtt-with-timestamps-new.pkt57
-rw-r--r--test/packetdrill/tests/linux/receiver_rtt/rcv-rtt-without-timestamps-new.pkt62
-rwxr-xr-xtest/packetdrill/tests/linux/run_tests.sh6
-rw-r--r--test/packetdrill/tests/linux/sack/sack-shift-sacked-1-2-3-fack.pkt47
-rw-r--r--test/packetdrill/tests/linux/sack/sack-shift-sacked-1-2:6-fack.pkt39
-rw-r--r--test/packetdrill/tests/linux/shutdown/shutdown-rd-close.pkt29
-rw-r--r--test/packetdrill/tests/linux/shutdown/shutdown-rd-wr-close.pkt45
-rw-r--r--test/packetdrill/tests/linux/shutdown/shutdown-rdwr-close.pkt26
-rw-r--r--test/packetdrill/tests/linux/shutdown/shutdown-wr-close.pkt29
-rw-r--r--test/packetdrill/tests/linux/undo/undo-fr-ack-then-dsack-on-ack-below-snd_una.pkt55
-rw-r--r--test/packetdrill/tests/linux/undo/undo-fr-acks-dropped-then-dsack.pkt44
-rw-r--r--test/packetdrill/tests/tldk/delay_ack/delay-ack-tldk.pkt26
-rw-r--r--test/packetdrill/tests/tldk/fast_retransmit/fr-4pkt-tldk.pkt35
-rw-r--r--test/packetdrill/tests/tldk/keep_alive/keep-alive-after-accept-tldk.pkt50
-rw-r--r--test/packetdrill/tests/tldk/keep_alive/keep-alive-before-connect-tldk.pkt37
-rw-r--r--test/packetdrill/tests/tldk/keep_alive/keep-alive-enable-disable-tldk.pkt26
-rw-r--r--test/packetdrill/tests/tldk/out_of_order/ofo-simple-3pkt-tldk.pkt27
-rw-r--r--test/packetdrill/tests/tldk/tso/tso-segment-split.pkt63
-rw-r--r--test/packetdrill/tun.h117
-rw-r--r--test/packetdrill/types.c44
-rw-r--r--test/packetdrill/types.h207
-rw-r--r--test/packetdrill/uapi_linux.h296
-rw-r--r--test/packetdrill/udp.h44
-rw-r--r--test/packetdrill/udp_packet.c91
-rw-r--r--test/packetdrill/udp_packet.h44
-rw-r--r--test/packetdrill/unaligned.h53
-rw-r--r--test/packetdrill/wire_client.c302
-rw-r--r--test/packetdrill/wire_client.h69
-rw-r--r--test/packetdrill/wire_client_netdev.c167
-rw-r--r--test/packetdrill/wire_client_netdev.h37
-rw-r--r--test/packetdrill/wire_conn.c254
-rw-r--r--test/packetdrill/wire_conn.h88
-rw-r--r--test/packetdrill/wire_protocol.c49
-rw-r--r--test/packetdrill/wire_protocol.h66
-rw-r--r--test/packetdrill/wire_server.c537
-rw-r--r--test/packetdrill/wire_server.h36
-rw-r--r--test/packetdrill/wire_server_netdev.c204
-rw-r--r--test/packetdrill/wire_server_netdev.h47
-rw-r--r--test/packetdrill/wrap.c125
-rw-r--r--test/packetdrill/wrap.h32
289 files changed, 45114 insertions, 1313 deletions
diff --git a/.gitreview b/.gitreview
index 3559d4a..418bfa7 100644
--- a/.gitreview
+++ b/.gitreview
@@ -2,3 +2,4 @@
host=gerrit.fd.io
port=29418
project=tldk
+defaultbranch=dev-next-socket
diff --git a/Makefile b/Makefile
index 474ada6..10c276d 100644
--- a/Makefile
+++ b/Makefile
@@ -22,6 +22,7 @@ endif
RTE_TARGET ?= x86_64-native-linuxapp-gcc
+DIRS-y += dpdk
DIRS-y += lib
DIRS-y += examples
DIRS-y += test
@@ -32,11 +33,18 @@ MAKEFLAGS += --no-print-directory
O ?= $(TLDK_ROOT)/${RTE_TARGET}
BASE_OUTPUT ?= $(abspath $(O))
+DPDK_LIBS_PATH := $(TLDK_ROOT)/dpdk/install/lib
+TLDK_LIBS_PATH := $(TLDK_ROOT)/$(RTE_TARGET)/lib
+LIBS :=
+
.PHONY: all
all: $(DIRS-y)
.PHONY: clean
-clean: $(DIRS-y)
+clean:
+ @make clean -C test/packetdrill
+ @rm -rf $(RTE_TARGET)
+ @rm -rf libtldk.so libtldk.a
.PHONY: $(DIRS-y)
$(DIRS-y): $(RTE_SDK)/mk/rte.vars.mk
@@ -48,8 +56,37 @@ $(DIRS-y): $(RTE_SDK)/mk/rte.vars.mk
CUR_SUBDIR=$(CUR_SUBDIR)/$(@) \
S=$(CURDIR)/$(@) \
RTE_TARGET=$(RTE_TARGET) \
+ EXTRA_CFLAGS="-fPIC" \
$(filter-out $(DIRS-y),$(MAKECMDGOALS))
+test: libtldk.a libtldk.so
+
+libtldk.so: lib
+ $(eval LIBS = $(wildcard $(DPDK_LIBS_PATH)/librte*.a $(TLDK_LIBS_PATH)/*.a))
+ @gcc -shared -o libtldk.so -L$(DPDK_LIBS_PATH) -L$(TLDK_LIBS_PATH) \
+ -Wl,--whole-archive $(LIBS) -Wl,--no-whole-archive \
+ -lpthread -ldl -lnuma
+
+define repack
+@echo -- repack $1 ---
+@rm -rf tmpxyz; rm -f $1; mkdir tmpxyz; cd tmpxyz; \
+ for f in $(LIBS) ; do \
+ fn=$$(basename $$f) ; \
+ echo $$fn ; \
+ mkdir $$fn"_obj" ; \
+ cd $$fn"_obj" ; \
+ ar x $$f ; \
+ cd .. ; \
+ done; \
+ar cru ../$1 $$(find */*.o | paste -sd " " -); cd ..; rm -rf tmpxyz
+endef
+
+libtldk.a: lib
+ $(eval LIBS = $(wildcard $(DPDK_LIBS_PATH)/librte*.a))
+ $(call repack,libdpdk.a)
+ $(eval LIBS = $(wildcard $(DPDK_LIBS_PATH)/librte*.a $(TLDK_LIBS_PATH)/*.a))
+ $(call repack,libtldk.a)
+
$(RTE_SDK)/mk/rte.vars.mk:
ifeq ($(RTE_SDK),$(LOCAL_RTE_SDK))
@make RTE_TARGET=$(RTE_TARGET) config all -C $(TLDK_ROOT)/dpdk/
diff --git a/README b/README
index 2ca150b..792bdef 100644
--- a/README
+++ b/README
@@ -1,7 +1,5 @@
1. OVERVIEW
- TLDK project scope is as follows:
-
1) To implement a set of libraries for L4 protocol processing (UDP, TCP etc.)
for both IPv4 and IPv6.
@@ -16,8 +14,7 @@
code for setup, manage and perform actual IO over underlying devices are
all out of scope of these libraries.
- The only information these libraries need to know about the
- underlying devices:
+ The only information these libraries need about the underlying devices:
- supported HW offloads
- MTU and L3/L2 addresses
That allows the libraries to fill L2/L3 headers and mbuf metadata
@@ -36,12 +33,22 @@
The library uses siphash logic from the below source
https://github.com/veorq/SipHash
+2. APIs
+
+ TLDK provides three series of APIs:
+ - TLDK native APIs, provided by libtle_l4p.
+ - POSIX APIs, provided by libtle_glue with the PRELOAD compile macro.
+ - POSIX APIs with changed symbol names, provided by libtle_glue without the PRELOAD macro.
+
+
+3. INSTALLATION GUIDE
-2. INSTALLATION GUIDE
+ - Original guide
+ ----------------
1) Obtain latest supported DPDK version and build it.
(refer to http://dpdk.org for information how to download and build it).
- Currently supported(tested) DPDK versions: 18.11 LTS.
+ Currently supported(tested) DPDK versions: 16.11 LTS, 17.11 LTS, 18.02.
2) Make sure that RTE_SDK and RTE_TARGET DPDK related environment variables
are setup correctly.
3) Go to the TLDK root directory and type: 'make all'.
@@ -58,6 +65,29 @@
make all
./x86_64-native-linuxapp-gcc/app/l4fwd ...
+
+ - For preload use
+ -----------------
+
+ Debug:
+
+ $ make DPDK_DEBUG=y EXTRA_CFLAGS="-g -O0 -fPIC -DPRELOAD" all
+
+ Release:
+
+ $ make EXTRA_CFLAGS="-g -fPIC -DPRELOAD" all
+
+ - For TLDK API use
+ ------------------
+
+ Debug:
+
+ $ make DPDK_DEBUG=y EXTRA_CFLAGS="-g -O0 -fPIC" all
+
+ Release:
+
+ $ make EXTRA_CFLAGS="-g -O3 -fPIC" all
+
3. CONTENTS
$(TLDK_ROOT)
@@ -74,6 +104,8 @@
| +--libtle_l4p - implementation of the TCP/UDP packet processing
| |
| +--libtle_timer - implementation of the timer library
+ | |
+ | +--libtle_glue - socket glue layer with arp, icmp, epoll, etc
|
+----examples
| |
@@ -88,3 +120,159 @@
| | (googletest)
| |
| +--timer - UT for libtle_timer (standalone app)
+ | |
+ | +--packetdrill - UT for stack (standalone app)
+
+
+5. Features
+
+ Done:
+ - posix interface
+ - loopback device
+ - regression test
+ - multi-thread
+ - lightweight mem
+ - tcp_info (partial)
+ - fd management
+ - arp request/reply
+ - icmp reply
+ - interrupt mode
+ - blocking recv/send
+ - TSO
+ - UFO
+
+ TODO:
+ - fuzzing
+ - SACK
+ - RACK
+ - zerocopy APIs
+ - batching APIs
+ - multi-process
+ - numa awareness
+ - context recycle on thread exit
+
+5. Thread model
+
+ - Multi-process is still not fully supported.
+
+ - Symmetric multi-thread
+
+ (app thread) (app thread) (app thread)
+ \ \ \
+ / / /
+ \ \ \
+ --------------------------------------------------------
+ | FD management, Socket APIs (FE) |
+ --------------------------------------------------------
+
+ ----------- ----------- -----------
+ | | | | | |
+ | ctx | | ctx | | ctx |
+ | | | | | |
+ ----------- ----------- -----------
+ \__ | __/
+ \__ | __/
+ \__ | __/
+ \__ | __/
+ -------------------------
+ | (RSS) NIC (FDIR) |
+ -------------------------
+
+ - Lookaside multi-thread
+
+ (app thread) (app thread) (io thread)
+ \ \ \
+ / / /
+ \ \ \
+ ------------------------------------------------------
+ | FD management, Socket APIs (FE) |
+ ------------------------------------------------------
+ /
+ \
+ /
+ ------------------------------------------------------
+ | |
+ | ctx |
+ | |
+ ------------------------------------------------------
+ |
+ |
+ -------------------------
+ | NIC |
+ -------------------------
+
+6. How to run
+
+ We have two setups which need their own preparation.
+
+ - virtio-user: test with virtio-user + vhost-kernel;
+ - physical NIC: test with physical NIC bound to vfio.
+
+ If you are using physical NIC:
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ a. Set Linux boot options (Only needed if you will use physical NIC)
+ a1. Add below configuration into GRUB_CMDLINE_LINUX in /etc/default/grub
+ "intel_iommu=on iommu=pt"
+
+ a2. Update grub
+ $ sudo grub2-mkconfig -o /boot/grub2/grub.cfg
+
+ If you want to use 1GB hugepages, you can also add below content in the
+ boot cmdline:
+ "default_hugepagesz=1G hugepagesz=1G hugepages=2"
+
+ b. Adjust RLIMIT_MEMLOCK (Only needed if you will use physical NIC)
+ Add below two lines into /etc/security/limits.conf
+ "* soft memlock 4194304
+ * hard memlock 4194304"
+
+ c. Reboot system
+
+ d. Bind NIC to vfio-pci
+
+ $ sudo modprobe vfio-pci
+ $ sudo ./usertools/dpdk-devbind.py -b vfio-pci 0000:01:00.1
+ $ sudo chmod 666 /dev/vfio/16 (16 is just an example)
+
+ If you are using virtio-user:
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ a. Prepare vhost-kernel
+
+ $ sudo modprobe vhost-net
+ (if you don't have those modules, you have to compile them yourself)
+ $ sudo chmod 666 /dev/vhost-net
+ $ sudo tunctl -u <your username>
+
+ b. Prepare the vNIC
+
+ $ export DPDK_VNIC="--vdev=virtio_user0,path=/dev/vhost-net,queue_size=1024,iface=tap0"
+
+ For both cases, we need to:
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ $ sudo chmod 777 /dev/hugepages
+ $ export DPDK_IP=1.1.1.1
+
+ Note: for a specific test example, refer to the commit log of that example.
+
+7. How to run packetdrill tests:
+
+ Compile it in LOOK_ASIDE_BACKEND mode:
+
+ $ make PACKETDRILL=y EXTRA_CFLAGS="-g -O0 -fPIC -march=native -DLOOK_ASIDE_BACKEND " all
+
+ To run it:
+
+ $ test/packetdrill/packetdrill --local_ip=192.168.0.2 \
+ --remote_ip=192.0.2.1 --so_filename=`pwd`/libtldk.so \
+ test/packetdrill/tests/tldk/fast_retransmit/fr-4pkt-tldk.pkt
+
+8. Tested Examples
+
+ - examples/client
+ - examples/server
+ - wget (epoll)
+ - curl (poll)
+ - haproxy (multi-thread mode)
diff --git a/afl/lower_constructor_priority.diff b/afl/lower_constructor_priority.diff
new file mode 100644
index 0000000..b1eba07
--- /dev/null
+++ b/afl/lower_constructor_priority.diff
@@ -0,0 +1,16 @@
+diff --git a/llvm_mode/afl-llvm-rt.o.c b/llvm_mode/afl-llvm-rt.o.c
+index debde20..69e2e4c 100644
+--- a/llvm_mode/afl-llvm-rt.o.c
++++ b/llvm_mode/afl-llvm-rt.o.c
+@@ -39,9 +39,9 @@
+ the LLVM-generated runtime initialization pass, not before. */
+
+ #ifdef USE_TRACE_PC
+-# define CONST_PRIO 5
++# define CONST_PRIO 2005
+ #else
+-# define CONST_PRIO 0
++# define CONST_PRIO 2000
+ #endif /* ^USE_TRACE_PC */
+
+ #include <sys/mman.h>
diff --git a/afl/run.sh b/afl/run.sh
new file mode 100755
index 0000000..23213f6
--- /dev/null
+++ b/afl/run.sh
@@ -0,0 +1 @@
+`pwd`/AFLplusplus/afl-fuzz -m 4096 -i seeds -o output ../x86_64-native-linuxapp-gcc/app/tcp_lo 127.0.0.1 1234 @@
diff --git a/afl/seeds/seed.txt b/afl/seeds/seed.txt
new file mode 100644
index 0000000..1ed4d76
--- /dev/null
+++ b/afl/seeds/seed.txt
Binary files differ
diff --git a/angora/dpdk_abilist.txt b/angora/dpdk_abilist.txt
new file mode 100644
index 0000000..f02f7c7
--- /dev/null
+++ b/angora/dpdk_abilist.txt
@@ -0,0 +1,1756 @@
+fun:pci_find_max_end_va=uninstrumented
+fun:pci_parse_one_sysfs_resource=uninstrumented
+fun:pci_update_device=uninstrumented
+fun:rte_pci_get_iommu_class=uninstrumented
+fun:rte_pci_ioport_map=uninstrumented
+fun:rte_pci_ioport_read=uninstrumented
+fun:rte_pci_ioport_unmap=uninstrumented
+fun:rte_pci_ioport_write=uninstrumented
+fun:rte_pci_map_device=uninstrumented
+fun:rte_pci_read_config=uninstrumented
+fun:rte_pci_scan=uninstrumented
+fun:rte_pci_unmap_device=uninstrumented
+fun:rte_pci_write_config=uninstrumented
+fun:pci_name_set=uninstrumented
+fun:rte_pci_add_device=uninstrumented
+fun:rte_pci_dump=uninstrumented
+fun:rte_pci_get_sysfs_path=uninstrumented
+fun:rte_pci_insert_device=uninstrumented
+fun:rte_pci_match=uninstrumented
+fun:rte_pci_probe=uninstrumented
+fun:rte_pci_register=uninstrumented
+fun:rte_pci_unregister=uninstrumented
+fun:pci_uio_map_resource=uninstrumented
+fun:pci_uio_remap_resource=uninstrumented
+fun:pci_uio_unmap_resource=uninstrumented
+fun:rte_pci_dev_iterate=uninstrumented
+fun:pci_uio_alloc_resource=uninstrumented
+fun:pci_uio_free_resource=uninstrumented
+fun:pci_uio_ioport_map=uninstrumented
+fun:pci_uio_ioport_read=uninstrumented
+fun:pci_uio_ioport_unmap=uninstrumented
+fun:pci_uio_ioport_write=uninstrumented
+fun:pci_uio_map_resource_by_index=uninstrumented
+fun:pci_uio_read_config=uninstrumented
+fun:pci_uio_write_config=uninstrumented
+fun:pci_vfio_ioport_map=uninstrumented
+fun:pci_vfio_ioport_read=uninstrumented
+fun:pci_vfio_ioport_unmap=uninstrumented
+fun:pci_vfio_ioport_write=uninstrumented
+fun:pci_vfio_is_enabled=uninstrumented
+fun:pci_vfio_map_resource=uninstrumented
+fun:pci_vfio_read_config=uninstrumented
+fun:pci_vfio_unmap_resource=uninstrumented
+fun:pci_vfio_write_config=uninstrumented
+fun:rte_vdev_add_custom_scan=uninstrumented
+fun:rte_vdev_find_device=uninstrumented
+fun:rte_vdev_init=uninstrumented
+fun:rte_vdev_register=uninstrumented
+fun:rte_vdev_remove_custom_scan=uninstrumented
+fun:rte_vdev_uninit=uninstrumented
+fun:rte_vdev_unregister=uninstrumented
+fun:rte_vdev_dev_iterate=uninstrumented
+fun:cmdline_free=uninstrumented
+fun:cmdline_in=uninstrumented
+fun:cmdline_interact=uninstrumented
+fun:cmdline_new=uninstrumented
+fun:cmdline_poll=uninstrumented
+fun:cmdline_printf=uninstrumented
+fun:cmdline_quit=uninstrumented
+fun:cmdline_set_prompt=uninstrumented
+fun:cmdline_write_char=uninstrumented
+fun:cirbuf_add_buf_head=uninstrumented
+fun:cirbuf_add_buf_tail=uninstrumented
+fun:cirbuf_add_head=uninstrumented
+fun:cirbuf_add_head_safe=uninstrumented
+fun:cirbuf_add_tail=uninstrumented
+fun:cirbuf_add_tail_safe=uninstrumented
+fun:cirbuf_align_left=uninstrumented
+fun:cirbuf_align_right=uninstrumented
+fun:cirbuf_del_buf_head=uninstrumented
+fun:cirbuf_del_buf_tail=uninstrumented
+fun:cirbuf_del_head=uninstrumented
+fun:cirbuf_del_head_safe=uninstrumented
+fun:cirbuf_del_tail=uninstrumented
+fun:cirbuf_del_tail_safe=uninstrumented
+fun:cirbuf_get_buf_head=uninstrumented
+fun:cirbuf_get_buf_tail=uninstrumented
+fun:cirbuf_get_head=uninstrumented
+fun:cirbuf_get_tail=uninstrumented
+fun:cirbuf_init=uninstrumented
+fun:cmdline_complete=uninstrumented
+fun:cmdline_isendofcommand=uninstrumented
+fun:cmdline_isendoftoken=uninstrumented
+fun:cmdline_parse=uninstrumented
+fun:cmdline_get_help_etheraddr=uninstrumented
+fun:cmdline_parse_etheraddr=uninstrumented
+fun:cmdline_get_help_ipaddr=uninstrumented
+fun:cmdline_parse_ipaddr=uninstrumented
+fun:cmdline_get_help_num=uninstrumented
+fun:cmdline_parse_num=uninstrumented
+fun:cmdline_get_help_portlist=uninstrumented
+fun:cmdline_parse_portlist=uninstrumented
+fun:cmdline_complete_get_elt_string=uninstrumented
+fun:cmdline_complete_get_nb_string=uninstrumented
+fun:cmdline_get_help_string=uninstrumented
+fun:cmdline_parse_string=uninstrumented
+fun:rdline_add_history=uninstrumented
+fun:rdline_char_in=uninstrumented
+fun:rdline_clear_history=uninstrumented
+fun:rdline_get_buffer=uninstrumented
+fun:rdline_get_history_item=uninstrumented
+fun:rdline_init=uninstrumented
+fun:rdline_newline=uninstrumented
+fun:rdline_quit=uninstrumented
+fun:rdline_redisplay=uninstrumented
+fun:rdline_reset=uninstrumented
+fun:rdline_restart=uninstrumented
+fun:rdline_stop=uninstrumented
+fun:cmdline_file_new=uninstrumented
+fun:cmdline_stdin_exit=uninstrumented
+fun:cmdline_stdin_new=uninstrumented
+fun:vt100_init=uninstrumented
+fun:vt100_parser=uninstrumented
+fun:eal_create_runtime_dir=uninstrumented
+fun:eal_parse_sysfs_value=uninstrumented
+fun:eal_proc_type_detect=uninstrumented
+fun:rte_eal_check_module=uninstrumented
+fun:rte_eal_cleanup=uninstrumented
+fun:rte_eal_create_uio_dev=uninstrumented
+fun:rte_eal_get_configuration=uninstrumented
+fun:rte_eal_get_runtime_dir=uninstrumented
+fun:rte_eal_has_hugepages=uninstrumented
+fun:rte_eal_has_pci=uninstrumented
+fun:rte_eal_init=uninstrumented
+fun:rte_eal_iopl_init=uninstrumented
+fun:rte_eal_iova_mode=uninstrumented
+fun:rte_eal_lcore_role=uninstrumented
+fun:rte_eal_mbuf_user_pool_ops=uninstrumented
+fun:rte_eal_process_type=uninstrumented
+fun:rte_eal_vfio_intr_mode=uninstrumented
+fun:rte_set_application_usage_hook=uninstrumented
+fun:rte_eal_alarm_cancel=uninstrumented
+fun:rte_eal_alarm_init=uninstrumented
+fun:rte_eal_alarm_set=uninstrumented
+fun:rte_bus_dump=uninstrumented
+fun:rte_bus_find=uninstrumented
+fun:rte_bus_find_by_device=uninstrumented
+fun:rte_bus_find_by_device_name=uninstrumented
+fun:rte_bus_find_by_name=uninstrumented
+fun:rte_bus_get_iommu_class=uninstrumented
+fun:rte_bus_probe=uninstrumented
+fun:rte_bus_register=uninstrumented
+fun:rte_bus_scan=uninstrumented
+fun:rte_bus_sigbus_handler=uninstrumented
+fun:rte_bus_unregister=uninstrumented
+fun:rte_class_find=uninstrumented
+fun:rte_class_find_by_name=uninstrumented
+fun:rte_class_register=uninstrumented
+fun:rte_class_unregister=uninstrumented
+fun:rte_cpu_check_supported=uninstrumented
+fun:rte_cpu_is_supported=uninstrumented
+fun:local_dev_probe=uninstrumented
+fun:local_dev_remove=uninstrumented
+fun:rte_dev_event_callback_process=uninstrumented
+fun:rte_dev_event_callback_register=uninstrumented
+fun:rte_dev_event_callback_unregister=uninstrumented
+fun:rte_dev_is_probed=uninstrumented
+fun:rte_dev_iterator_init=uninstrumented
+fun:rte_dev_iterator_next=uninstrumented
+fun:rte_dev_probe=uninstrumented
+fun:rte_dev_remove=uninstrumented
+fun:rte_eal_hotplug_add=uninstrumented
+fun:rte_eal_hotplug_remove=uninstrumented
+fun:rte_devargs_add=uninstrumented
+fun:rte_devargs_dump=uninstrumented
+fun:rte_devargs_insert=uninstrumented
+fun:rte_devargs_layers_parse=uninstrumented
+fun:rte_devargs_next=uninstrumented
+fun:rte_devargs_parse=uninstrumented
+fun:rte_devargs_parsef=uninstrumented
+fun:rte_devargs_remove=uninstrumented
+fun:rte_devargs_type_count=uninstrumented
+fun:rte_strerror=uninstrumented
+fun:rte_fbarray_attach=uninstrumented
+fun:rte_fbarray_destroy=uninstrumented
+fun:rte_fbarray_detach=uninstrumented
+fun:rte_fbarray_dump_metadata=uninstrumented
+fun:rte_fbarray_find_contig_free=uninstrumented
+fun:rte_fbarray_find_contig_used=uninstrumented
+fun:rte_fbarray_find_idx=uninstrumented
+fun:rte_fbarray_find_next_free=uninstrumented
+fun:rte_fbarray_find_next_n_free=uninstrumented
+fun:rte_fbarray_find_next_n_used=uninstrumented
+fun:rte_fbarray_find_next_used=uninstrumented
+fun:rte_fbarray_find_prev_free=uninstrumented
+fun:rte_fbarray_find_prev_n_free=uninstrumented
+fun:rte_fbarray_find_prev_n_used=uninstrumented
+fun:rte_fbarray_find_prev_used=uninstrumented
+fun:rte_fbarray_find_rev_contig_free=uninstrumented
+fun:rte_fbarray_find_rev_contig_used=uninstrumented
+fun:rte_fbarray_get=uninstrumented
+fun:rte_fbarray_init=uninstrumented
+fun:rte_fbarray_is_used=uninstrumented
+fun:rte_fbarray_set_free=uninstrumented
+fun:rte_fbarray_set_used=uninstrumented
+fun:rte_hexdump=uninstrumented
+fun:rte_memdump=uninstrumented
+fun:rte_hypervisor_get_name=uninstrumented
+fun:rte_eal_get_lcore_state=uninstrumented
+fun:rte_eal_mp_remote_launch=uninstrumented
+fun:rte_eal_mp_wait_lcore=uninstrumented
+fun:rte_eal_wait_lcore=uninstrumented
+fun:rte_eal_cpu_init=uninstrumented
+fun:rte_socket_count=uninstrumented
+fun:rte_socket_id_by_idx=uninstrumented
+fun:eal_log_set_default=uninstrumented
+fun:rte_log=uninstrumented
+fun:rte_log_cur_msg_loglevel=uninstrumented
+fun:rte_log_cur_msg_logtype=uninstrumented
+fun:rte_log_dump=uninstrumented
+fun:rte_log_get_global_level=uninstrumented
+fun:rte_log_get_level=uninstrumented
+fun:rte_log_register=uninstrumented
+fun:rte_log_register_type_and_pick_level=uninstrumented
+fun:rte_log_save_pattern=uninstrumented
+fun:rte_log_save_regexp=uninstrumented
+fun:rte_log_set_global_level=uninstrumented
+fun:rte_log_set_level=uninstrumented
+fun:rte_log_set_level_pattern=uninstrumented
+fun:rte_log_set_level_regexp=uninstrumented
+fun:rte_openlog_stream=uninstrumented
+fun:rte_vlog=uninstrumented
+fun:eal_memalloc_is_contig=uninstrumented
+fun:eal_memalloc_mem_alloc_validate=uninstrumented
+fun:eal_memalloc_mem_alloc_validator_register=uninstrumented
+fun:eal_memalloc_mem_alloc_validator_unregister=uninstrumented
+fun:eal_memalloc_mem_event_callback_register=uninstrumented
+fun:eal_memalloc_mem_event_callback_unregister=uninstrumented
+fun:eal_memalloc_mem_event_notify=uninstrumented
+fun:eal_get_virtual_area=uninstrumented
+fun:rte_dump_physmem_layout=uninstrumented
+fun:rte_eal_get_physmem_size=uninstrumented
+fun:rte_eal_memory_init=uninstrumented
+fun:rte_mem_alloc_validator_register=uninstrumented
+fun:rte_mem_alloc_validator_unregister=uninstrumented
+fun:rte_mem_check_dma_mask=uninstrumented
+fun:rte_mem_check_dma_mask_thread_unsafe=uninstrumented
+fun:rte_mem_event_callback_register=uninstrumented
+fun:rte_mem_event_callback_unregister=uninstrumented
+fun:rte_mem_iova2virt=uninstrumented
+fun:rte_mem_lock_page=uninstrumented
+fun:rte_mem_set_dma_mask=uninstrumented
+fun:rte_mem_virt2memseg=uninstrumented
+fun:rte_mem_virt2memseg_list=uninstrumented
+fun:rte_memory_get_nchannel=uninstrumented
+fun:rte_memory_get_nrank=uninstrumented
+fun:rte_memseg_contig_walk=uninstrumented
+fun:rte_memseg_contig_walk_thread_unsafe=uninstrumented
+fun:rte_memseg_get_fd=uninstrumented
+fun:rte_memseg_get_fd_offset=uninstrumented
+fun:rte_memseg_get_fd_offset_thread_unsafe=uninstrumented
+fun:rte_memseg_get_fd_thread_unsafe=uninstrumented
+fun:rte_memseg_list_walk=uninstrumented
+fun:rte_memseg_list_walk_thread_unsafe=uninstrumented
+fun:rte_memseg_walk=uninstrumented
+fun:rte_memseg_walk_thread_unsafe=uninstrumented
+fun:rte_eal_memzone_init=uninstrumented
+fun:rte_memzone_dump=uninstrumented
+fun:rte_memzone_free=uninstrumented
+fun:rte_memzone_lookup=uninstrumented
+fun:rte_memzone_reserve=uninstrumented
+fun:rte_memzone_reserve_aligned=uninstrumented
+fun:rte_memzone_reserve_bounded=uninstrumented
+fun:rte_memzone_walk=uninstrumented
+fun:eal_adjust_config=uninstrumented
+fun:eal_check_common_options=uninstrumented
+fun:eal_common_usage=uninstrumented
+fun:eal_option_device_parse=uninstrumented
+fun:eal_parse_common_option=uninstrumented
+fun:eal_plugins_init=uninstrumented
+fun:eal_reset_internal_config=uninstrumented
+fun:rte_eal_primary_proc_alive=uninstrumented
+fun:rte_mp_action_register=uninstrumented
+fun:rte_mp_action_unregister=uninstrumented
+fun:rte_mp_channel_init=uninstrumented
+fun:rte_mp_reply=uninstrumented
+fun:rte_mp_request_async=uninstrumented
+fun:rte_mp_request_sync=uninstrumented
+fun:rte_mp_sendmsg=uninstrumented
+fun:rte_strscpy=uninstrumented
+fun:rte_strsplit=uninstrumented
+fun:rte_dump_tailq=uninstrumented
+fun:rte_eal_tailq_lookup=uninstrumented
+fun:rte_eal_tailq_register=uninstrumented
+fun:rte_eal_tailqs_init=uninstrumented
+fun:eal_cpuset_socket_id=uninstrumented
+fun:eal_thread_dump_affinity=uninstrumented
+fun:rte_ctrl_thread_create=uninstrumented
+fun:rte_lcore_has_role=uninstrumented
+fun:rte_socket_id=uninstrumented
+fun:rte_thread_get_affinity=uninstrumented
+fun:rte_thread_set_affinity=uninstrumented
+fun:rte_delay_us_block=uninstrumented
+fun:rte_delay_us_callback_register=uninstrumented
+fun:rte_delay_us_sleep=uninstrumented
+fun:rte_get_tsc_hz=uninstrumented
+fun:set_tsc_freq=uninstrumented
+fun:rte_uuid_compare=uninstrumented
+fun:rte_uuid_is_null=uninstrumented
+fun:rte_uuid_parse=uninstrumented
+fun:rte_uuid_unparse=uninstrumented
+fun:rte_cpu_getauxval=uninstrumented
+fun:rte_cpu_strcmp_auxval=uninstrumented
+fun:__rte_panic=uninstrumented
+fun:rte_dump_registers=uninstrumented
+fun:rte_dump_stack=uninstrumented
+fun:rte_exit=uninstrumented
+fun:dev_sigbus_handler_register=uninstrumented
+fun:dev_sigbus_handler_unregister=uninstrumented
+fun:rte_dev_event_monitor_start=uninstrumented
+fun:rte_dev_event_monitor_stop=uninstrumented
+fun:rte_dev_hotplug_handle_disable=uninstrumented
+fun:rte_dev_hotplug_handle_enable=uninstrumented
+fun:eal_hugepage_info_init=uninstrumented
+fun:eal_hugepage_info_read=uninstrumented
+fun:rte_eal_intr_init=uninstrumented
+fun:rte_epoll_ctl=uninstrumented
+fun:rte_epoll_wait=uninstrumented
+fun:rte_intr_allow_others=uninstrumented
+fun:rte_intr_callback_register=uninstrumented
+fun:rte_intr_callback_unregister=uninstrumented
+fun:rte_intr_cap_multiple=uninstrumented
+fun:rte_intr_disable=uninstrumented
+fun:rte_intr_dp_is_en=uninstrumented
+fun:rte_intr_efd_disable=uninstrumented
+fun:rte_intr_efd_enable=uninstrumented
+fun:rte_intr_enable=uninstrumented
+fun:rte_intr_free_epoll_fd=uninstrumented
+fun:rte_intr_rx_ctl=uninstrumented
+fun:rte_intr_tls_epfd=uninstrumented
+fun:eal_cpu_core_id=uninstrumented
+fun:eal_cpu_detected=uninstrumented
+fun:eal_cpu_socket_id=uninstrumented
+fun:rte_eal_log_init=uninstrumented
+fun:eal_memalloc_alloc_seg=uninstrumented
+fun:eal_memalloc_alloc_seg_bulk=uninstrumented
+fun:eal_memalloc_free_seg=uninstrumented
+fun:eal_memalloc_free_seg_bulk=uninstrumented
+fun:eal_memalloc_get_seg_fd=uninstrumented
+fun:eal_memalloc_get_seg_fd_offset=uninstrumented
+fun:eal_memalloc_init=uninstrumented
+fun:eal_memalloc_set_seg_fd=uninstrumented
+fun:eal_memalloc_sync_with_primary=uninstrumented
+fun:rte_eal_hugepage_attach=uninstrumented
+fun:rte_eal_hugepage_init=uninstrumented
+fun:rte_eal_memseg_init=uninstrumented
+fun:rte_eal_using_phys_addrs=uninstrumented
+fun:rte_mem_virt2iova=uninstrumented
+fun:rte_mem_virt2phy=uninstrumented
+fun:eal_thread_init_master=uninstrumented
+fun:eal_thread_loop=uninstrumented
+fun:rte_eal_remote_launch=uninstrumented
+fun:rte_sys_gettid=uninstrumented
+fun:rte_thread_setname=uninstrumented
+fun:get_tsc_freq=uninstrumented
+fun:rte_eal_hpet_init=uninstrumented
+fun:rte_eal_timer_init=uninstrumented
+fun:rte_get_hpet_cycles=uninstrumented
+fun:rte_get_hpet_hz=uninstrumented
+fun:rte_vfio_clear_group=uninstrumented
+fun:rte_vfio_container_create=uninstrumented
+fun:rte_vfio_container_destroy=uninstrumented
+fun:rte_vfio_container_dma_map=uninstrumented
+fun:rte_vfio_container_dma_unmap=uninstrumented
+fun:rte_vfio_container_group_bind=uninstrumented
+fun:rte_vfio_container_group_unbind=uninstrumented
+fun:rte_vfio_dma_map=uninstrumented
+fun:rte_vfio_dma_unmap=uninstrumented
+fun:rte_vfio_enable=uninstrumented
+fun:rte_vfio_get_container_fd=uninstrumented
+fun:rte_vfio_get_group_fd=uninstrumented
+fun:rte_vfio_get_group_num=uninstrumented
+fun:rte_vfio_is_enabled=uninstrumented
+fun:rte_vfio_noiommu_is_enabled=uninstrumented
+fun:rte_vfio_release_device=uninstrumented
+fun:rte_vfio_setup_device=uninstrumented
+fun:vfio_get_default_container_fd=uninstrumented
+fun:vfio_has_supported_extensions=uninstrumented
+fun:vfio_set_iommu_type=uninstrumented
+fun:vfio_mp_sync_setup=uninstrumented
+fun:eal_dev_hotplug_request_to_primary=uninstrumented
+fun:eal_dev_hotplug_request_to_secondary=uninstrumented
+fun:rte_mp_dev_hotplug_init=uninstrumented
+fun:malloc_elem_alloc=uninstrumented
+fun:malloc_elem_can_hold=uninstrumented
+fun:malloc_elem_dump=uninstrumented
+fun:malloc_elem_find_max_iova_contig=uninstrumented
+fun:malloc_elem_free=uninstrumented
+fun:malloc_elem_free_list_index=uninstrumented
+fun:malloc_elem_free_list_insert=uninstrumented
+fun:malloc_elem_free_list_remove=uninstrumented
+fun:malloc_elem_hide_region=uninstrumented
+fun:malloc_elem_init=uninstrumented
+fun:malloc_elem_insert=uninstrumented
+fun:malloc_elem_join_adjacent_free=uninstrumented
+fun:malloc_elem_resize=uninstrumented
+fun:alloc_pages_on_heap=uninstrumented
+fun:malloc_heap_add_external_memory=uninstrumented
+fun:malloc_heap_alloc=uninstrumented
+fun:malloc_heap_alloc_biggest=uninstrumented
+fun:malloc_heap_create=uninstrumented
+fun:malloc_heap_destroy=uninstrumented
+fun:malloc_heap_dump=uninstrumented
+fun:malloc_heap_free=uninstrumented
+fun:malloc_heap_free_pages=uninstrumented
+fun:malloc_heap_get_stats=uninstrumented
+fun:malloc_heap_remove_external_memory=uninstrumented
+fun:malloc_heap_resize=uninstrumented
+fun:malloc_socket_to_heap_id=uninstrumented
+fun:rollback_expand_heap=uninstrumented
+fun:rte_eal_malloc_heap_init=uninstrumented
+fun:register_mp_requests=uninstrumented
+fun:request_sync=uninstrumented
+fun:request_to_primary=uninstrumented
+fun:rte_cpu_get_flag_enabled=uninstrumented
+fun:rte_cpu_get_flag_name=uninstrumented
+fun:get_tsc_freq_arch=uninstrumented
+fun:rte_hypervisor_get=uninstrumented
+fun:rte_keepalive_create=uninstrumented
+fun:rte_keepalive_dispatch_pings=uninstrumented
+fun:rte_keepalive_mark_alive=uninstrumented
+fun:rte_keepalive_mark_sleep=uninstrumented
+fun:rte_keepalive_register_core=uninstrumented
+fun:rte_keepalive_register_relay_callback=uninstrumented
+fun:rte_calloc=uninstrumented
+fun:rte_calloc_socket=uninstrumented
+fun:rte_free=uninstrumented
+fun:rte_malloc=uninstrumented
+fun:rte_malloc_dump_heaps=uninstrumented
+fun:rte_malloc_dump_stats=uninstrumented
+fun:rte_malloc_get_socket_stats=uninstrumented
+fun:rte_malloc_heap_create=uninstrumented
+fun:rte_malloc_heap_destroy=uninstrumented
+fun:rte_malloc_heap_get_socket=uninstrumented
+fun:rte_malloc_heap_memory_add=uninstrumented
+fun:rte_malloc_heap_memory_attach=uninstrumented
+fun:rte_malloc_heap_memory_detach=uninstrumented
+fun:rte_malloc_heap_memory_remove=uninstrumented
+fun:rte_malloc_heap_socket_is_external=uninstrumented
+fun:rte_malloc_set_limit=uninstrumented
+fun:rte_malloc_socket=uninstrumented
+fun:rte_malloc_validate=uninstrumented
+fun:rte_malloc_virt2iova=uninstrumented
+fun:rte_realloc=uninstrumented
+fun:rte_zmalloc=uninstrumented
+fun:rte_zmalloc_socket=uninstrumented
+fun:rte_option_init=uninstrumented
+fun:rte_option_parse=uninstrumented
+fun:rte_option_register=uninstrumented
+fun:rte_reciprocal_value=uninstrumented
+fun:rte_reciprocal_value_u64=uninstrumented
+fun:rte_service_attr_get=uninstrumented
+fun:rte_service_attr_reset_all=uninstrumented
+fun:rte_service_component_register=uninstrumented
+fun:rte_service_component_runstate_set=uninstrumented
+fun:rte_service_component_unregister=uninstrumented
+fun:rte_service_dump=uninstrumented
+fun:rte_service_finalize=uninstrumented
+fun:rte_service_get_by_name=uninstrumented
+fun:rte_service_get_count=uninstrumented
+fun:rte_service_get_name=uninstrumented
+fun:rte_service_init=uninstrumented
+fun:rte_service_lcore_add=uninstrumented
+fun:rte_service_lcore_attr_get=uninstrumented
+fun:rte_service_lcore_attr_reset_all=uninstrumented
+fun:rte_service_lcore_count=uninstrumented
+fun:rte_service_lcore_count_services=uninstrumented
+fun:rte_service_lcore_del=uninstrumented
+fun:rte_service_lcore_list=uninstrumented
+fun:rte_service_lcore_reset_all=uninstrumented
+fun:rte_service_lcore_start=uninstrumented
+fun:rte_service_lcore_stop=uninstrumented
+fun:rte_service_map_lcore_get=uninstrumented
+fun:rte_service_map_lcore_set=uninstrumented
+fun:rte_service_may_be_active=uninstrumented
+fun:rte_service_probe_capability=uninstrumented
+fun:rte_service_run_iter_on_app_lcore=uninstrumented
+fun:rte_service_runstate_get=uninstrumented
+fun:rte_service_runstate_set=uninstrumented
+fun:rte_service_set_runstate_mapped_check=uninstrumented
+fun:rte_service_set_stats_enable=uninstrumented
+fun:rte_service_start_with_defaults=uninstrumented
+fun:eth_dev_to_id=uninstrumented
+fun:eth_find_device=uninstrumented
+fun:rte_eth_devargs_parse_list=uninstrumented
+fun:rte_eth_devargs_parse_representor_ports=uninstrumented
+fun:__rte_eth_dev_profile_init=uninstrumented
+fun:_rte_eth_dev_callback_process=uninstrumented
+fun:_rte_eth_dev_reset=uninstrumented
+fun:rte_eth_add_first_rx_callback=uninstrumented
+fun:rte_eth_add_rx_callback=uninstrumented
+fun:rte_eth_add_tx_callback=uninstrumented
+fun:rte_eth_allmulticast_disable=uninstrumented
+fun:rte_eth_allmulticast_enable=uninstrumented
+fun:rte_eth_allmulticast_get=uninstrumented
+fun:rte_eth_dev_adjust_nb_rx_tx_desc=uninstrumented
+fun:rte_eth_dev_allocate=uninstrumented
+fun:rte_eth_dev_allocated=uninstrumented
+fun:rte_eth_dev_attach_secondary=uninstrumented
+fun:rte_eth_dev_callback_register=uninstrumented
+fun:rte_eth_dev_callback_unregister=uninstrumented
+fun:rte_eth_dev_close=uninstrumented
+fun:rte_eth_dev_configure=uninstrumented
+fun:rte_eth_dev_count=uninstrumented
+fun:rte_eth_dev_count_avail=uninstrumented
+fun:rte_eth_dev_count_total=uninstrumented
+fun:rte_eth_dev_create=uninstrumented
+fun:rte_eth_dev_default_mac_addr_set=uninstrumented
+fun:rte_eth_dev_destroy=uninstrumented
+fun:rte_eth_dev_filter_ctrl=uninstrumented
+fun:rte_eth_dev_filter_supported=uninstrumented
+fun:rte_eth_dev_flow_ctrl_get=uninstrumented
+fun:rte_eth_dev_flow_ctrl_set=uninstrumented
+fun:rte_eth_dev_fw_version_get=uninstrumented
+fun:rte_eth_dev_get_dcb_info=uninstrumented
+fun:rte_eth_dev_get_eeprom=uninstrumented
+fun:rte_eth_dev_get_eeprom_length=uninstrumented
+fun:rte_eth_dev_get_module_eeprom=uninstrumented
+fun:rte_eth_dev_get_module_info=uninstrumented
+fun:rte_eth_dev_get_mtu=uninstrumented
+fun:rte_eth_dev_get_name_by_port=uninstrumented
+fun:rte_eth_dev_get_port_by_name=uninstrumented
+fun:rte_eth_dev_get_reg_info=uninstrumented
+fun:rte_eth_dev_get_sec_ctx=uninstrumented
+fun:rte_eth_dev_get_supported_ptypes=uninstrumented
+fun:rte_eth_dev_get_vlan_offload=uninstrumented
+fun:rte_eth_dev_info_get=uninstrumented
+fun:rte_eth_dev_is_removed=uninstrumented
+fun:rte_eth_dev_is_valid_port=uninstrumented
+fun:rte_eth_dev_l2_tunnel_eth_type_conf=uninstrumented
+fun:rte_eth_dev_l2_tunnel_offload_set=uninstrumented
+fun:rte_eth_dev_mac_addr_add=uninstrumented
+fun:rte_eth_dev_mac_addr_remove=uninstrumented
+fun:rte_eth_dev_owner_delete=uninstrumented
+fun:rte_eth_dev_owner_get=uninstrumented
+fun:rte_eth_dev_owner_new=uninstrumented
+fun:rte_eth_dev_owner_set=uninstrumented
+fun:rte_eth_dev_owner_unset=uninstrumented
+fun:rte_eth_dev_pool_ops_supported=uninstrumented
+fun:rte_eth_dev_priority_flow_ctrl_set=uninstrumented
+fun:rte_eth_dev_probing_finish=uninstrumented
+fun:rte_eth_dev_release_port=uninstrumented
+fun:rte_eth_dev_reset=uninstrumented
+fun:rte_eth_dev_rss_hash_conf_get=uninstrumented
+fun:rte_eth_dev_rss_hash_update=uninstrumented
+fun:rte_eth_dev_rss_reta_query=uninstrumented
+fun:rte_eth_dev_rss_reta_update=uninstrumented
+fun:rte_eth_dev_rx_intr_ctl=uninstrumented
+fun:rte_eth_dev_rx_intr_ctl_q=uninstrumented
+fun:rte_eth_dev_rx_intr_ctl_q_get_fd=uninstrumented
+fun:rte_eth_dev_rx_intr_disable=uninstrumented
+fun:rte_eth_dev_rx_intr_enable=uninstrumented
+fun:rte_eth_dev_rx_offload_name=uninstrumented
+fun:rte_eth_dev_rx_queue_start=uninstrumented
+fun:rte_eth_dev_rx_queue_stop=uninstrumented
+fun:rte_eth_dev_set_eeprom=uninstrumented
+fun:rte_eth_dev_set_link_down=uninstrumented
+fun:rte_eth_dev_set_link_up=uninstrumented
+fun:rte_eth_dev_set_mc_addr_list=uninstrumented
+fun:rte_eth_dev_set_mtu=uninstrumented
+fun:rte_eth_dev_set_rx_queue_stats_mapping=uninstrumented
+fun:rte_eth_dev_set_tx_queue_stats_mapping=uninstrumented
+fun:rte_eth_dev_set_vlan_ether_type=uninstrumented
+fun:rte_eth_dev_set_vlan_offload=uninstrumented
+fun:rte_eth_dev_set_vlan_pvid=uninstrumented
+fun:rte_eth_dev_set_vlan_strip_on_queue=uninstrumented
+fun:rte_eth_dev_socket_id=uninstrumented
+fun:rte_eth_dev_start=uninstrumented
+fun:rte_eth_dev_stop=uninstrumented
+fun:rte_eth_dev_tx_offload_name=uninstrumented
+fun:rte_eth_dev_tx_queue_start=uninstrumented
+fun:rte_eth_dev_tx_queue_stop=uninstrumented
+fun:rte_eth_dev_uc_all_hash_table_set=uninstrumented
+fun:rte_eth_dev_uc_hash_table_set=uninstrumented
+fun:rte_eth_dev_udp_tunnel_port_add=uninstrumented
+fun:rte_eth_dev_udp_tunnel_port_delete=uninstrumented
+fun:rte_eth_dev_vlan_filter=uninstrumented
+fun:rte_eth_devargs_parse=uninstrumented
+fun:rte_eth_dma_zone_reserve=uninstrumented
+fun:rte_eth_find_next=uninstrumented
+fun:rte_eth_find_next_owned_by=uninstrumented
+fun:rte_eth_iterator_cleanup=uninstrumented
+fun:rte_eth_iterator_init=uninstrumented
+fun:rte_eth_iterator_next=uninstrumented
+fun:rte_eth_led_off=uninstrumented
+fun:rte_eth_led_on=uninstrumented
+fun:rte_eth_link_get=uninstrumented
+fun:rte_eth_link_get_nowait=uninstrumented
+fun:rte_eth_macaddr_get=uninstrumented
+fun:rte_eth_mirror_rule_reset=uninstrumented
+fun:rte_eth_mirror_rule_set=uninstrumented
+fun:rte_eth_promiscuous_disable=uninstrumented
+fun:rte_eth_promiscuous_enable=uninstrumented
+fun:rte_eth_promiscuous_get=uninstrumented
+fun:rte_eth_remove_rx_callback=uninstrumented
+fun:rte_eth_remove_tx_callback=uninstrumented
+fun:rte_eth_rx_queue_info_get=uninstrumented
+fun:rte_eth_rx_queue_setup=uninstrumented
+fun:rte_eth_set_queue_rate_limit=uninstrumented
+fun:rte_eth_speed_bitflag=uninstrumented
+fun:rte_eth_stats_get=uninstrumented
+fun:rte_eth_stats_reset=uninstrumented
+fun:rte_eth_switch_domain_alloc=uninstrumented
+fun:rte_eth_switch_domain_free=uninstrumented
+fun:rte_eth_timesync_adjust_time=uninstrumented
+fun:rte_eth_timesync_disable=uninstrumented
+fun:rte_eth_timesync_enable=uninstrumented
+fun:rte_eth_timesync_read_rx_timestamp=uninstrumented
+fun:rte_eth_timesync_read_time=uninstrumented
+fun:rte_eth_timesync_read_tx_timestamp=uninstrumented
+fun:rte_eth_timesync_write_time=uninstrumented
+fun:rte_eth_tx_buffer_count_callback=uninstrumented
+fun:rte_eth_tx_buffer_drop_callback=uninstrumented
+fun:rte_eth_tx_buffer_init=uninstrumented
+fun:rte_eth_tx_buffer_set_err_callback=uninstrumented
+fun:rte_eth_tx_done_cleanup=uninstrumented
+fun:rte_eth_tx_queue_info_get=uninstrumented
+fun:rte_eth_tx_queue_setup=uninstrumented
+fun:rte_eth_xstats_get=uninstrumented
+fun:rte_eth_xstats_get_by_id=uninstrumented
+fun:rte_eth_xstats_get_id_by_name=uninstrumented
+fun:rte_eth_xstats_get_names=uninstrumented
+fun:rte_eth_xstats_get_names_by_id=uninstrumented
+fun:rte_eth_xstats_reset=uninstrumented
+fun:rte_flow_conv=uninstrumented
+fun:rte_flow_copy=uninstrumented
+fun:rte_flow_create=uninstrumented
+fun:rte_flow_destroy=uninstrumented
+fun:rte_flow_error_set=uninstrumented
+fun:rte_flow_expand_rss=uninstrumented
+fun:rte_flow_flush=uninstrumented
+fun:rte_flow_isolate=uninstrumented
+fun:rte_flow_ops_get=uninstrumented
+fun:rte_flow_query=uninstrumented
+fun:rte_flow_validate=uninstrumented
+fun:rte_mtr_capabilities_get=uninstrumented
+fun:rte_mtr_create=uninstrumented
+fun:rte_mtr_destroy=uninstrumented
+fun:rte_mtr_meter_disable=uninstrumented
+fun:rte_mtr_meter_dscp_table_update=uninstrumented
+fun:rte_mtr_meter_enable=uninstrumented
+fun:rte_mtr_meter_profile_add=uninstrumented
+fun:rte_mtr_meter_profile_delete=uninstrumented
+fun:rte_mtr_meter_profile_update=uninstrumented
+fun:rte_mtr_ops_get=uninstrumented
+fun:rte_mtr_policer_actions_update=uninstrumented
+fun:rte_mtr_stats_read=uninstrumented
+fun:rte_mtr_stats_update=uninstrumented
+fun:rte_tm_capabilities_get=uninstrumented
+fun:rte_tm_get_number_of_leaf_nodes=uninstrumented
+fun:rte_tm_hierarchy_commit=uninstrumented
+fun:rte_tm_level_capabilities_get=uninstrumented
+fun:rte_tm_mark_ip_dscp=uninstrumented
+fun:rte_tm_mark_ip_ecn=uninstrumented
+fun:rte_tm_mark_vlan_dei=uninstrumented
+fun:rte_tm_node_add=uninstrumented
+fun:rte_tm_node_capabilities_get=uninstrumented
+fun:rte_tm_node_cman_update=uninstrumented
+fun:rte_tm_node_delete=uninstrumented
+fun:rte_tm_node_parent_update=uninstrumented
+fun:rte_tm_node_resume=uninstrumented
+fun:rte_tm_node_shaper_update=uninstrumented
+fun:rte_tm_node_shared_shaper_update=uninstrumented
+fun:rte_tm_node_shared_wred_context_update=uninstrumented
+fun:rte_tm_node_stats_read=uninstrumented
+fun:rte_tm_node_stats_update=uninstrumented
+fun:rte_tm_node_suspend=uninstrumented
+fun:rte_tm_node_type_get=uninstrumented
+fun:rte_tm_node_wfq_weight_mode_update=uninstrumented
+fun:rte_tm_node_wred_context_update=uninstrumented
+fun:rte_tm_ops_get=uninstrumented
+fun:rte_tm_shaper_profile_add=uninstrumented
+fun:rte_tm_shaper_profile_delete=uninstrumented
+fun:rte_tm_shared_shaper_add_update=uninstrumented
+fun:rte_tm_shared_shaper_delete=uninstrumented
+fun:rte_tm_shared_wred_context_add_update=uninstrumented
+fun:rte_tm_shared_wred_context_delete=uninstrumented
+fun:rte_tm_wred_profile_add=uninstrumented
+fun:rte_tm_wred_profile_delete=uninstrumented
+fun:gro_tcp4_reassemble=uninstrumented
+fun:gro_tcp4_tbl_create=uninstrumented
+fun:gro_tcp4_tbl_destroy=uninstrumented
+fun:gro_tcp4_tbl_pkt_count=uninstrumented
+fun:gro_tcp4_tbl_timeout_flush=uninstrumented
+fun:gro_vxlan_tcp4_reassemble=uninstrumented
+fun:gro_vxlan_tcp4_tbl_create=uninstrumented
+fun:gro_vxlan_tcp4_tbl_destroy=uninstrumented
+fun:gro_vxlan_tcp4_tbl_pkt_count=uninstrumented
+fun:gro_vxlan_tcp4_tbl_timeout_flush=uninstrumented
+fun:rte_gro_ctx_create=uninstrumented
+fun:rte_gro_ctx_destroy=uninstrumented
+fun:rte_gro_get_pkt_count=uninstrumented
+fun:rte_gro_reassemble=uninstrumented
+fun:rte_gro_reassemble_burst=uninstrumented
+fun:rte_gro_timeout_flush=uninstrumented
+fun:rte_hash_add_key=uninstrumented
+fun:rte_hash_add_key_data=uninstrumented
+fun:rte_hash_add_key_with_hash=uninstrumented
+fun:rte_hash_add_key_with_hash_data=uninstrumented
+fun:rte_hash_count=uninstrumented
+fun:rte_hash_create=uninstrumented
+fun:rte_hash_del_key=uninstrumented
+fun:rte_hash_del_key_with_hash=uninstrumented
+fun:rte_hash_find_existing=uninstrumented
+fun:rte_hash_free=uninstrumented
+fun:rte_hash_free_key_with_position=uninstrumented
+fun:rte_hash_get_key_with_position=uninstrumented
+fun:rte_hash_hash=uninstrumented
+fun:rte_hash_iterate=uninstrumented
+fun:rte_hash_lookup=uninstrumented
+fun:rte_hash_lookup_bulk=uninstrumented
+fun:rte_hash_lookup_bulk_data=uninstrumented
+fun:rte_hash_lookup_data=uninstrumented
+fun:rte_hash_lookup_with_hash=uninstrumented
+fun:rte_hash_lookup_with_hash_data=uninstrumented
+fun:rte_hash_reset=uninstrumented
+fun:rte_hash_set_cmp_func=uninstrumented
+fun:rte_fbk_hash_create=uninstrumented
+fun:rte_fbk_hash_find_existing=uninstrumented
+fun:rte_fbk_hash_free=uninstrumented
+fun:ip_frag_find=uninstrumented
+fun:ip_frag_lookup=uninstrumented
+fun:ip_frag_process=uninstrumented
+fun:rte_frag_table_del_expired_entries=uninstrumented
+fun:rte_ip_frag_free_death_row=uninstrumented
+fun:rte_ip_frag_table_create=uninstrumented
+fun:rte_ip_frag_table_destroy=uninstrumented
+fun:rte_ip_frag_table_statistics_dump=uninstrumented
+fun:rte_ipv4_fragment_packet=uninstrumented
+fun:ipv4_frag_reassemble=uninstrumented
+fun:rte_ipv4_frag_reassemble_packet=uninstrumented
+fun:rte_ipv6_fragment_packet=uninstrumented
+fun:ipv6_frag_reassemble=uninstrumented
+fun:rte_ipv6_frag_reassemble_packet=uninstrumented
+fun:rte_kvargs_count=uninstrumented
+fun:rte_kvargs_free=uninstrumented
+fun:rte_kvargs_parse=uninstrumented
+fun:rte_kvargs_parse_delim=uninstrumented
+fun:rte_kvargs_process=uninstrumented
+fun:rte_kvargs_strcmp=uninstrumented
+fun:__rte_pktmbuf_read=uninstrumented
+fun:rte_get_rx_ol_flag_list=uninstrumented
+fun:rte_get_rx_ol_flag_name=uninstrumented
+fun:rte_get_tx_ol_flag_list=uninstrumented
+fun:rte_get_tx_ol_flag_name=uninstrumented
+fun:rte_mbuf_sanity_check=uninstrumented
+fun:rte_pktmbuf_dump=uninstrumented
+fun:rte_pktmbuf_dynamic_pool_create=uninstrumented
+fun:rte_pktmbuf_init=uninstrumented
+fun:rte_pktmbuf_pool_create=uninstrumented
+fun:rte_pktmbuf_pool_create_by_ops=uninstrumented
+fun:rte_pktmbuf_pool_init=uninstrumented
+fun:rte_mbuf_best_mempool_ops=uninstrumented
+fun:rte_mbuf_platform_mempool_ops=uninstrumented
+fun:rte_mbuf_set_platform_mempool_ops=uninstrumented
+fun:rte_mbuf_set_user_mempool_ops=uninstrumented
+fun:rte_mbuf_user_mempool_ops=uninstrumented
+fun:rte_get_ptype_inner_l2_name=uninstrumented
+fun:rte_get_ptype_inner_l3_name=uninstrumented
+fun:rte_get_ptype_inner_l4_name=uninstrumented
+fun:rte_get_ptype_l2_name=uninstrumented
+fun:rte_get_ptype_l3_name=uninstrumented
+fun:rte_get_ptype_l4_name=uninstrumented
+fun:rte_get_ptype_name=uninstrumented
+fun:rte_get_ptype_tunnel_name=uninstrumented
+fun:rte_mempool_audit=uninstrumented
+fun:rte_mempool_avail_count=uninstrumented
+fun:rte_mempool_cache_create=uninstrumented
+fun:rte_mempool_cache_free=uninstrumented
+fun:rte_mempool_calc_obj_size=uninstrumented
+fun:rte_mempool_check_cookies=uninstrumented
+fun:rte_mempool_contig_blocks_check_cookies=uninstrumented
+fun:rte_mempool_create=uninstrumented
+fun:rte_mempool_create_empty=uninstrumented
+fun:rte_mempool_dump=uninstrumented
+fun:rte_mempool_free=uninstrumented
+fun:rte_mempool_in_use_count=uninstrumented
+fun:rte_mempool_list_dump=uninstrumented
+fun:rte_mempool_lookup=uninstrumented
+fun:rte_mempool_mem_iter=uninstrumented
+fun:rte_mempool_obj_iter=uninstrumented
+fun:rte_mempool_populate_anon=uninstrumented
+fun:rte_mempool_populate_default=uninstrumented
+fun:rte_mempool_populate_iova=uninstrumented
+fun:rte_mempool_populate_virt=uninstrumented
+fun:rte_mempool_walk=uninstrumented
+fun:rte_mempool_ops_alloc=uninstrumented
+fun:rte_mempool_ops_calc_mem_size=uninstrumented
+fun:rte_mempool_ops_free=uninstrumented
+fun:rte_mempool_ops_get_count=uninstrumented
+fun:rte_mempool_ops_get_info=uninstrumented
+fun:rte_mempool_ops_populate=uninstrumented
+fun:rte_mempool_register_ops=uninstrumented
+fun:rte_mempool_set_ops_byname=uninstrumented
+fun:rte_mempool_op_calc_mem_size_default=uninstrumented
+fun:rte_mempool_op_populate_default=uninstrumented
+fun:rte_metrics_get_names=uninstrumented
+fun:rte_metrics_get_values=uninstrumented
+fun:rte_metrics_init=uninstrumented
+fun:rte_metrics_reg_name=uninstrumented
+fun:rte_metrics_reg_names=uninstrumented
+fun:rte_metrics_update_value=uninstrumented
+fun:rte_metrics_update_values=uninstrumented
+fun:rte_net_make_rarp_packet=uninstrumented
+fun:rte_net_get_ptype=uninstrumented
+fun:rte_net_skip_ip6_ext=uninstrumented
+fun:rte_net_crc_calc=uninstrumented
+fun:rte_net_crc_set_alg=uninstrumented
+fun:eal_parse_pci_BDF=uninstrumented
+fun:eal_parse_pci_DomBDF=uninstrumented
+fun:pci_map_resource=uninstrumented
+fun:pci_unmap_resource=uninstrumented
+fun:rte_eal_compare_pci_addr=uninstrumented
+fun:rte_pci_addr_cmp=uninstrumented
+fun:rte_pci_addr_parse=uninstrumented
+fun:rte_pci_device_name=uninstrumented
+fun:rte_eth_from_ring=uninstrumented
+fun:rte_eth_from_rings=uninstrumented
+fun:sock_support_features=uninstrumented
+fun:vhost_kernel_open_sock=uninstrumented
+fun:vhost_kernel_set_sock=uninstrumented
+fun:tap_support_features=uninstrumented
+fun:vhost_kernel_open_tap=uninstrumented
+fun:eth_virtio_dev_init=uninstrumented
+fun:virtio_dev_pause=uninstrumented
+fun:virtio_dev_resume=uninstrumented
+fun:virtio_inject_pkts=uninstrumented
+fun:virtio_interrupt_handler=uninstrumented
+fun:vtpci_get_status=uninstrumented
+fun:vtpci_init=uninstrumented
+fun:vtpci_isr=uninstrumented
+fun:vtpci_msix_detect=uninstrumented
+fun:vtpci_negotiate_features=uninstrumented
+fun:vtpci_read_dev_config=uninstrumented
+fun:vtpci_reinit_complete=uninstrumented
+fun:vtpci_reset=uninstrumented
+fun:vtpci_set_status=uninstrumented
+fun:vtpci_write_dev_config=uninstrumented
+fun:virtio_dev_cq_start=uninstrumented
+fun:virtio_dev_rx_queue_done=uninstrumented
+fun:virtio_dev_rx_queue_setup=uninstrumented
+fun:virtio_dev_rx_queue_setup_finish=uninstrumented
+fun:virtio_dev_tx_queue_setup=uninstrumented
+fun:virtio_dev_tx_queue_setup_finish=uninstrumented
+fun:virtio_recv_mergeable_pkts=uninstrumented
+fun:virtio_recv_mergeable_pkts_inorder=uninstrumented
+fun:virtio_recv_pkts=uninstrumented
+fun:virtio_xmit_pkts=uninstrumented
+fun:virtio_xmit_pkts_inorder=uninstrumented
+fun:vq_ring_free_chain=uninstrumented
+fun:vq_ring_free_inorder=uninstrumented
+fun:virtio_rxq_vec_setup=uninstrumented
+fun:virtio_recv_pkts_vec=uninstrumented
+fun:is_vhost_user_by_type=uninstrumented
+fun:virtio_user_dev_init=uninstrumented
+fun:virtio_user_dev_uninit=uninstrumented
+fun:virtio_user_handle_cq=uninstrumented
+fun:virtio_user_handle_mq=uninstrumented
+fun:virtio_user_start_device=uninstrumented
+fun:virtio_user_stop_device=uninstrumented
+fun:virtqueue_detach_unused=uninstrumented
+fun:virtqueue_rxvq_flush=uninstrumented
+fun:rte_ring_create=uninstrumented
+fun:rte_ring_dump=uninstrumented
+fun:rte_ring_free=uninstrumented
+fun:rte_ring_get_memsize=uninstrumented
+fun:rte_ring_init=uninstrumented
+fun:rte_ring_list_dump=uninstrumented
+fun:rte_ring_lookup=uninstrumented
+fun:rte_timer_dump_stats=uninstrumented
+fun:rte_timer_init=uninstrumented
+fun:rte_timer_manage=uninstrumented
+fun:rte_timer_pending=uninstrumented
+fun:rte_timer_reset=uninstrumented
+fun:rte_timer_reset_sync=uninstrumented
+fun:rte_timer_stop=uninstrumented
+fun:rte_timer_stop_sync=uninstrumented
+fun:rte_timer_subsystem_init=uninstrumented
+fun:pci_find_max_end_va=discard
+fun:pci_parse_one_sysfs_resource=discard
+fun:pci_update_device=discard
+fun:rte_pci_get_iommu_class=discard
+fun:rte_pci_ioport_map=discard
+fun:rte_pci_ioport_read=discard
+fun:rte_pci_ioport_unmap=discard
+fun:rte_pci_ioport_write=discard
+fun:rte_pci_map_device=discard
+fun:rte_pci_read_config=discard
+fun:rte_pci_scan=discard
+fun:rte_pci_unmap_device=discard
+fun:rte_pci_write_config=discard
+fun:pci_name_set=discard
+fun:rte_pci_add_device=discard
+fun:rte_pci_dump=discard
+fun:rte_pci_get_sysfs_path=discard
+fun:rte_pci_insert_device=discard
+fun:rte_pci_match=discard
+fun:rte_pci_probe=discard
+fun:rte_pci_register=discard
+fun:rte_pci_unregister=discard
+fun:pci_uio_map_resource=discard
+fun:pci_uio_remap_resource=discard
+fun:pci_uio_unmap_resource=discard
+fun:rte_pci_dev_iterate=discard
+fun:pci_uio_alloc_resource=discard
+fun:pci_uio_free_resource=discard
+fun:pci_uio_ioport_map=discard
+fun:pci_uio_ioport_read=discard
+fun:pci_uio_ioport_unmap=discard
+fun:pci_uio_ioport_write=discard
+fun:pci_uio_map_resource_by_index=discard
+fun:pci_uio_read_config=discard
+fun:pci_uio_write_config=discard
+fun:pci_vfio_ioport_map=discard
+fun:pci_vfio_ioport_read=discard
+fun:pci_vfio_ioport_unmap=discard
+fun:pci_vfio_ioport_write=discard
+fun:pci_vfio_is_enabled=discard
+fun:pci_vfio_map_resource=discard
+fun:pci_vfio_read_config=discard
+fun:pci_vfio_unmap_resource=discard
+fun:pci_vfio_write_config=discard
+fun:rte_vdev_add_custom_scan=discard
+fun:rte_vdev_find_device=discard
+fun:rte_vdev_init=discard
+fun:rte_vdev_register=discard
+fun:rte_vdev_remove_custom_scan=discard
+fun:rte_vdev_uninit=discard
+fun:rte_vdev_unregister=discard
+fun:rte_vdev_dev_iterate=discard
+fun:cmdline_free=discard
+fun:cmdline_in=discard
+fun:cmdline_interact=discard
+fun:cmdline_new=discard
+fun:cmdline_poll=discard
+fun:cmdline_printf=discard
+fun:cmdline_quit=discard
+fun:cmdline_set_prompt=discard
+fun:cmdline_write_char=discard
+fun:cirbuf_add_buf_head=discard
+fun:cirbuf_add_buf_tail=discard
+fun:cirbuf_add_head=discard
+fun:cirbuf_add_head_safe=discard
+fun:cirbuf_add_tail=discard
+fun:cirbuf_add_tail_safe=discard
+fun:cirbuf_align_left=discard
+fun:cirbuf_align_right=discard
+fun:cirbuf_del_buf_head=discard
+fun:cirbuf_del_buf_tail=discard
+fun:cirbuf_del_head=discard
+fun:cirbuf_del_head_safe=discard
+fun:cirbuf_del_tail=discard
+fun:cirbuf_del_tail_safe=discard
+fun:cirbuf_get_buf_head=discard
+fun:cirbuf_get_buf_tail=discard
+fun:cirbuf_get_head=discard
+fun:cirbuf_get_tail=discard
+fun:cirbuf_init=discard
+fun:cmdline_complete=discard
+fun:cmdline_isendofcommand=discard
+fun:cmdline_isendoftoken=discard
+fun:cmdline_parse=discard
+fun:cmdline_get_help_etheraddr=discard
+fun:cmdline_parse_etheraddr=discard
+fun:cmdline_get_help_ipaddr=discard
+fun:cmdline_parse_ipaddr=discard
+fun:cmdline_get_help_num=discard
+fun:cmdline_parse_num=discard
+fun:cmdline_get_help_portlist=discard
+fun:cmdline_parse_portlist=discard
+fun:cmdline_complete_get_elt_string=discard
+fun:cmdline_complete_get_nb_string=discard
+fun:cmdline_get_help_string=discard
+fun:cmdline_parse_string=discard
+fun:rdline_add_history=discard
+fun:rdline_char_in=discard
+fun:rdline_clear_history=discard
+fun:rdline_get_buffer=discard
+fun:rdline_get_history_item=discard
+fun:rdline_init=discard
+fun:rdline_newline=discard
+fun:rdline_quit=discard
+fun:rdline_redisplay=discard
+fun:rdline_reset=discard
+fun:rdline_restart=discard
+fun:rdline_stop=discard
+fun:cmdline_file_new=discard
+fun:cmdline_stdin_exit=discard
+fun:cmdline_stdin_new=discard
+fun:vt100_init=discard
+fun:vt100_parser=discard
+fun:eal_create_runtime_dir=discard
+fun:eal_parse_sysfs_value=discard
+fun:eal_proc_type_detect=discard
+fun:rte_eal_check_module=discard
+fun:rte_eal_cleanup=discard
+fun:rte_eal_create_uio_dev=discard
+fun:rte_eal_get_configuration=discard
+fun:rte_eal_get_runtime_dir=discard
+fun:rte_eal_has_hugepages=discard
+fun:rte_eal_has_pci=discard
+fun:rte_eal_init=discard
+fun:rte_eal_iopl_init=discard
+fun:rte_eal_iova_mode=discard
+fun:rte_eal_lcore_role=discard
+fun:rte_eal_mbuf_user_pool_ops=discard
+fun:rte_eal_process_type=discard
+fun:rte_eal_vfio_intr_mode=discard
+fun:rte_set_application_usage_hook=discard
+fun:rte_eal_alarm_cancel=discard
+fun:rte_eal_alarm_init=discard
+fun:rte_eal_alarm_set=discard
+fun:rte_bus_dump=discard
+fun:rte_bus_find=discard
+fun:rte_bus_find_by_device=discard
+fun:rte_bus_find_by_device_name=discard
+fun:rte_bus_find_by_name=discard
+fun:rte_bus_get_iommu_class=discard
+fun:rte_bus_probe=discard
+fun:rte_bus_register=discard
+fun:rte_bus_scan=discard
+fun:rte_bus_sigbus_handler=discard
+fun:rte_bus_unregister=discard
+fun:rte_class_find=discard
+fun:rte_class_find_by_name=discard
+fun:rte_class_register=discard
+fun:rte_class_unregister=discard
+fun:rte_cpu_check_supported=discard
+fun:rte_cpu_is_supported=discard
+fun:local_dev_probe=discard
+fun:local_dev_remove=discard
+fun:rte_dev_event_callback_process=discard
+fun:rte_dev_event_callback_register=discard
+fun:rte_dev_event_callback_unregister=discard
+fun:rte_dev_is_probed=discard
+fun:rte_dev_iterator_init=discard
+fun:rte_dev_iterator_next=discard
+fun:rte_dev_probe=discard
+fun:rte_dev_remove=discard
+fun:rte_eal_hotplug_add=discard
+fun:rte_eal_hotplug_remove=discard
+fun:rte_devargs_add=discard
+fun:rte_devargs_dump=discard
+fun:rte_devargs_insert=discard
+fun:rte_devargs_layers_parse=discard
+fun:rte_devargs_next=discard
+fun:rte_devargs_parse=discard
+fun:rte_devargs_parsef=discard
+fun:rte_devargs_remove=discard
+fun:rte_devargs_type_count=discard
+fun:rte_strerror=discard
+fun:rte_fbarray_attach=discard
+fun:rte_fbarray_destroy=discard
+fun:rte_fbarray_detach=discard
+fun:rte_fbarray_dump_metadata=discard
+fun:rte_fbarray_find_contig_free=discard
+fun:rte_fbarray_find_contig_used=discard
+fun:rte_fbarray_find_idx=discard
+fun:rte_fbarray_find_next_free=discard
+fun:rte_fbarray_find_next_n_free=discard
+fun:rte_fbarray_find_next_n_used=discard
+fun:rte_fbarray_find_next_used=discard
+fun:rte_fbarray_find_prev_free=discard
+fun:rte_fbarray_find_prev_n_free=discard
+fun:rte_fbarray_find_prev_n_used=discard
+fun:rte_fbarray_find_prev_used=discard
+fun:rte_fbarray_find_rev_contig_free=discard
+fun:rte_fbarray_find_rev_contig_used=discard
+fun:rte_fbarray_get=discard
+fun:rte_fbarray_init=discard
+fun:rte_fbarray_is_used=discard
+fun:rte_fbarray_set_free=discard
+fun:rte_fbarray_set_used=discard
+fun:rte_hexdump=discard
+fun:rte_memdump=discard
+fun:rte_hypervisor_get_name=discard
+fun:rte_eal_get_lcore_state=discard
+fun:rte_eal_mp_remote_launch=discard
+fun:rte_eal_mp_wait_lcore=discard
+fun:rte_eal_wait_lcore=discard
+fun:rte_eal_cpu_init=discard
+fun:rte_socket_count=discard
+fun:rte_socket_id_by_idx=discard
+fun:eal_log_set_default=discard
+fun:rte_log=discard
+fun:rte_log_cur_msg_loglevel=discard
+fun:rte_log_cur_msg_logtype=discard
+fun:rte_log_dump=discard
+fun:rte_log_get_global_level=discard
+fun:rte_log_get_level=discard
+fun:rte_log_register=discard
+fun:rte_log_register_type_and_pick_level=discard
+fun:rte_log_save_pattern=discard
+fun:rte_log_save_regexp=discard
+fun:rte_log_set_global_level=discard
+fun:rte_log_set_level=discard
+fun:rte_log_set_level_pattern=discard
+fun:rte_log_set_level_regexp=discard
+fun:rte_openlog_stream=discard
+fun:rte_vlog=discard
+fun:eal_memalloc_is_contig=discard
+fun:eal_memalloc_mem_alloc_validate=discard
+fun:eal_memalloc_mem_alloc_validator_register=discard
+fun:eal_memalloc_mem_alloc_validator_unregister=discard
+fun:eal_memalloc_mem_event_callback_register=discard
+fun:eal_memalloc_mem_event_callback_unregister=discard
+fun:eal_memalloc_mem_event_notify=discard
+fun:eal_get_virtual_area=discard
+fun:rte_dump_physmem_layout=discard
+fun:rte_eal_get_physmem_size=discard
+fun:rte_eal_memory_init=discard
+fun:rte_mem_alloc_validator_register=discard
+fun:rte_mem_alloc_validator_unregister=discard
+fun:rte_mem_check_dma_mask=discard
+fun:rte_mem_check_dma_mask_thread_unsafe=discard
+fun:rte_mem_event_callback_register=discard
+fun:rte_mem_event_callback_unregister=discard
+fun:rte_mem_iova2virt=discard
+fun:rte_mem_lock_page=discard
+fun:rte_mem_set_dma_mask=discard
+fun:rte_mem_virt2memseg=discard
+fun:rte_mem_virt2memseg_list=discard
+fun:rte_memory_get_nchannel=discard
+fun:rte_memory_get_nrank=discard
+fun:rte_memseg_contig_walk=discard
+fun:rte_memseg_contig_walk_thread_unsafe=discard
+fun:rte_memseg_get_fd=discard
+fun:rte_memseg_get_fd_offset=discard
+fun:rte_memseg_get_fd_offset_thread_unsafe=discard
+fun:rte_memseg_get_fd_thread_unsafe=discard
+fun:rte_memseg_list_walk=discard
+fun:rte_memseg_list_walk_thread_unsafe=discard
+fun:rte_memseg_walk=discard
+fun:rte_memseg_walk_thread_unsafe=discard
+fun:rte_eal_memzone_init=discard
+fun:rte_memzone_dump=discard
+fun:rte_memzone_free=discard
+fun:rte_memzone_lookup=discard
+fun:rte_memzone_reserve=discard
+fun:rte_memzone_reserve_aligned=discard
+fun:rte_memzone_reserve_bounded=discard
+fun:rte_memzone_walk=discard
+fun:eal_adjust_config=discard
+fun:eal_check_common_options=discard
+fun:eal_common_usage=discard
+fun:eal_option_device_parse=discard
+fun:eal_parse_common_option=discard
+fun:eal_plugins_init=discard
+fun:eal_reset_internal_config=discard
+fun:rte_eal_primary_proc_alive=discard
+fun:rte_mp_action_register=discard
+fun:rte_mp_action_unregister=discard
+fun:rte_mp_channel_init=discard
+fun:rte_mp_reply=discard
+fun:rte_mp_request_async=discard
+fun:rte_mp_request_sync=discard
+fun:rte_mp_sendmsg=discard
+fun:rte_strscpy=discard
+fun:rte_strsplit=discard
+fun:rte_dump_tailq=discard
+fun:rte_eal_tailq_lookup=discard
+fun:rte_eal_tailq_register=discard
+fun:rte_eal_tailqs_init=discard
+fun:eal_cpuset_socket_id=discard
+fun:eal_thread_dump_affinity=discard
+fun:rte_ctrl_thread_create=discard
+fun:rte_lcore_has_role=discard
+fun:rte_socket_id=discard
+fun:rte_thread_get_affinity=discard
+fun:rte_thread_set_affinity=discard
+fun:rte_delay_us_block=discard
+fun:rte_delay_us_callback_register=discard
+fun:rte_delay_us_sleep=discard
+fun:rte_get_tsc_hz=discard
+fun:set_tsc_freq=discard
+fun:rte_uuid_compare=discard
+fun:rte_uuid_is_null=discard
+fun:rte_uuid_parse=discard
+fun:rte_uuid_unparse=discard
+fun:rte_cpu_getauxval=discard
+fun:rte_cpu_strcmp_auxval=discard
+fun:__rte_panic=discard
+fun:rte_dump_registers=discard
+fun:rte_dump_stack=discard
+fun:rte_exit=discard
+fun:dev_sigbus_handler_register=discard
+fun:dev_sigbus_handler_unregister=discard
+fun:rte_dev_event_monitor_start=discard
+fun:rte_dev_event_monitor_stop=discard
+fun:rte_dev_hotplug_handle_disable=discard
+fun:rte_dev_hotplug_handle_enable=discard
+fun:eal_hugepage_info_init=discard
+fun:eal_hugepage_info_read=discard
+fun:rte_eal_intr_init=discard
+fun:rte_epoll_ctl=discard
+fun:rte_epoll_wait=discard
+fun:rte_intr_allow_others=discard
+fun:rte_intr_callback_register=discard
+fun:rte_intr_callback_unregister=discard
+fun:rte_intr_cap_multiple=discard
+fun:rte_intr_disable=discard
+fun:rte_intr_dp_is_en=discard
+fun:rte_intr_efd_disable=discard
+fun:rte_intr_efd_enable=discard
+fun:rte_intr_enable=discard
+fun:rte_intr_free_epoll_fd=discard
+fun:rte_intr_rx_ctl=discard
+fun:rte_intr_tls_epfd=discard
+fun:eal_cpu_core_id=discard
+fun:eal_cpu_detected=discard
+fun:eal_cpu_socket_id=discard
+fun:rte_eal_log_init=discard
+fun:eal_memalloc_alloc_seg=discard
+fun:eal_memalloc_alloc_seg_bulk=discard
+fun:eal_memalloc_free_seg=discard
+fun:eal_memalloc_free_seg_bulk=discard
+fun:eal_memalloc_get_seg_fd=discard
+fun:eal_memalloc_get_seg_fd_offset=discard
+fun:eal_memalloc_init=discard
+fun:eal_memalloc_set_seg_fd=discard
+fun:eal_memalloc_sync_with_primary=discard
+fun:rte_eal_hugepage_attach=discard
+fun:rte_eal_hugepage_init=discard
+fun:rte_eal_memseg_init=discard
+fun:rte_eal_using_phys_addrs=discard
+fun:rte_mem_virt2iova=discard
+fun:rte_mem_virt2phy=discard
+fun:eal_thread_init_master=discard
+fun:eal_thread_loop=discard
+fun:rte_eal_remote_launch=discard
+fun:rte_sys_gettid=discard
+fun:rte_thread_setname=discard
+fun:get_tsc_freq=discard
+fun:rte_eal_hpet_init=discard
+fun:rte_eal_timer_init=discard
+fun:rte_get_hpet_cycles=discard
+fun:rte_get_hpet_hz=discard
+fun:rte_vfio_clear_group=discard
+fun:rte_vfio_container_create=discard
+fun:rte_vfio_container_destroy=discard
+fun:rte_vfio_container_dma_map=discard
+fun:rte_vfio_container_dma_unmap=discard
+fun:rte_vfio_container_group_bind=discard
+fun:rte_vfio_container_group_unbind=discard
+fun:rte_vfio_dma_map=discard
+fun:rte_vfio_dma_unmap=discard
+fun:rte_vfio_enable=discard
+fun:rte_vfio_get_container_fd=discard
+fun:rte_vfio_get_group_fd=discard
+fun:rte_vfio_get_group_num=discard
+fun:rte_vfio_is_enabled=discard
+fun:rte_vfio_noiommu_is_enabled=discard
+fun:rte_vfio_release_device=discard
+fun:rte_vfio_setup_device=discard
+fun:vfio_get_default_container_fd=discard
+fun:vfio_has_supported_extensions=discard
+fun:vfio_set_iommu_type=discard
+fun:vfio_mp_sync_setup=discard
+fun:eal_dev_hotplug_request_to_primary=discard
+fun:eal_dev_hotplug_request_to_secondary=discard
+fun:rte_mp_dev_hotplug_init=discard
+fun:malloc_elem_alloc=discard
+fun:malloc_elem_can_hold=discard
+fun:malloc_elem_dump=discard
+fun:malloc_elem_find_max_iova_contig=discard
+fun:malloc_elem_free=discard
+fun:malloc_elem_free_list_index=discard
+fun:malloc_elem_free_list_insert=discard
+fun:malloc_elem_free_list_remove=discard
+fun:malloc_elem_hide_region=discard
+fun:malloc_elem_init=discard
+fun:malloc_elem_insert=discard
+fun:malloc_elem_join_adjacent_free=discard
+fun:malloc_elem_resize=discard
+fun:alloc_pages_on_heap=discard
+fun:malloc_heap_add_external_memory=discard
+fun:malloc_heap_alloc=discard
+fun:malloc_heap_alloc_biggest=discard
+fun:malloc_heap_create=discard
+fun:malloc_heap_destroy=discard
+fun:malloc_heap_dump=discard
+fun:malloc_heap_free=discard
+fun:malloc_heap_free_pages=discard
+fun:malloc_heap_get_stats=discard
+fun:malloc_heap_remove_external_memory=discard
+fun:malloc_heap_resize=discard
+fun:malloc_socket_to_heap_id=discard
+fun:rollback_expand_heap=discard
+fun:rte_eal_malloc_heap_init=discard
+fun:register_mp_requests=discard
+fun:request_sync=discard
+fun:request_to_primary=discard
+fun:rte_cpu_get_flag_enabled=discard
+fun:rte_cpu_get_flag_name=discard
+fun:get_tsc_freq_arch=discard
+fun:rte_hypervisor_get=discard
+fun:rte_keepalive_create=discard
+fun:rte_keepalive_dispatch_pings=discard
+fun:rte_keepalive_mark_alive=discard
+fun:rte_keepalive_mark_sleep=discard
+fun:rte_keepalive_register_core=discard
+fun:rte_keepalive_register_relay_callback=discard
+fun:rte_calloc=discard
+fun:rte_calloc_socket=discard
+fun:rte_free=discard
+fun:rte_malloc=discard
+fun:rte_malloc_dump_heaps=discard
+fun:rte_malloc_dump_stats=discard
+fun:rte_malloc_get_socket_stats=discard
+fun:rte_malloc_heap_create=discard
+fun:rte_malloc_heap_destroy=discard
+fun:rte_malloc_heap_get_socket=discard
+fun:rte_malloc_heap_memory_add=discard
+fun:rte_malloc_heap_memory_attach=discard
+fun:rte_malloc_heap_memory_detach=discard
+fun:rte_malloc_heap_memory_remove=discard
+fun:rte_malloc_heap_socket_is_external=discard
+fun:rte_malloc_set_limit=discard
+fun:rte_malloc_socket=discard
+fun:rte_malloc_validate=discard
+fun:rte_malloc_virt2iova=discard
+fun:rte_realloc=discard
+fun:rte_zmalloc=discard
+fun:rte_zmalloc_socket=discard
+fun:rte_option_init=discard
+fun:rte_option_parse=discard
+fun:rte_option_register=discard
+fun:rte_reciprocal_value=discard
+fun:rte_reciprocal_value_u64=discard
+fun:rte_service_attr_get=discard
+fun:rte_service_attr_reset_all=discard
+fun:rte_service_component_register=discard
+fun:rte_service_component_runstate_set=discard
+fun:rte_service_component_unregister=discard
+fun:rte_service_dump=discard
+fun:rte_service_finalize=discard
+fun:rte_service_get_by_name=discard
+fun:rte_service_get_count=discard
+fun:rte_service_get_name=discard
+fun:rte_service_init=discard
+fun:rte_service_lcore_add=discard
+fun:rte_service_lcore_attr_get=discard
+fun:rte_service_lcore_attr_reset_all=discard
+fun:rte_service_lcore_count=discard
+fun:rte_service_lcore_count_services=discard
+fun:rte_service_lcore_del=discard
+fun:rte_service_lcore_list=discard
+fun:rte_service_lcore_reset_all=discard
+fun:rte_service_lcore_start=discard
+fun:rte_service_lcore_stop=discard
+fun:rte_service_map_lcore_get=discard
+fun:rte_service_map_lcore_set=discard
+fun:rte_service_may_be_active=discard
+fun:rte_service_probe_capability=discard
+fun:rte_service_run_iter_on_app_lcore=discard
+fun:rte_service_runstate_get=discard
+fun:rte_service_runstate_set=discard
+fun:rte_service_set_runstate_mapped_check=discard
+fun:rte_service_set_stats_enable=discard
+fun:rte_service_start_with_defaults=discard
+fun:eth_dev_to_id=discard
+fun:eth_find_device=discard
+fun:rte_eth_devargs_parse_list=discard
+fun:rte_eth_devargs_parse_representor_ports=discard
+fun:__rte_eth_dev_profile_init=discard
+fun:_rte_eth_dev_callback_process=discard
+fun:_rte_eth_dev_reset=discard
+fun:rte_eth_add_first_rx_callback=discard
+fun:rte_eth_add_rx_callback=discard
+fun:rte_eth_add_tx_callback=discard
+fun:rte_eth_allmulticast_disable=discard
+fun:rte_eth_allmulticast_enable=discard
+fun:rte_eth_allmulticast_get=discard
+fun:rte_eth_dev_adjust_nb_rx_tx_desc=discard
+fun:rte_eth_dev_allocate=discard
+fun:rte_eth_dev_allocated=discard
+fun:rte_eth_dev_attach_secondary=discard
+fun:rte_eth_dev_callback_register=discard
+fun:rte_eth_dev_callback_unregister=discard
+fun:rte_eth_dev_close=discard
+fun:rte_eth_dev_configure=discard
+fun:rte_eth_dev_count=discard
+fun:rte_eth_dev_count_avail=discard
+fun:rte_eth_dev_count_total=discard
+fun:rte_eth_dev_create=discard
+fun:rte_eth_dev_default_mac_addr_set=discard
+fun:rte_eth_dev_destroy=discard
+fun:rte_eth_dev_filter_ctrl=discard
+fun:rte_eth_dev_filter_supported=discard
+fun:rte_eth_dev_flow_ctrl_get=discard
+fun:rte_eth_dev_flow_ctrl_set=discard
+fun:rte_eth_dev_fw_version_get=discard
+fun:rte_eth_dev_get_dcb_info=discard
+fun:rte_eth_dev_get_eeprom=discard
+fun:rte_eth_dev_get_eeprom_length=discard
+fun:rte_eth_dev_get_module_eeprom=discard
+fun:rte_eth_dev_get_module_info=discard
+fun:rte_eth_dev_get_mtu=discard
+fun:rte_eth_dev_get_name_by_port=discard
+fun:rte_eth_dev_get_port_by_name=discard
+fun:rte_eth_dev_get_reg_info=discard
+fun:rte_eth_dev_get_sec_ctx=discard
+fun:rte_eth_dev_get_supported_ptypes=discard
+fun:rte_eth_dev_get_vlan_offload=discard
+fun:rte_eth_dev_info_get=discard
+fun:rte_eth_dev_is_removed=discard
+fun:rte_eth_dev_is_valid_port=discard
+fun:rte_eth_dev_l2_tunnel_eth_type_conf=discard
+fun:rte_eth_dev_l2_tunnel_offload_set=discard
+fun:rte_eth_dev_mac_addr_add=discard
+fun:rte_eth_dev_mac_addr_remove=discard
+fun:rte_eth_dev_owner_delete=discard
+fun:rte_eth_dev_owner_get=discard
+fun:rte_eth_dev_owner_new=discard
+fun:rte_eth_dev_owner_set=discard
+fun:rte_eth_dev_owner_unset=discard
+fun:rte_eth_dev_pool_ops_supported=discard
+fun:rte_eth_dev_priority_flow_ctrl_set=discard
+fun:rte_eth_dev_probing_finish=discard
+fun:rte_eth_dev_release_port=discard
+fun:rte_eth_dev_reset=discard
+fun:rte_eth_dev_rss_hash_conf_get=discard
+fun:rte_eth_dev_rss_hash_update=discard
+fun:rte_eth_dev_rss_reta_query=discard
+fun:rte_eth_dev_rss_reta_update=discard
+fun:rte_eth_dev_rx_intr_ctl=discard
+fun:rte_eth_dev_rx_intr_ctl_q=discard
+fun:rte_eth_dev_rx_intr_ctl_q_get_fd=discard
+fun:rte_eth_dev_rx_intr_disable=discard
+fun:rte_eth_dev_rx_intr_enable=discard
+fun:rte_eth_dev_rx_offload_name=discard
+fun:rte_eth_dev_rx_queue_start=discard
+fun:rte_eth_dev_rx_queue_stop=discard
+fun:rte_eth_dev_set_eeprom=discard
+fun:rte_eth_dev_set_link_down=discard
+fun:rte_eth_dev_set_link_up=discard
+fun:rte_eth_dev_set_mc_addr_list=discard
+fun:rte_eth_dev_set_mtu=discard
+fun:rte_eth_dev_set_rx_queue_stats_mapping=discard
+fun:rte_eth_dev_set_tx_queue_stats_mapping=discard
+fun:rte_eth_dev_set_vlan_ether_type=discard
+fun:rte_eth_dev_set_vlan_offload=discard
+fun:rte_eth_dev_set_vlan_pvid=discard
+fun:rte_eth_dev_set_vlan_strip_on_queue=discard
+fun:rte_eth_dev_socket_id=discard
+fun:rte_eth_dev_start=discard
+fun:rte_eth_dev_stop=discard
+fun:rte_eth_dev_tx_offload_name=discard
+fun:rte_eth_dev_tx_queue_start=discard
+fun:rte_eth_dev_tx_queue_stop=discard
+fun:rte_eth_dev_uc_all_hash_table_set=discard
+fun:rte_eth_dev_uc_hash_table_set=discard
+fun:rte_eth_dev_udp_tunnel_port_add=discard
+fun:rte_eth_dev_udp_tunnel_port_delete=discard
+fun:rte_eth_dev_vlan_filter=discard
+fun:rte_eth_devargs_parse=discard
+fun:rte_eth_dma_zone_reserve=discard
+fun:rte_eth_find_next=discard
+fun:rte_eth_find_next_owned_by=discard
+fun:rte_eth_iterator_cleanup=discard
+fun:rte_eth_iterator_init=discard
+fun:rte_eth_iterator_next=discard
+fun:rte_eth_led_off=discard
+fun:rte_eth_led_on=discard
+fun:rte_eth_link_get=discard
+fun:rte_eth_link_get_nowait=discard
+fun:rte_eth_macaddr_get=discard
+fun:rte_eth_mirror_rule_reset=discard
+fun:rte_eth_mirror_rule_set=discard
+fun:rte_eth_promiscuous_disable=discard
+fun:rte_eth_promiscuous_enable=discard
+fun:rte_eth_promiscuous_get=discard
+fun:rte_eth_remove_rx_callback=discard
+fun:rte_eth_remove_tx_callback=discard
+fun:rte_eth_rx_queue_info_get=discard
+fun:rte_eth_rx_queue_setup=discard
+fun:rte_eth_set_queue_rate_limit=discard
+fun:rte_eth_speed_bitflag=discard
+fun:rte_eth_stats_get=discard
+fun:rte_eth_stats_reset=discard
+fun:rte_eth_switch_domain_alloc=discard
+fun:rte_eth_switch_domain_free=discard
+fun:rte_eth_timesync_adjust_time=discard
+fun:rte_eth_timesync_disable=discard
+fun:rte_eth_timesync_enable=discard
+fun:rte_eth_timesync_read_rx_timestamp=discard
+fun:rte_eth_timesync_read_time=discard
+fun:rte_eth_timesync_read_tx_timestamp=discard
+fun:rte_eth_timesync_write_time=discard
+fun:rte_eth_tx_buffer_count_callback=discard
+fun:rte_eth_tx_buffer_drop_callback=discard
+fun:rte_eth_tx_buffer_init=discard
+fun:rte_eth_tx_buffer_set_err_callback=discard
+fun:rte_eth_tx_done_cleanup=discard
+fun:rte_eth_tx_queue_info_get=discard
+fun:rte_eth_tx_queue_setup=discard
+fun:rte_eth_xstats_get=discard
+fun:rte_eth_xstats_get_by_id=discard
+fun:rte_eth_xstats_get_id_by_name=discard
+fun:rte_eth_xstats_get_names=discard
+fun:rte_eth_xstats_get_names_by_id=discard
+fun:rte_eth_xstats_reset=discard
+fun:rte_flow_conv=discard
+fun:rte_flow_copy=discard
+fun:rte_flow_create=discard
+fun:rte_flow_destroy=discard
+fun:rte_flow_error_set=discard
+fun:rte_flow_expand_rss=discard
+fun:rte_flow_flush=discard
+fun:rte_flow_isolate=discard
+fun:rte_flow_ops_get=discard
+fun:rte_flow_query=discard
+fun:rte_flow_validate=discard
+fun:rte_mtr_capabilities_get=discard
+fun:rte_mtr_create=discard
+fun:rte_mtr_destroy=discard
+fun:rte_mtr_meter_disable=discard
+fun:rte_mtr_meter_dscp_table_update=discard
+fun:rte_mtr_meter_enable=discard
+fun:rte_mtr_meter_profile_add=discard
+fun:rte_mtr_meter_profile_delete=discard
+fun:rte_mtr_meter_profile_update=discard
+fun:rte_mtr_ops_get=discard
+fun:rte_mtr_policer_actions_update=discard
+fun:rte_mtr_stats_read=discard
+fun:rte_mtr_stats_update=discard
+fun:rte_tm_capabilities_get=discard
+fun:rte_tm_get_number_of_leaf_nodes=discard
+fun:rte_tm_hierarchy_commit=discard
+fun:rte_tm_level_capabilities_get=discard
+fun:rte_tm_mark_ip_dscp=discard
+fun:rte_tm_mark_ip_ecn=discard
+fun:rte_tm_mark_vlan_dei=discard
+fun:rte_tm_node_add=discard
+fun:rte_tm_node_capabilities_get=discard
+fun:rte_tm_node_cman_update=discard
+fun:rte_tm_node_delete=discard
+fun:rte_tm_node_parent_update=discard
+fun:rte_tm_node_resume=discard
+fun:rte_tm_node_shaper_update=discard
+fun:rte_tm_node_shared_shaper_update=discard
+fun:rte_tm_node_shared_wred_context_update=discard
+fun:rte_tm_node_stats_read=discard
+fun:rte_tm_node_stats_update=discard
+fun:rte_tm_node_suspend=discard
+fun:rte_tm_node_type_get=discard
+fun:rte_tm_node_wfq_weight_mode_update=discard
+fun:rte_tm_node_wred_context_update=discard
+fun:rte_tm_ops_get=discard
+fun:rte_tm_shaper_profile_add=discard
+fun:rte_tm_shaper_profile_delete=discard
+fun:rte_tm_shared_shaper_add_update=discard
+fun:rte_tm_shared_shaper_delete=discard
+fun:rte_tm_shared_wred_context_add_update=discard
+fun:rte_tm_shared_wred_context_delete=discard
+fun:rte_tm_wred_profile_add=discard
+fun:rte_tm_wred_profile_delete=discard
+fun:gro_tcp4_reassemble=discard
+fun:gro_tcp4_tbl_create=discard
+fun:gro_tcp4_tbl_destroy=discard
+fun:gro_tcp4_tbl_pkt_count=discard
+fun:gro_tcp4_tbl_timeout_flush=discard
+fun:gro_vxlan_tcp4_reassemble=discard
+fun:gro_vxlan_tcp4_tbl_create=discard
+fun:gro_vxlan_tcp4_tbl_destroy=discard
+fun:gro_vxlan_tcp4_tbl_pkt_count=discard
+fun:gro_vxlan_tcp4_tbl_timeout_flush=discard
+fun:rte_gro_ctx_create=discard
+fun:rte_gro_ctx_destroy=discard
+fun:rte_gro_get_pkt_count=discard
+fun:rte_gro_reassemble=discard
+fun:rte_gro_reassemble_burst=discard
+fun:rte_gro_timeout_flush=discard
+fun:rte_hash_add_key=discard
+fun:rte_hash_add_key_data=discard
+fun:rte_hash_add_key_with_hash=discard
+fun:rte_hash_add_key_with_hash_data=discard
+fun:rte_hash_count=discard
+fun:rte_hash_create=discard
+fun:rte_hash_del_key=discard
+fun:rte_hash_del_key_with_hash=discard
+fun:rte_hash_find_existing=discard
+fun:rte_hash_free=discard
+fun:rte_hash_free_key_with_position=discard
+fun:rte_hash_get_key_with_position=discard
+fun:rte_hash_hash=discard
+fun:rte_hash_iterate=discard
+fun:rte_hash_lookup=discard
+fun:rte_hash_lookup_bulk=discard
+fun:rte_hash_lookup_bulk_data=discard
+fun:rte_hash_lookup_data=discard
+fun:rte_hash_lookup_with_hash=discard
+fun:rte_hash_lookup_with_hash_data=discard
+fun:rte_hash_reset=discard
+fun:rte_hash_set_cmp_func=discard
+fun:rte_fbk_hash_create=discard
+fun:rte_fbk_hash_find_existing=discard
+fun:rte_fbk_hash_free=discard
+fun:ip_frag_find=discard
+fun:ip_frag_lookup=discard
+fun:ip_frag_process=discard
+fun:rte_frag_table_del_expired_entries=discard
+fun:rte_ip_frag_free_death_row=discard
+fun:rte_ip_frag_table_create=discard
+fun:rte_ip_frag_table_destroy=discard
+fun:rte_ip_frag_table_statistics_dump=discard
+fun:rte_ipv4_fragment_packet=discard
+fun:ipv4_frag_reassemble=discard
+fun:rte_ipv4_frag_reassemble_packet=discard
+fun:rte_ipv6_fragment_packet=discard
+fun:ipv6_frag_reassemble=discard
+fun:rte_ipv6_frag_reassemble_packet=discard
+fun:rte_kvargs_count=discard
+fun:rte_kvargs_free=discard
+fun:rte_kvargs_parse=discard
+fun:rte_kvargs_parse_delim=discard
+fun:rte_kvargs_process=discard
+fun:rte_kvargs_strcmp=discard
+fun:__rte_pktmbuf_read=discard
+fun:rte_get_rx_ol_flag_list=discard
+fun:rte_get_rx_ol_flag_name=discard
+fun:rte_get_tx_ol_flag_list=discard
+fun:rte_get_tx_ol_flag_name=discard
+fun:rte_mbuf_sanity_check=discard
+fun:rte_pktmbuf_dump=discard
+fun:rte_pktmbuf_dynamic_pool_create=discard
+fun:rte_pktmbuf_init=discard
+fun:rte_pktmbuf_pool_create=discard
+fun:rte_pktmbuf_pool_create_by_ops=discard
+fun:rte_pktmbuf_pool_init=discard
+fun:rte_mbuf_best_mempool_ops=discard
+fun:rte_mbuf_platform_mempool_ops=discard
+fun:rte_mbuf_set_platform_mempool_ops=discard
+fun:rte_mbuf_set_user_mempool_ops=discard
+fun:rte_mbuf_user_mempool_ops=discard
+fun:rte_get_ptype_inner_l2_name=discard
+fun:rte_get_ptype_inner_l3_name=discard
+fun:rte_get_ptype_inner_l4_name=discard
+fun:rte_get_ptype_l2_name=discard
+fun:rte_get_ptype_l3_name=discard
+fun:rte_get_ptype_l4_name=discard
+fun:rte_get_ptype_name=discard
+fun:rte_get_ptype_tunnel_name=discard
+fun:rte_mempool_audit=discard
+fun:rte_mempool_avail_count=discard
+fun:rte_mempool_cache_create=discard
+fun:rte_mempool_cache_free=discard
+fun:rte_mempool_calc_obj_size=discard
+fun:rte_mempool_check_cookies=discard
+fun:rte_mempool_contig_blocks_check_cookies=discard
+fun:rte_mempool_create=discard
+fun:rte_mempool_create_empty=discard
+fun:rte_mempool_dump=discard
+fun:rte_mempool_free=discard
+fun:rte_mempool_in_use_count=discard
+fun:rte_mempool_list_dump=discard
+fun:rte_mempool_lookup=discard
+fun:rte_mempool_mem_iter=discard
+fun:rte_mempool_obj_iter=discard
+fun:rte_mempool_populate_anon=discard
+fun:rte_mempool_populate_default=discard
+fun:rte_mempool_populate_iova=discard
+fun:rte_mempool_populate_virt=discard
+fun:rte_mempool_walk=discard
+fun:rte_mempool_ops_alloc=discard
+fun:rte_mempool_ops_calc_mem_size=discard
+fun:rte_mempool_ops_free=discard
+fun:rte_mempool_ops_get_count=discard
+fun:rte_mempool_ops_get_info=discard
+fun:rte_mempool_ops_populate=discard
+fun:rte_mempool_register_ops=discard
+fun:rte_mempool_set_ops_byname=discard
+fun:rte_mempool_op_calc_mem_size_default=discard
+fun:rte_mempool_op_populate_default=discard
+fun:rte_metrics_get_names=discard
+fun:rte_metrics_get_values=discard
+fun:rte_metrics_init=discard
+fun:rte_metrics_reg_name=discard
+fun:rte_metrics_reg_names=discard
+fun:rte_metrics_update_value=discard
+fun:rte_metrics_update_values=discard
+fun:rte_net_make_rarp_packet=discard
+fun:rte_net_get_ptype=discard
+fun:rte_net_skip_ip6_ext=discard
+fun:rte_net_crc_calc=discard
+fun:rte_net_crc_set_alg=discard
+fun:eal_parse_pci_BDF=discard
+fun:eal_parse_pci_DomBDF=discard
+fun:pci_map_resource=discard
+fun:pci_unmap_resource=discard
+fun:rte_eal_compare_pci_addr=discard
+fun:rte_pci_addr_cmp=discard
+fun:rte_pci_addr_parse=discard
+fun:rte_pci_device_name=discard
+fun:rte_eth_from_ring=discard
+fun:rte_eth_from_rings=discard
+fun:sock_support_features=discard
+fun:vhost_kernel_open_sock=discard
+fun:vhost_kernel_set_sock=discard
+fun:tap_support_features=discard
+fun:vhost_kernel_open_tap=discard
+fun:eth_virtio_dev_init=discard
+fun:virtio_dev_pause=discard
+fun:virtio_dev_resume=discard
+fun:virtio_inject_pkts=discard
+fun:virtio_interrupt_handler=discard
+fun:vtpci_get_status=discard
+fun:vtpci_init=discard
+fun:vtpci_isr=discard
+fun:vtpci_msix_detect=discard
+fun:vtpci_negotiate_features=discard
+fun:vtpci_read_dev_config=discard
+fun:vtpci_reinit_complete=discard
+fun:vtpci_reset=discard
+fun:vtpci_set_status=discard
+fun:vtpci_write_dev_config=discard
+fun:virtio_dev_cq_start=discard
+fun:virtio_dev_rx_queue_done=discard
+fun:virtio_dev_rx_queue_setup=discard
+fun:virtio_dev_rx_queue_setup_finish=discard
+fun:virtio_dev_tx_queue_setup=discard
+fun:virtio_dev_tx_queue_setup_finish=discard
+fun:virtio_recv_mergeable_pkts=discard
+fun:virtio_recv_mergeable_pkts_inorder=discard
+fun:virtio_recv_pkts=discard
+fun:virtio_xmit_pkts=discard
+fun:virtio_xmit_pkts_inorder=discard
+fun:vq_ring_free_chain=discard
+fun:vq_ring_free_inorder=discard
+fun:virtio_rxq_vec_setup=discard
+fun:virtio_recv_pkts_vec=discard
+fun:is_vhost_user_by_type=discard
+fun:virtio_user_dev_init=discard
+fun:virtio_user_dev_uninit=discard
+fun:virtio_user_handle_cq=discard
+fun:virtio_user_handle_mq=discard
+fun:virtio_user_start_device=discard
+fun:virtio_user_stop_device=discard
+fun:virtqueue_detach_unused=discard
+fun:virtqueue_rxvq_flush=discard
+fun:rte_ring_create=discard
+fun:rte_ring_dump=discard
+fun:rte_ring_free=discard
+fun:rte_ring_get_memsize=discard
+fun:rte_ring_init=discard
+fun:rte_ring_list_dump=discard
+fun:rte_ring_lookup=discard
+fun:rte_timer_dump_stats=discard
+fun:rte_timer_init=discard
+fun:rte_timer_manage=discard
+fun:rte_timer_pending=discard
+fun:rte_timer_reset=discard
+fun:rte_timer_reset_sync=discard
+fun:rte_timer_stop=discard
+fun:rte_timer_stop_sync=discard
+fun:rte_timer_subsystem_init=discard
diff --git a/angora/run.sh b/angora/run.sh
new file mode 100644
index 0000000..be215f1
--- /dev/null
+++ b/angora/run.sh
@@ -0,0 +1 @@
+~/git/Angora/angora_fuzzer -M 2048 -i seeds -o output -t /root/git/uss/angora/tcp_lo.taint -- /root/git/uss/angora/tcp_lo.fast 127.0.0.1 1234 @@
diff --git a/angora/seeds/seed.txt b/angora/seeds/seed.txt
new file mode 100644
index 0000000..f534deb
--- /dev/null
+++ b/angora/seeds/seed.txt
@@ -0,0 +1 @@
+Hello World.
diff --git a/dpdk/Makefile b/dpdk/Makefile
index 15204fa..5d92719 100644
--- a/dpdk/Makefile
+++ b/dpdk/Makefile
@@ -21,10 +21,12 @@ DPDK_PKTMBUF_HEADROOM ?= 128
DPDK_MARCH ?= native
DPDK_TUNE ?= generic
DPDK_DEBUG ?= n
+DPDK_DESTDIR ?= $(CURDIR)/install
+PACKETDRILL ?= n
B := $(DPDK_BUILD_DIR)
I := $(DPDK_INSTALL_DIR)
-DPDK_GIT_REPO ?= http://dpdk.org/git/dpdk
+DPDK_GIT_REPO ?= http://dpdk.org/git/dpdk -b v18.11
DPDK_SOURCE := $(B)/dpdk
ifneq (,$(findstring clang,$(CC)))
@@ -40,8 +42,8 @@ endif
JOBS := $(shell grep processor /proc/cpuinfo | wc -l)
# compiler/linker custom arguments
-DPDK_CPU_CFLAGS := -pie -fPIC
-DPDK_CPU_LDFLAGS :=
+DPDK_CPU_CFLAGS := -fPIC
+DPDK_CPU_LDFLAGS := -r
DPDK_EXTRA_LDFLAGS := -g
ifeq ($(DPDK_DEBUG),n)
@@ -78,6 +80,7 @@ DPDK_MAKE_ARGS := -C $(DPDK_SOURCE) -j $(JOBS) \
EXTRA_LDFLAGS="$(DPDK_EXTRA_LDFLAGS)" \
CPU_CFLAGS="$(DPDK_CPU_CFLAGS)" \
CPU_LDFLAGS="$(DPDK_CPU_LDFLAGS)" \
+ DESTDIR="$(DPDK_DESTDIR)" \
$(DPDK_MAKE_EXTRA_ARGS)
DPDK_SOURCE_FILES := $(shell [ -e $(DPDK_SOURCE) ] && \
@@ -102,7 +105,7 @@ $(B)/custom-config: $(B)/.patch.ok Makefile
$(call set,RTE_MAX_LCORE,256)
$(call set,RTE_PKTMBUF_HEADROOM,$(DPDK_PKTMBUF_HEADROOM))
$(call set,RTE_LIBEAL_USE_HPET,y)
- $(call set,RTE_BUILD_COMBINE_LIBS,y)
+ $(call set,RTE_BUILD_COMBINE_LIBS,n)
$(call set,RTE_LIBRTE_I40E_16BYTE_RX_DESC,y)
$(call set,RTE_LIBRTE_I40E_ITR_INTERVAL,16)
$(call set,RTE_LIBRTE_PMD_PCAP,y)
@@ -115,13 +118,101 @@ $(B)/custom-config: $(B)/.patch.ok Makefile
$(call set,RTE_LIBRTE_PMD_BOND,y)
$(call set,RTE_LIBRTE_IP_FRAG,y)
@# not needed
+ $(call set,RTE_LIBRTE_TIMER,y)
$(call set,RTE_LIBRTE_CFGFILE,n)
+ $(call set,RTE_LIBRTE_LPM,y)
+ $(call set,RTE_LIBRTE_ACL,n)
$(call set,RTE_LIBRTE_POWER,n)
$(call set,RTE_LIBRTE_DISTRIBUTOR,n)
$(call set,RTE_LIBRTE_REORDER,n)
+ $(call set,RTE_LIBRTE_PORT,n)
+ $(call set,RTE_LIBRTE_TABLE,n)
+ $(call set,RTE_LIBRTE_PIPELINE,n)
$(call set,RTE_LIBRTE_FLOW_CLASSIFY,n)
$(call set,RTE_LIBRTE_PMD_CRYPTO_SCHEDULER,n)
$(call set,RTE_KNI_KMOD,n)
+ $(call set,RTE_LIBRTE_ENA_PMD,n)
+ $(call set,RTE_LIBRTE_FM10K_PMD,n)
+ $(call set,RTE_LIBRTE_CXGBE_PMD,n)
+ $(call set,RTE_LIBRTE_ENIC_PMD,n)
+ $(call set,RTE_LIBRTE_BNXT_PMD,n)
+ $(call set,RTE_LIBRTE_SFC_EFX_PMD,n)
+ $(call set,RTE_LIBRTE_PMD_SOFTNIC,n)
+ $(call set,RTE_LIBRTE_THUNDERX_NICVF_PMD,n)
+ $(call set,RTE_LIBRTE_LIO_PMD,n)
+ $(call set,RTE_LIBRTE_OCTEONTX_PMD,n)
+ $(call set,RTE_LIBRTE_VMXNET3_PMD,n)
+ $(call set,RTE_LIBRTE_QEDE_PMD,n)
+ $(call set,RTE_LIBRTE_ARK_PMD,n)
+ $(call set,RTE_LIBRTE_PMD_NULL,n)
+ $(call set,RTE_LIBRTE_CRYPTODEV,n)
+ $(call set,RTE_LIBRTE_PMD_NULL_CRYPTO,n)
+ $(call set,RTE_LIBRTE_SECURITY,n)
+ $(call set,RTE_LIBRTE_EVENTDEV,n)
+ $(call set,RTE_LIBRTE_PMD_SKELETON_EVENTDEV,n)
+ $(call set,RTE_LIBRTE_PMD_OCTEONTX_SSOVF,n)
+ $(call set,RTE_LIBRTE_OCTEONTX_MEMPOOL,n)
+ $(call set,RTE_LIBRTE_EFD,n)
+ $(call set,RTE_LIBRTE_MEMBER,n)
+ $(call set,RTE_LIBRTE_JOBSTATS,n)
+ $(call set,RTE_LIBRTE_METER,n)
+ $(call set,RTE_LIBRTE_SCHED,n)
+ $(call set,RTE_APP_TEST,n)
+ $(call set,RTE_APP_CRYPTO_PERF,n)
+ $(call set,RTE_APP_EVENTDEV,n)
+ $(call set,RTE_LIBRTE_PMD_FAILSAFE,n)
+ $(call set,RTE_LIBRTE_EM_PMD,n)
+ $(call set,RTE_LIBRTE_IGB_PMD,n)
+ $(call set,RTE_LIBRTE_LATENCY_STATS,n)
+ $(call set,RTE_EAL_IGB_UIO,n)
+ $(call set,RTE_LIBRTE_KNI,n)
+ $(call set,RTE_LIBRTE_PMD_KNI,n)
+ $(call set,RTE_KNI_KMOD,n)
+ $(call set,RTE_KNI_KMOD_ETHTOOL,n)
+ $(call set,RTE_LIBRTE_BITRATE,n)
+ $(call set,RTE_LIBRTE_METRICS,y)
+ $(call set,RTE_LIBRTE_AVP_PMD,n)
+ $(call set,RTE_LIBRTE_NFP_PMD,n)
+ $(call set,RTE_LIBRTE_PMD_TAP,n)
+ $(call set,RTE_LIBRTE_VHOST,$(PACKETDRILL))
+ $(call set,RTE_LIBRTE_IFC_PMD,n)
+ $(call set,RTE_LIBRTE_PMD_VHOST,n)
+ $(call set,RTE_PROC_INFO,n)
+ $(call set,RTE_TEST_PMD,n)
+ $(call set,RTE_LIBRTE_FSLMC_BUS,n)
+ $(call set,RTE_LIBRTE_DPAA_BUS,n)
+ $(call set,RTE_LIBRTE_VMBUS,n)
+ $(call set,RTE_LIBRTE_IFPGA_BUS,n)
+ $(call set,RTE_LIBRTE_BPF,n)
+ $(call set,RTE_LIBRTE_COMPRESSDEV,n)
+ $(call set,RTE_LIBRTE_VDEV_NETVSC_PMD,n)
+ $(call set,RTE_LIBRTE_NETVSC_PMD,n)
+ $(call set,RTE_LIBRTE_RAWDEV,n)
+ $(call set,RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT,n)
+ $(call set,RTE_LIBRTE_AXGBE_PMD,n)
+ $(call set,RTE_LIBRTE_AVF_PMD,n)
+ $(call set,RTE_LIBRTE_BBDEV,n)
+ $(call set,RTE_LIBRTE_IP_FRAG_MAX_FRAG,48)
+ $(call set,RTE_MAX_NUMA_NODES,2)
+ $(call set,RTE_MAX_ETHPORTS,4)
+ $(call set,RTE_MAX_QUEUES_PER_PORT,8)
+ $(call set,RTE_LIBRTE_I40E_PMD,n)
+ $(call set,RTE_LIBRTE_IXGBE_PMD,n)
+ $(call set,RTE_LIBRTE_ENETC_PMD,n)
+ $(call set,RTE_LIBRTE_PMD_BOND,n)
+ $(call set,RTE_LIBRTE_ATLANTIC_PMD,n)
+ $(call set,RTE_LIBRTE_GSO,n)
+ $(call set,RTE_MAX_VFIO_GROUPS,4)
+ $(call set,RTE_MAX_VFIO_CONTAINERS,4)
+ $(call set,RTE_LIBRTE_COMMON_DPAAX,n)
+ $(call set,RTE_LIBRTE_PMD_OCTEONTX_CRYPTO,n)
+ $(call set,RTE_EAL_NUMA_AWARE_HUGEPAGES,n)
+ $(call set,RTE_DRIVER_MEMPOOL_STACK,y)
+ $(call set,RTE_DRIVER_MEMPOOL_BUCKET,n)
+ $(call set,RTE_LIBRTE_PMD_QAT,n)
+ $(call set,RTE_LIBRTE_PMD_AF_PACKET,n)
+ $(call set,RTE_MAX_MEM_MB,1024)
+ $(call set,RTE_LIBRTE_PDUMP,n)
@rm -f .config.ok
$(B)/.download.ok:
@@ -165,4 +256,4 @@ build: $(B)/.build.ok
.PHONY: clean
clean:
- @rm -rf $(B) $(I)
+ @rm -rf $(DPDK_BUILD_DIR) $(DPDK_DESTDIR)
diff --git a/dpdk/dpdk-v18.11_patches/0001-eal-don-t-start-the-interrupt-mp-thread.patch b/dpdk/dpdk-v18.11_patches/0001-eal-don-t-start-the-interrupt-mp-thread.patch
new file mode 100644
index 0000000..770bf05
--- /dev/null
+++ b/dpdk/dpdk-v18.11_patches/0001-eal-don-t-start-the-interrupt-mp-thread.patch
@@ -0,0 +1,35 @@
+From f68558b0ccbddb4cc81aca36befa0a7730ee051c Mon Sep 17 00:00:00 2001
+From: Jianfeng Tan <henry.tjf@antfin.com>
+Date: Wed, 29 Aug 2018 14:24:01 +0000
+Subject: [PATCH 7/9] eal: don't start the interrupt mp thread
+
+---
+ lib/librte_eal/common/eal_common_proc.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
+index 9fcb91219..79d9e6bbe 100644
+--- a/lib/librte_eal/common/eal_common_proc.c
++++ b/lib/librte_eal/common/eal_common_proc.c
+@@ -615,6 +615,7 @@ rte_mp_channel_init(void)
+ return -1;
+ }
+
++#if 0
+ if (rte_ctrl_thread_create(&mp_handle_tid, "rte_mp_handle",
+ NULL, mp_handle, NULL) < 0) {
+ RTE_LOG(ERR, EAL, "failed to create mp thead: %s\n",
+@@ -624,6 +625,10 @@ rte_mp_channel_init(void)
+ mp_fd = -1;
+ return -1;
+ }
++#else
++ RTE_SET_USED(mp_handle);
++ RTE_SET_USED(mp_handle_tid);
++#endif
+
+ /* unlock the directory */
+ flock(dir_fd, LOCK_UN);
+--
+2.17.1
+
diff --git a/dpdk/dpdk-v18.11_patches/0002-eal-prioritize-constructor.patch b/dpdk/dpdk-v18.11_patches/0002-eal-prioritize-constructor.patch
new file mode 100644
index 0000000..9d2959f
--- /dev/null
+++ b/dpdk/dpdk-v18.11_patches/0002-eal-prioritize-constructor.patch
@@ -0,0 +1,25 @@
+From 7fe32567994a8ce782fa8406613bade1d2100dca Mon Sep 17 00:00:00 2001
+From: Jianfeng Tan <henry.tjf@antfin.com>
+Date: Wed, 29 Aug 2018 14:14:09 +0000
+Subject: [PATCH 2/9] eal: prioritize constructor
+
+---
+ lib/librte_eal/common/include/rte_common.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/lib/librte_eal/common/include/rte_common.h b/lib/librte_eal/common/include/rte_common.h
+index 069c13ec7..a635f5be4 100644
+--- a/lib/librte_eal/common/include/rte_common.h
++++ b/lib/librte_eal/common/include/rte_common.h
+@@ -84,7 +84,7 @@ typedef uint16_t unaligned_uint16_t;
+ #define RTE_PRIORITY_LOG 101
+ #define RTE_PRIORITY_BUS 110
+ #define RTE_PRIORITY_CLASS 120
+-#define RTE_PRIORITY_LAST 65535
++#define RTE_PRIORITY_LAST 130
+
+ #define RTE_PRIO(prio) \
+ RTE_PRIORITY_ ## prio
+--
+2.17.1
+
diff --git a/dpdk/dpdk-v18.11_patches/0003-mbuf-add-single-linked-list.patch b/dpdk/dpdk-v18.11_patches/0003-mbuf-add-single-linked-list.patch
new file mode 100644
index 0000000..7430d1e
--- /dev/null
+++ b/dpdk/dpdk-v18.11_patches/0003-mbuf-add-single-linked-list.patch
@@ -0,0 +1,33 @@
+From 1416ff5de58922dc32eb2fb9ce2b9b970282136c Mon Sep 17 00:00:00 2001
+From: Jianfeng Tan <henry.tjf@antfin.com>
+Date: Wed, 29 Aug 2018 14:18:13 +0000
+Subject: [PATCH 3/9] mbuf: add single linked list
+
+---
+ lib/librte_mbuf/rte_mbuf.h | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
+index 9ce5d76d7..0081bd6d7 100644
+--- a/lib/librte_mbuf/rte_mbuf.h
++++ b/lib/librte_mbuf/rte_mbuf.h
+@@ -593,6 +593,8 @@ struct rte_mbuf {
+ */
+ struct rte_mbuf_ext_shared_info *shinfo;
+
++ struct rte_mbuf *next_pkt;
++
+ } __rte_cache_aligned;
+
+ /**
+@@ -1237,6 +1239,7 @@ static inline void rte_pktmbuf_reset_headroom(struct rte_mbuf *m)
+ static inline void rte_pktmbuf_reset(struct rte_mbuf *m)
+ {
+ m->next = NULL;
++ m->next_pkt = NULL;
+ m->pkt_len = 0;
+ m->tx_offload = 0;
+ m->vlan_tci = 0;
+--
+2.17.1
+
diff --git a/dpdk/dpdk-v18.11_patches/0004-net-virtio-user-add-rss-update-for-virtio-user.patch b/dpdk/dpdk-v18.11_patches/0004-net-virtio-user-add-rss-update-for-virtio-user.patch
new file mode 100644
index 0000000..e4eb8e7
--- /dev/null
+++ b/dpdk/dpdk-v18.11_patches/0004-net-virtio-user-add-rss-update-for-virtio-user.patch
@@ -0,0 +1,43 @@
+From 9bbe20eda858fd7fcbd8f137e5f96f51d571a556 Mon Sep 17 00:00:00 2001
+From: Jianfeng Tan <henry.tjf@antfin.com>
+Date: Wed, 29 Aug 2018 14:20:51 +0000
+Subject: [PATCH 4/9] net/virtio-user: add rss update for virtio-user
+
+---
+ drivers/net/virtio/virtio_ethdev.c | 13 +++++++++++++
+ 1 file changed, 13 insertions(+)
+
+diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c
+index 614357da7..e7336cde9 100644
+--- a/drivers/net/virtio/virtio_ethdev.c
++++ b/drivers/net/virtio/virtio_ethdev.c
+@@ -738,6 +738,18 @@ virtio_dev_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id)
+ return 0;
+ }
+
++/* rss_hash_update callback for virtio: the request is accepted without any
++ * action (returns 0) when the port is a virtio-user device, and rejected with
++ * -1 for every other virtio device. rss_conf itself is never inspected. */
++static int
++virtio_rss_hash_update(struct rte_eth_dev *dev,
++		       struct rte_eth_rss_conf *rss_conf __rte_unused)
++{
++	struct virtio_hw *hw = dev->data->dev_private;
++
++	return hw->virtio_user_dev ? 0 : -1;
++}
++
+ /*
+ * dev_ops for virtio, bare necessities for basic operation
+ */
+@@ -772,6 +784,7 @@ static const struct eth_dev_ops virtio_eth_dev_ops = {
+ .mac_addr_add = virtio_mac_addr_add,
+ .mac_addr_remove = virtio_mac_addr_remove,
+ .mac_addr_set = virtio_mac_addr_set,
++ .rss_hash_update = virtio_rss_hash_update,
+ };
+
+ static void
+--
+2.17.1
+
diff --git a/dpdk/dpdk-v18.11_patches/0005-net-virtio-user-support-raw-socket-as-backend.patch b/dpdk/dpdk-v18.11_patches/0005-net-virtio-user-support-raw-socket-as-backend.patch
new file mode 100644
index 0000000..1d950c5
--- /dev/null
+++ b/dpdk/dpdk-v18.11_patches/0005-net-virtio-user-support-raw-socket-as-backend.patch
@@ -0,0 +1,645 @@
+From 307f7debe0f2143e70659b7a082537077b20d185 Mon Sep 17 00:00:00 2001
+From: Jianfeng Tan <henry.tjf@antfin.com>
+Date: Thu, 19 Jul 2018 11:25:22 +0000
+Subject: [PATCH] net/virtio-user: support raw socket as backend
+
+We will support tapfd or raw socket fd opened by application and
+passed into virtio-user for initialization.
+
+Note if there are multiple queue pairs, users are still supposed
+to pass down the iface name with the first queue pair fd passed
+through this parameter.
+
+Signed-off-by: Jianfeng Tan <henry.tjf@antfin.com>
+---
+ drivers/net/virtio/Makefile | 1 +
+ drivers/net/virtio/virtio_user/vhost_kernel.c | 78 ++++++---
+ drivers/net/virtio/virtio_user/vhost_kernel.h | 15 ++
+ .../virtio/virtio_user/vhost_kernel_sock.c | 156 ++++++++++++++++++
+ .../net/virtio/virtio_user/vhost_kernel_tap.c | 64 ++++++-
+ .../net/virtio/virtio_user/vhost_kernel_tap.h | 39 -----
+ .../net/virtio/virtio_user/virtio_user_dev.c | 16 +-
+ .../net/virtio/virtio_user/virtio_user_dev.h | 3 +-
+ drivers/net/virtio/virtio_user_ethdev.c | 20 ++-
+ 9 files changed, 318 insertions(+), 74 deletions(-)
+ create mode 100644 drivers/net/virtio/virtio_user/vhost_kernel.h
+ create mode 100644 drivers/net/virtio/virtio_user/vhost_kernel_sock.c
+ delete mode 100644 drivers/net/virtio/virtio_user/vhost_kernel_tap.h
+
+diff --git a/drivers/net/virtio/Makefile b/drivers/net/virtio/Makefile
+index 6c2c9967b..2e1fc9b5e 100644
+--- a/drivers/net/virtio/Makefile
++++ b/drivers/net/virtio/Makefile
+@@ -41,6 +41,7 @@ ifeq ($(CONFIG_RTE_VIRTIO_USER),y)
+ SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_user.c
+ SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_kernel.c
+ SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_kernel_tap.c
++SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_kernel_sock.c
+ SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/virtio_user_dev.c
+ SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user_ethdev.c
+ endif
+diff --git a/drivers/net/virtio/virtio_user/vhost_kernel.c b/drivers/net/virtio/virtio_user/vhost_kernel.c
+index 6b19180d7..fa84287f5 100644
+--- a/drivers/net/virtio/virtio_user/vhost_kernel.c
++++ b/drivers/net/virtio/virtio_user/vhost_kernel.c
+@@ -6,13 +6,14 @@
+ #include <sys/stat.h>
+ #include <fcntl.h>
+ #include <unistd.h>
++#include <sys/ioctl.h>
+
+ #include <rte_memory.h>
+ #include <rte_eal_memconfig.h>
+
+ #include "vhost.h"
+ #include "virtio_user_dev.h"
+-#include "vhost_kernel_tap.h"
++#include "vhost_kernel.h"
+
+ struct vhost_memory_kernel {
+ uint32_t nregions;
+@@ -152,27 +153,25 @@ prepare_vhost_memory_kernel(void)
+ (1ULL << VIRTIO_NET_F_HOST_TSO6) | \
+ (1ULL << VIRTIO_NET_F_CSUM))
+
+-static unsigned int
+-tap_support_features(void)
++#define PATH_SYS_CLASS_NET "/sys/class/net"
++
++static int
++vhost_kernel_is_tap(struct virtio_user_dev *dev)
+ {
+- int tapfd;
+- unsigned int tap_features;
++ char path[128];
+
+- tapfd = open(PATH_NET_TUN, O_RDWR);
+- if (tapfd < 0) {
+- PMD_DRV_LOG(ERR, "fail to open %s: %s",
+- PATH_NET_TUN, strerror(errno));
+- return -1;
+- }
++ if (dev->ifname == NULL)
++ return 0;
+
+- if (ioctl(tapfd, TUNGETFEATURES, &tap_features) == -1) {
+- PMD_DRV_LOG(ERR, "TUNGETFEATURES failed: %s", strerror(errno));
+- close(tapfd);
+- return -1;
+- }
++ snprintf(path, 128, PATH_SYS_CLASS_NET"/%s", dev->ifname);
++ if(access(path, F_OK) == -1)
++ return 1;
+
+- close(tapfd);
+- return tap_features;
++ snprintf(path, 128, PATH_SYS_CLASS_NET"/%s/tun_flags", dev->ifname);
++ if(access(path, F_OK) != -1)
++ return 1;
++
++ return 0;
+ }
+
+ static int
+@@ -186,7 +185,6 @@ vhost_kernel_ioctl(struct virtio_user_dev *dev,
+ struct vhost_memory_kernel *vm = NULL;
+ int vhostfd;
+ unsigned int queue_sel;
+- unsigned int features;
+
+ PMD_DRV_LOG(INFO, "%s", vhost_msg_strings[req]);
+
+@@ -240,21 +238,36 @@ vhost_kernel_ioctl(struct virtio_user_dev *dev,
+ }
+
+ if (!ret && req_kernel == VHOST_GET_FEATURES) {
+- features = tap_support_features();
+- /* with tap as the backend, all these features are supported
++ int vnet_hdr, mq;
++
++ if (vhost_kernel_is_tap(dev))
++ tap_support_features(&vnet_hdr, &mq);
++ else
++ sock_support_features(dev->be_fd, &vnet_hdr, &mq);
++
++ /* with kernel vhost, all these features are supported
+ * but not claimed by vhost-net, so we add them back when
+ * reporting to upper layer.
+ */
+- if (features & IFF_VNET_HDR) {
++ if (vnet_hdr) {
+ *((uint64_t *)arg) |= VHOST_KERNEL_GUEST_OFFLOADS_MASK;
+ *((uint64_t *)arg) |= VHOST_KERNEL_HOST_OFFLOADS_MASK;
+ }
+
+- /* vhost_kernel will not declare this feature, but it does
++ /* kernel vhost will not declare this feature, but it does
+ * support multi-queue.
+ */
+- if (features & IFF_MULTI_QUEUE)
++ if (mq)
+ *(uint64_t *)arg |= (1ull << VIRTIO_NET_F_MQ);
++
++ /* raw socket only supports vnet header size of 10, so we must
++ * eliminate below features.
++ */
++ if (!vhost_kernel_is_tap(dev) &&
++ vnet_hdr == sizeof(struct virtio_net_hdr)) {
++ *((uint64_t *)arg) &= ~(1ull << VIRTIO_NET_F_MRG_RXBUF);
++ *((uint64_t *)arg) &= ~(1ull << VIRTIO_F_VERSION_1);
++ }
+ }
+
+ if (vm)
+@@ -333,7 +346,8 @@ vhost_kernel_enable_queue_pair(struct virtio_user_dev *dev,
+
+ if (!enable) {
+ if (dev->tapfds[pair_idx] >= 0) {
+- close(dev->tapfds[pair_idx]);
++ if (dev->be_fd < 0)
++ close(dev->tapfds[pair_idx]);
+ dev->tapfds[pair_idx] = -1;
+ }
+ return vhost_kernel_set_backend(vhostfd, -1);
+@@ -347,8 +361,18 @@ vhost_kernel_enable_queue_pair(struct virtio_user_dev *dev,
+ else
+ hdr_size = sizeof(struct virtio_net_hdr);
+
+- tapfd = vhost_kernel_open_tap(&dev->ifname, hdr_size, req_mq,
+- (char *)dev->mac_addr, dev->features);
++ if (vhost_kernel_is_tap(dev)) {
++ tapfd = vhost_kernel_open_tap(&dev->ifname, hdr_size,
++ req_mq, (char *)dev->mac_addr, dev->features);
++ } else {
++ if (pair_idx == 0 && dev->be_fd >= 0)
++ tapfd = vhost_kernel_set_sock(dev->be_fd,
++ hdr_size, req_mq);
++ else
++ tapfd = vhost_kernel_open_sock(dev->ifname,
++ hdr_size, dev->mac_addr, req_mq);
++ }
++
+ if (tapfd < 0) {
+ PMD_DRV_LOG(ERR, "fail to open tap for vhost kernel");
+ return -1;
+diff --git a/drivers/net/virtio/virtio_user/vhost_kernel.h b/drivers/net/virtio/virtio_user/vhost_kernel.h
+new file mode 100644
+index 000000000..75d6c5bf6
+--- /dev/null
++++ b/drivers/net/virtio/virtio_user/vhost_kernel.h
+@@ -0,0 +1,15 @@
++/* SPDX-License-Identifier: BSD-3-Clause
++ * Copyright(c) 2016 Intel Corporation
++ */
++
++int vhost_kernel_open_tap(char **p_ifname, int hdr_size, int req_mq,
++ const char *mac, uint64_t features);
++
++void tap_support_features(int *vnet_hdr, int *mq);
++
++int vhost_kernel_open_sock(char *ifname, int hdr_size,
++ uint8_t *mac, int req_mq);
++
++int vhost_kernel_set_sock(int sockfd, int hdr_size, int req_mq);
++
++void sock_support_features(int fd, int *vnet_hdr, int *mq);
+diff --git a/drivers/net/virtio/virtio_user/vhost_kernel_sock.c b/drivers/net/virtio/virtio_user/vhost_kernel_sock.c
+new file mode 100644
+index 000000000..7c2ace294
+--- /dev/null
++++ b/drivers/net/virtio/virtio_user/vhost_kernel_sock.c
+@@ -0,0 +1,156 @@
++/* SPDX-License-Identifier: BSD-3-Clause
++ * Copyright(c) 2018 Alibaba Group
++ * Copyright(c) 2018 Ant Financial Services Group
++ */
++
++#include <unistd.h>
++#include <sys/types.h>
++#include <sys/stat.h>
++#include <fcntl.h>
++#include <net/if.h>
++#include <net/if_arp.h>
++#include <errno.h>
++#include <string.h>
++#include <limits.h>
++#include <sys/socket.h>
++#include <arpa/inet.h>
++#include <netpacket/packet.h>
++#include <linux/if_ether.h>
++#include <sys/ioctl.h>
++
++#include <rte_ether.h>
++
++#include "../virtqueue.h"
++#include "../virtio_logs.h"
++#include "vhost_kernel.h"
++
++#ifndef PACKET_VNET_HDR
++#define PACKET_VNET_HDR 15
++#endif
++
++#ifndef PACKET_FANOUT
++#define PACKET_FANOUT 18
++#endif
++
++#ifndef PACKET_VNET_HDR_SZ
++#define PACKET_VNET_HDR_SZ 128
++#endif
++
++/* Probe a packet socket: *mq is 1 when a socket is available, *vnet_hdr is the
++ * vnet header size it accepts. A temporary socket is opened when fd < 0. */
++void
++sock_support_features(int fd, int *vnet_hdr, int *mq)
++{
++	int mrg_sz = sizeof(struct virtio_net_hdr_mrg_rxbuf);
++	int probe = fd;
++
++	if (probe < 0)
++		probe = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
++	if (probe < 0) {
++		*mq = 0;
++		*vnet_hdr = 0;
++		return;
++	}
++
++	*mq = 1;
++	/* Fall back to the plain header when the larger size is rejected. */
++	if (setsockopt(probe, SOL_PACKET, PACKET_VNET_HDR_SZ,
++		       (void *)&mrg_sz, sizeof(mrg_sz)) == 0)
++		*vnet_hdr = mrg_sz;
++	else
++		*vnet_hdr = sizeof(struct virtio_net_hdr);
++
++	if (fd < 0)
++		close(probe);
++}
++
++/* Configure sockfd as a vhost-kernel backend: vnet header, O_NONBLOCK and,
++ * when req_mq is set, PACKET_FANOUT. Returns sockfd, or -1 on failure. sockfd is
++ * never closed here: it may be the application's be_fd, so the opener owns it. */
++int
++vhost_kernel_set_sock(int sockfd, int hdr_size, int req_mq)
++{
++	int ret;
++	int fanout_type = 0; /* PACKET_FANOUT_HASH */
++
++	if (hdr_size == sizeof(struct virtio_net_hdr))
++		ret = setsockopt(sockfd, SOL_PACKET, PACKET_VNET_HDR,
++				 (void *)&hdr_size, sizeof(hdr_size));
++	else
++		ret = setsockopt(sockfd, SOL_PACKET, PACKET_VNET_HDR_SZ,
++				 (void *)&hdr_size, sizeof(hdr_size));
++	if (ret) {
++		PMD_DRV_LOG(ERR, "failed to set vnet hdr (%d): %s",
++			    hdr_size, strerror(errno));
++		return -1;
++	}
++
++	if (fcntl(sockfd, F_SETFL, fcntl(sockfd, F_GETFL) | O_NONBLOCK)) {
++		PMD_DRV_LOG(ERR, "fcntl O_NONBLOCK failed! %s",
++			    strerror(errno));
++		return -1;
++	}
++
++	if (req_mq && setsockopt(sockfd, SOL_PACKET, PACKET_FANOUT,
++				 (void *)&fanout_type, sizeof(fanout_type))) {
++		PMD_DRV_LOG(ERR, "PACKET_FANOUT failed! %s", strerror(errno));
++		return -1;
++	}
++
++	return sockfd;
++}
++
++/* Open a packet socket bound to ifname (set UP and PROMISC), copy its MAC into
++ * mac and configure it via vhost_kernel_set_sock(). Returns the fd or -1. */
++int
++vhost_kernel_open_sock(char *ifname, int hdr_size, uint8_t *mac, int req_mq)
++{
++	int sockfd;
++	struct ifreq ifr;
++	struct sockaddr_ll addr_ll;
++
++	sockfd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
++	if (sockfd < 0) {
++		PMD_DRV_LOG(ERR, "socket failed: %s", strerror(errno));
++		return -1;
++	}
++
++	memset(&ifr, 0, sizeof(ifr));
++	strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
++	if (ioctl(sockfd, SIOCGIFINDEX, (void*)&ifr)) {
++		PMD_DRV_LOG(ERR, "SIOCGIFINDEX failed: %s", strerror(errno));
++		goto fail;
++	}
++
++	memset(&addr_ll, 0, sizeof(addr_ll));
++	addr_ll.sll_ifindex = ifr.ifr_ifindex;
++	addr_ll.sll_family = AF_PACKET;
++	addr_ll.sll_protocol = htons(ETH_P_ALL);
++	if (bind(sockfd, (struct sockaddr*)&addr_ll, sizeof(addr_ll))) {
++		PMD_DRV_LOG(ERR, "bind failed: %s", strerror(errno));
++		goto fail;
++	}
++
++	/* ifr_flags shares the ifreq union with ifr_ifindex: re-read it first */
++	if (ioctl(sockfd, SIOCGIFFLAGS, &ifr)) {
++		PMD_DRV_LOG(ERR, "SIOCGIFFLAGS failed: %s", strerror(errno));
++		goto fail;
++	}
++	ifr.ifr_flags |= IFF_PROMISC | IFF_UP;
++	if (ioctl(sockfd, SIOCSIFFLAGS, (char*)&ifr)) {
++		PMD_DRV_LOG(ERR, "SIOCSIFFLAGS failed: %s", strerror(errno));
++		goto fail;
++	}
++
++	ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
++	if (ioctl(sockfd, SIOCGIFHWADDR, &ifr) == 0)
++		memcpy(mac, ifr.ifr_hwaddr.sa_data, ETHER_ADDR_LEN);
++
++	if (vhost_kernel_set_sock(sockfd, hdr_size, req_mq) < 0)
++		goto fail;
++	return sockfd;
++
++fail:
++	close(sockfd);
++	return -1;
++}
+diff --git a/drivers/net/virtio/virtio_user/vhost_kernel_tap.c b/drivers/net/virtio/virtio_user/vhost_kernel_tap.c
+index a3faf1d0c..85dd24dd6 100644
+--- a/drivers/net/virtio/virtio_user/vhost_kernel_tap.c
++++ b/drivers/net/virtio/virtio_user/vhost_kernel_tap.c
+@@ -11,13 +11,75 @@
+ #include <errno.h>
+ #include <string.h>
+ #include <limits.h>
++#include <sys/ioctl.h>
+
+ #include <rte_ether.h>
+
+-#include "vhost_kernel_tap.h"
++#include "vhost_kernel.h"
+ #include "../virtio_logs.h"
+ #include "../virtio_pci.h"
+
++/* TUN ioctls */
++#define TUNSETIFF _IOW('T', 202, int)
++#define TUNGETFEATURES _IOR('T', 207, unsigned int)
++#define TUNSETOFFLOAD _IOW('T', 208, unsigned int)
++#define TUNGETIFF _IOR('T', 210, unsigned int)
++#define TUNSETSNDBUF _IOW('T', 212, int)
++#define TUNGETVNETHDRSZ _IOR('T', 215, int)
++#define TUNSETVNETHDRSZ _IOW('T', 216, int)
++#define TUNSETQUEUE _IOW('T', 217, int)
++#define TUNSETVNETLE _IOW('T', 220, int)
++#define TUNSETVNETBE _IOW('T', 222, int)
++
++/* TUNSETIFF ifr flags */
++#define IFF_TAP 0x0002
++#define IFF_NO_PI 0x1000
++#define IFF_ONE_QUEUE 0x2000
++#define IFF_VNET_HDR 0x4000
++#define IFF_MULTI_QUEUE 0x0100
++#define IFF_ATTACH_QUEUE 0x0200
++#define IFF_DETACH_QUEUE 0x0400
++
++/* Features for GSO (TUNSETOFFLOAD). */
++#define TUN_F_CSUM 0x01 /* You can hand me unchecksummed packets. */
++#define TUN_F_TSO4 0x02 /* I can handle TSO for IPv4 packets */
++#define TUN_F_TSO6 0x04 /* I can handle TSO for IPv6 packets */
++#define TUN_F_TSO_ECN 0x08 /* I can handle TSO with ECN bits. */
++#define TUN_F_UFO 0x10 /* I can handle UFO packets */
++
++/* Constants */
++#define PATH_NET_TUN "/dev/net/tun"
++
++/* Report whether /dev/net/tun advertises IFF_VNET_HDR (*vnet_hdr) and
++ * IFF_MULTI_QUEUE (*mq); both stay 0 when the device cannot be queried. */
++void
++tap_support_features(int *vnet_hdr, int *mq)
++{
++	unsigned int feats = 0;
++	int fd, rc;
++
++	*vnet_hdr = 0;
++	*mq = 0;
++
++	fd = open(PATH_NET_TUN, O_RDWR);
++	if (fd < 0) {
++		PMD_DRV_LOG(ERR, "fail to open %s: %s",
++			    PATH_NET_TUN, strerror(errno));
++		return;
++	}
++
++	rc = ioctl(fd, TUNGETFEATURES, &feats);
++	if (rc == -1)
++		PMD_DRV_LOG(ERR, "TUNGETFEATURES failed: %s", strerror(errno));
++	/* The descriptor was only needed for the query. */
++	close(fd);
++	if (rc == -1)
++		return;
++
++	*vnet_hdr = !!(feats & IFF_VNET_HDR);
++	*mq = !!(feats & IFF_MULTI_QUEUE);
++}
++
+ static int
+ vhost_kernel_tap_set_offload(int fd, uint64_t features)
+ {
+diff --git a/drivers/net/virtio/virtio_user/vhost_kernel_tap.h b/drivers/net/virtio/virtio_user/vhost_kernel_tap.h
+deleted file mode 100644
+index e0e95b4f5..000000000
+--- a/drivers/net/virtio/virtio_user/vhost_kernel_tap.h
++++ /dev/null
+@@ -1,39 +0,0 @@
+-/* SPDX-License-Identifier: BSD-3-Clause
+- * Copyright(c) 2016 Intel Corporation
+- */
+-
+-#include <sys/ioctl.h>
+-
+-/* TUN ioctls */
+-#define TUNSETIFF _IOW('T', 202, int)
+-#define TUNGETFEATURES _IOR('T', 207, unsigned int)
+-#define TUNSETOFFLOAD _IOW('T', 208, unsigned int)
+-#define TUNGETIFF _IOR('T', 210, unsigned int)
+-#define TUNSETSNDBUF _IOW('T', 212, int)
+-#define TUNGETVNETHDRSZ _IOR('T', 215, int)
+-#define TUNSETVNETHDRSZ _IOW('T', 216, int)
+-#define TUNSETQUEUE _IOW('T', 217, int)
+-#define TUNSETVNETLE _IOW('T', 220, int)
+-#define TUNSETVNETBE _IOW('T', 222, int)
+-
+-/* TUNSETIFF ifr flags */
+-#define IFF_TAP 0x0002
+-#define IFF_NO_PI 0x1000
+-#define IFF_ONE_QUEUE 0x2000
+-#define IFF_VNET_HDR 0x4000
+-#define IFF_MULTI_QUEUE 0x0100
+-#define IFF_ATTACH_QUEUE 0x0200
+-#define IFF_DETACH_QUEUE 0x0400
+-
+-/* Features for GSO (TUNSETOFFLOAD). */
+-#define TUN_F_CSUM 0x01 /* You can hand me unchecksummed packets. */
+-#define TUN_F_TSO4 0x02 /* I can handle TSO for IPv4 packets */
+-#define TUN_F_TSO6 0x04 /* I can handle TSO for IPv6 packets */
+-#define TUN_F_TSO_ECN 0x08 /* I can handle TSO with ECN bits. */
+-#define TUN_F_UFO 0x10 /* I can handle UFO packets */
+-
+-/* Constants */
+-#define PATH_NET_TUN "/dev/net/tun"
+-
+-int vhost_kernel_open_tap(char **p_ifname, int hdr_size, int req_mq,
+- const char *mac, uint64_t features);
+diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.c b/drivers/net/virtio/virtio_user/virtio_user_dev.c
+index 20816c936..7e655a0d5 100644
+--- a/drivers/net/virtio/virtio_user/virtio_user_dev.c
++++ b/drivers/net/virtio/virtio_user/virtio_user_dev.c
+@@ -294,7 +294,7 @@ virtio_user_fill_intr_handle(struct virtio_user_dev *dev)
+ eth_dev->intr_handle->max_intr = dev->max_queue_pairs + 1;
+ eth_dev->intr_handle->type = RTE_INTR_HANDLE_VDEV;
+ /* For virtio vdev, no need to read counter for clean */
+- eth_dev->intr_handle->efd_counter_size = 0;
++ eth_dev->intr_handle->efd_counter_size = 8;
+ eth_dev->intr_handle->fd = -1;
+ if (dev->vhostfd >= 0)
+ eth_dev->intr_handle->fd = dev->vhostfd;
+@@ -312,7 +312,9 @@ virtio_user_mem_event_cb(enum rte_mem_event type __rte_unused,
+ {
+ struct virtio_user_dev *dev = arg;
+ struct rte_memseg_list *msl;
++#if 0
+ uint16_t i;
++#endif
+
+ /* ignore externally allocated memory */
+ msl = rte_mem_virt2memseg_list(addr);
+@@ -325,15 +327,19 @@ virtio_user_mem_event_cb(enum rte_mem_event type __rte_unused,
+ goto exit;
+
+ /* Step 1: pause the active queues */
++#if 0
+ for (i = 0; i < dev->queue_pairs; i++)
+ dev->ops->enable_qp(dev, i, 0);
++#endif
+
+ /* Step 2: update memory regions */
+ dev->ops->send_request(dev, VHOST_USER_SET_MEM_TABLE, NULL);
+
+ /* Step 3: resume the active queues */
++#if 0
+ for (i = 0; i < dev->queue_pairs; i++)
+ dev->ops->enable_qp(dev, i, 1);
++#endif
+
+ exit:
+ pthread_mutex_unlock(&dev->mutex);
+@@ -412,7 +418,7 @@ virtio_user_dev_setup(struct virtio_user_dev *dev)
+ int
+ virtio_user_dev_init(struct virtio_user_dev *dev, char *path, int queues,
+ int cq, int queue_size, const char *mac, char **ifname,
+- int mrg_rxbuf, int in_order)
++ int mrg_rxbuf, int in_order, int fd)
+ {
+ pthread_mutex_init(&dev->mutex, NULL);
+ snprintf(dev->path, PATH_MAX, "%s", path);
+@@ -435,6 +441,12 @@ virtio_user_dev_init(struct virtio_user_dev *dev, char *path, int queues,
+ return -1;
+ }
+
++ if (fd >= 0) {
++ dev->be_fd = fd;
++ } else {
++ dev->be_fd = -1;
++ }
++
+ if (!dev->is_server) {
+ if (dev->ops->send_request(dev, VHOST_USER_SET_OWNER,
+ NULL) < 0) {
+diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.h b/drivers/net/virtio/virtio_user/virtio_user_dev.h
+index c42ce5d4b..575c21e3b 100644
+--- a/drivers/net/virtio/virtio_user/virtio_user_dev.h
++++ b/drivers/net/virtio/virtio_user/virtio_user_dev.h
+@@ -21,6 +21,7 @@ struct virtio_user_dev {
+ char *ifname;
+ int *vhostfds;
+ int *tapfds;
++ int be_fd;
+
+ /* for both vhost_user and vhost_kernel */
+ int callfds[VIRTIO_MAX_VIRTQUEUES];
+@@ -50,7 +51,7 @@ int virtio_user_start_device(struct virtio_user_dev *dev);
+ int virtio_user_stop_device(struct virtio_user_dev *dev);
+ int virtio_user_dev_init(struct virtio_user_dev *dev, char *path, int queues,
+ int cq, int queue_size, const char *mac, char **ifname,
+- int mrg_rxbuf, int in_order);
++ int mrg_rxbuf, int in_order, int fd);
+ void virtio_user_dev_uninit(struct virtio_user_dev *dev);
+ void virtio_user_handle_cq(struct virtio_user_dev *dev, uint16_t queue_idx);
+ uint8_t virtio_user_handle_mq(struct virtio_user_dev *dev, uint16_t q_pairs);
+diff --git a/drivers/net/virtio/virtio_user_ethdev.c b/drivers/net/virtio/virtio_user_ethdev.c
+index f8791391a..d5e87b24c 100644
+--- a/drivers/net/virtio/virtio_user_ethdev.c
++++ b/drivers/net/virtio/virtio_user_ethdev.c
+@@ -221,8 +221,7 @@ virtio_user_get_features(struct virtio_hw *hw)
+ {
+ struct virtio_user_dev *dev = virtio_user_get_dev(hw);
+
+- /* unmask feature bits defined in vhost user protocol */
+- return dev->device_features & VIRTIO_PMD_SUPPORTED_GUEST_FEATURES;
++ return dev->device_features;
+ }
+
+ static void
+@@ -361,6 +360,8 @@ static const char *valid_args[] = {
+ VIRTIO_USER_ARG_MRG_RXBUF,
+ #define VIRTIO_USER_ARG_IN_ORDER "in_order"
+ VIRTIO_USER_ARG_IN_ORDER,
++#define VIRTIO_USER_ARG_FD "fd"
++ VIRTIO_USER_ARG_FD,
+ NULL
+ };
+
+@@ -464,6 +465,7 @@ virtio_user_pmd_probe(struct rte_vdev_device *dev)
+ uint64_t server_mode = VIRTIO_USER_DEF_SERVER_MODE;
+ uint64_t mrg_rxbuf = 1;
+ uint64_t in_order = 1;
++ uint64_t fd = -1;
+ char *path = NULL;
+ char *ifname = NULL;
+ char *mac_addr = NULL;
+@@ -581,6 +583,15 @@ virtio_user_pmd_probe(struct rte_vdev_device *dev)
+ }
+ }
+
++ if (rte_kvargs_count(kvlist, VIRTIO_USER_ARG_FD) == 1) {
++ if (rte_kvargs_process(kvlist, VIRTIO_USER_ARG_FD,
++ &get_integer_arg, &fd) < 0) {
++ PMD_INIT_LOG(ERR, "error to parse %s",
++ VIRTIO_USER_ARG_FD);
++ goto end;
++ }
++ }
++
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ struct virtio_user_dev *vu_dev;
+
+@@ -598,7 +609,7 @@ virtio_user_pmd_probe(struct rte_vdev_device *dev)
+ vu_dev->is_server = false;
+ if (virtio_user_dev_init(hw->virtio_user_dev, path, queues, cq,
+ queue_size, mac_addr, &ifname, mrg_rxbuf,
+- in_order) < 0) {
++ in_order, fd) < 0) {
+ PMD_INIT_LOG(ERR, "virtio_user_dev_init fails");
+ virtio_user_eth_dev_free(eth_dev);
+ goto end;
+@@ -677,4 +688,5 @@ RTE_PMD_REGISTER_PARAM_STRING(net_virtio_user,
+ "iface=<string> "
+ "server=<0|1> "
+ "mrg_rxbuf=<0|1> "
+- "in_order=<0|1>");
++ "in_order=<0|1> " /* trailing space: adjacent literals are concatenated */
++ "fd=<int>");
+--
+2.17.1
+
diff --git a/dpdk/dpdk-v18.11_patches/0006-mempool-add-dynamic-mempool-support.patch b/dpdk/dpdk-v18.11_patches/0006-mempool-add-dynamic-mempool-support.patch
new file mode 100644
index 0000000..bcc9743
--- /dev/null
+++ b/dpdk/dpdk-v18.11_patches/0006-mempool-add-dynamic-mempool-support.patch
@@ -0,0 +1,247 @@
+From 9d2ddfe6012b37297bc84f6ddcce810232162e5b Mon Sep 17 00:00:00 2001
+From: Jianfeng Tan <henry.tjf@antfin.com>
+Date: Wed, 26 Dec 2018 14:39:24 +0000
+Subject: [PATCH 1/2] mempool: add dynamic mempool support
+
+Signed-off-by: Jianfeng Tan <henry.tjf@antfin.com>
+---
+ drivers/mempool/ring/rte_mempool_ring.c | 26 +++++++----
+ lib/librte_mempool/rte_mempool.c | 27 +++++++++--
+ lib/librte_mempool/rte_mempool.h | 62 ++++++++++++++++++++-----
+ 3 files changed, 92 insertions(+), 23 deletions(-)
+
+diff --git a/drivers/mempool/ring/rte_mempool_ring.c b/drivers/mempool/ring/rte_mempool_ring.c
+index bc123fc52..e8fec9119 100644
+--- a/drivers/mempool/ring/rte_mempool_ring.c
++++ b/drivers/mempool/ring/rte_mempool_ring.c
+@@ -49,30 +49,40 @@ common_ring_get_count(const struct rte_mempool *mp)
+ static int
+ common_ring_alloc(struct rte_mempool *mp)
+ {
++ int n;
+ int rg_flags = 0, ret;
+ char rg_name[RTE_RING_NAMESIZE];
+ struct rte_ring *r;
+
+- ret = snprintf(rg_name, sizeof(rg_name),
+- RTE_MEMPOOL_MZ_FORMAT, mp->name);
+- if (ret < 0 || ret >= (int)sizeof(rg_name)) {
+- rte_errno = ENAMETOOLONG;
+- return -rte_errno;
+- }
+-
+ /* ring flags */
+ if (mp->flags & MEMPOOL_F_SP_PUT)
+ rg_flags |= RING_F_SP_ENQ;
+ if (mp->flags & MEMPOOL_F_SC_GET)
+ rg_flags |= RING_F_SC_DEQ;
+
++ if (mp->flags & MEMPOOL_F_DYNAMIC) {
++ n = RTE_MIN(mp->size, mp->populated_size + mp->dynamic_size);
++
++ ret = snprintf(rg_name, sizeof(rg_name),
++ RTE_MEMPOOL_MZ_FORMAT"_%x", mp->name, n);
++ } else {
++ n = mp->size;
++ ret = snprintf(rg_name, sizeof(rg_name),
++ RTE_MEMPOOL_MZ_FORMAT, mp->name);
++ }
++
++ if (ret < 0 || ret >= (int)sizeof(rg_name)) {
++ rte_errno = ENAMETOOLONG;
++ return -rte_errno;
++ }
++
+ /*
+ * Allocate the ring that will be used to store objects.
+ * Ring functions will return appropriate errors if we are
+ * running as a secondary process etc., so no checks made
+ * in this function for that condition.
+ */
+- r = rte_ring_create(rg_name, rte_align32pow2(mp->size + 1),
++ r = rte_ring_create(rg_name, rte_align32pow2(n + 1),
+ mp->socket_id, rg_flags);
+ if (r == NULL)
+ return -rte_errno;
+diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
+index 683b216f9..70039f6c3 100644
+--- a/lib/librte_mempool/rte_mempool.c
++++ b/lib/librte_mempool/rte_mempool.c
+@@ -152,6 +152,8 @@ mempool_add_elem(struct rte_mempool *mp, __rte_unused void *opaque,
+ hdr->mp = mp;
+ hdr->iova = iova;
+ STAILQ_INSERT_TAIL(&mp->elt_list, hdr, next);
++ if (mp->flags & MEMPOOL_F_DYNAMIC && mp->dyn_obj_cb)
++ mp->dyn_obj_cb(mp, NULL, obj, mp->populated_size);
+ mp->populated_size++;
+
+ #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+@@ -426,9 +428,10 @@ rte_mempool_populate_default(struct rte_mempool *mp)
+ ssize_t mem_size;
+ size_t align, pg_sz, pg_shift;
+ rte_iova_t iova;
+- unsigned mz_id, n;
++ unsigned mz_id, n, avail;
+ int ret;
+ bool no_contig, try_contig, no_pageshift, external;
++ bool dynamic = (mp->flags & MEMPOOL_F_DYNAMIC) ? true : false;
+
+ ret = mempool_ops_alloc_once(mp);
+ if (ret != 0)
+@@ -441,7 +444,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
+ external = ret;
+
+ /* mempool must not be populated */
+- if (mp->nb_mem_chunks != 0)
++ if (mp->nb_mem_chunks != 0 && !dynamic)
+ return -EEXIST;
+
+ no_contig = mp->flags & MEMPOOL_F_NO_IOVA_CONTIG;
+@@ -512,7 +515,16 @@ rte_mempool_populate_default(struct rte_mempool *mp)
+ pg_shift = rte_bsf32(pg_sz);
+ }
+
+- for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
++ n = mp->size;
++ if (dynamic) {
++ n = RTE_MIN(mp->size - mp->populated_size, mp->dynamic_size);
++ if (mp->nb_mem_chunks != 0 && rte_mempool_ops_alloc(mp) != 0)
++ return -ENOMEM;
++ }
++
++ avail = 0;
++ mz_id = mp->nb_mem_chunks;
++ for (; n > 0; mz_id++, n -= ret, avail += ret) {
+ size_t min_chunk_size;
+ unsigned int flags;
+
+@@ -607,9 +619,16 @@ rte_mempool_populate_default(struct rte_mempool *mp)
+ }
+ }
+
+- return mp->size;
++ return avail;
+
+ fail:
++ if (dynamic) {
++ if (avail)
++ return avail;
++
++ return ret;
++ }
++
+ rte_mempool_free_memchunks(mp);
+ return ret;
+ }
+diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
+index 7c9cd9a2f..0886b19f2 100644
+--- a/lib/librte_mempool/rte_mempool.h
++++ b/lib/librte_mempool/rte_mempool.h
+@@ -207,6 +207,16 @@ struct rte_mempool_info {
+ unsigned int contig_block_size;
+ } __rte_cache_aligned;
+
++struct rte_mempool;
++/**
++ * An object callback function for mempool.
++ *
++ * Used by rte_mempool_create() and rte_mempool_obj_iter().
++ */
++typedef void (rte_mempool_obj_cb_t)(struct rte_mempool *mp,
++ void *opaque, void *obj, unsigned obj_idx);
++typedef rte_mempool_obj_cb_t rte_mempool_obj_ctor_t; /* compat */
++
+ /**
+ * The RTE mempool structure.
+ */
+@@ -247,6 +257,8 @@ struct rte_mempool {
+ struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */
+
+ uint32_t populated_size; /**< Number of populated objects. */
++ uint32_t dynamic_size; /**< Number of dynamic populated objects. */
++ rte_mempool_obj_cb_t *dyn_obj_cb; /**< elem cb for dynamic populated objects. */
+ struct rte_mempool_objhdr_list elt_list; /**< List of objects in pool */
+ uint32_t nb_mem_chunks; /**< Number of memory chunks */
+ struct rte_mempool_memhdr_list mem_list; /**< List of memory chunks */
+@@ -264,6 +276,8 @@ struct rte_mempool {
+ #define MEMPOOL_F_POOL_CREATED 0x0010 /**< Internal: pool is created. */
+ #define MEMPOOL_F_NO_IOVA_CONTIG 0x0020 /**< Don't need IOVA contiguous objs. */
+ #define MEMPOOL_F_NO_PHYS_CONTIG MEMPOOL_F_NO_IOVA_CONTIG /* deprecated */
++#define MEMPOOL_F_DYNAMIC 0x0040 /**< Populate objects on demand. */
++#define MEMPOOL_F_DYNAMIC_NOW 0x0080 /**< Internal: being populated now. */
+
+ /**
+ * @internal When debug is enabled, store some statistics.
+@@ -839,15 +853,6 @@ int rte_mempool_register_ops(const struct rte_mempool_ops *ops);
+ rte_mempool_register_ops(&ops); \
+ }
+
+-/**
+- * An object callback function for mempool.
+- *
+- * Used by rte_mempool_create() and rte_mempool_obj_iter().
+- */
+-typedef void (rte_mempool_obj_cb_t)(struct rte_mempool *mp,
+- void *opaque, void *obj, unsigned obj_idx);
+-typedef rte_mempool_obj_cb_t rte_mempool_obj_ctor_t; /* compat */
+-
+ /**
+ * A memory callback function for mempool.
+ *
+@@ -989,6 +994,22 @@ struct rte_mempool *
+ rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
+ unsigned cache_size, unsigned private_data_size,
+ int socket_id, unsigned flags);
++
++static inline void
++rte_mempool_set_dynamic_size(struct rte_mempool *mp, int dynamic_size)
++{
++ mp->flags |= MEMPOOL_F_DYNAMIC;
++ mp->dynamic_size = dynamic_size;
++}
++
++static inline void
++rte_mempool_set_dynamic_cb(struct rte_mempool *mp,
++ rte_mempool_obj_cb_t *dyn_obj_cb)
++{
++ mp->flags |= MEMPOOL_F_DYNAMIC;
++ mp->dyn_obj_cb = dyn_obj_cb;
++}
++
+ /**
+ * Free a mempool
+ *
+@@ -1390,9 +1411,28 @@ __mempool_generic_get(struct rte_mempool *mp, void **obj_table,
+ /* get remaining objects from ring */
+ ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, n);
+
+- if (ret < 0)
++ if (ret < 0) {
++ if (mp->flags & MEMPOOL_F_DYNAMIC &&
++ mp->populated_size < mp->size) {
++ int work;
++
++ work = rte_atomic32_cmpset(&mp->flags,
++ mp->flags & ~MEMPOOL_F_DYNAMIC_NOW,
++ mp->flags | MEMPOOL_F_DYNAMIC_NOW);
++ if (work) {
++ int more;
++
++ more = rte_mempool_populate_default(mp);
++ mp->flags &= ~MEMPOOL_F_DYNAMIC_NOW;
++ if (more > 0)
++ goto ring_dequeue;
++ } else {
++ /* mempool is populating, try again */
++ goto ring_dequeue;
++ }
++ }
+ __MEMPOOL_STAT_ADD(mp, get_fail, n);
+- else
++ } else
+ __MEMPOOL_STAT_ADD(mp, get_success, n);
+
+ return ret;
+--
+2.17.1
+
diff --git a/dpdk/dpdk-v18.11_patches/0007-mbuf-add-dynamic-mbuf-mempool-support.patch b/dpdk/dpdk-v18.11_patches/0007-mbuf-add-dynamic-mbuf-mempool-support.patch
new file mode 100644
index 0000000..8618928
--- /dev/null
+++ b/dpdk/dpdk-v18.11_patches/0007-mbuf-add-dynamic-mbuf-mempool-support.patch
@@ -0,0 +1,305 @@
+From c2a2b8eec349156b31f2faab61cc6063ef3f0c61 Mon Sep 17 00:00:00 2001
+From: Jianfeng Tan <henry.tjf@antfin.com>
+Date: Wed, 26 Dec 2018 14:40:07 +0000
+Subject: [PATCH 2/2] mbuf: add dynamic mbuf mempool support
+
+Signed-off-by: Jianfeng Tan <henry.tjf@antfin.com>
+---
+ examples/Makefile | 1 +
+ examples/dynamic_mbuf_pool/Makefile | 56 ++++++++++++++++
+ examples/dynamic_mbuf_pool/main.c | 92 ++++++++++++++++++++++++++
+ examples/dynamic_mbuf_pool/meson.build | 11 +++
+ lib/librte_mbuf/rte_mbuf.c | 51 ++++++++++++++
+ lib/librte_mbuf/rte_mbuf.h | 5 ++
+ lib/librte_mbuf/rte_mbuf_version.map | 8 ++-
+ 7 files changed, 223 insertions(+), 1 deletion(-)
+ create mode 100644 examples/dynamic_mbuf_pool/Makefile
+ create mode 100644 examples/dynamic_mbuf_pool/main.c
+ create mode 100644 examples/dynamic_mbuf_pool/meson.build
+
+diff --git a/examples/Makefile b/examples/Makefile
+index 33fe0e586..3df9cb7ad 100644
+--- a/examples/Makefile
++++ b/examples/Makefile
+@@ -21,6 +21,7 @@ DIRS-$(CONFIG_RTE_LIBRTE_CRYPTODEV) += fips_validation
+ DIRS-$(CONFIG_RTE_LIBRTE_FLOW_CLASSIFY) += flow_classify
+ DIRS-y += flow_filtering
+ DIRS-y += helloworld
++DIRS-y += dynamic_mbuf_pool
+ DIRS-$(CONFIG_RTE_LIBRTE_PIPELINE) += ip_pipeline
+ ifeq ($(CONFIG_RTE_LIBRTE_LPM),y)
+ DIRS-$(CONFIG_RTE_IP_FRAG) += ip_reassembly
+diff --git a/examples/dynamic_mbuf_pool/Makefile b/examples/dynamic_mbuf_pool/Makefile
+new file mode 100644
+index 000000000..f2761f661
+--- /dev/null
++++ b/examples/dynamic_mbuf_pool/Makefile
+@@ -0,0 +1,56 @@
++# SPDX-License-Identifier: BSD-3-Clause
++# Copyright(c) 2010-2014 Intel Corporation
++
++# binary name
++APP = dynamic_mbuf_pool
++
++# all source are stored in SRCS-y
++SRCS-y := main.c
++
++# Build using pkg-config variables if possible
++$(shell pkg-config --exists libdpdk)
++ifeq ($(.SHELLSTATUS),0)
++
++all: shared
++.PHONY: shared static
++shared: build/$(APP)-shared
++ ln -sf $(APP)-shared build/$(APP)
++static: build/$(APP)-static
++ ln -sf $(APP)-static build/$(APP)
++
++PC_FILE := $(shell pkg-config --path libdpdk)
++CFLAGS += -O3 $(shell pkg-config --cflags libdpdk)
++LDFLAGS_SHARED = $(shell pkg-config --libs libdpdk)
++LDFLAGS_STATIC = -Wl,-Bstatic $(shell pkg-config --static --libs libdpdk)
++
++build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
++ $(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)
++
++build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build
++ $(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)
++
++build:
++ @mkdir -p $@
++
++.PHONY: clean
++clean:
++ rm -f build/$(APP) build/$(APP)-static build/$(APP)-shared
++ rmdir --ignore-fail-on-non-empty build
++
++else
++
++ifeq ($(RTE_SDK),)
++$(error "Please define RTE_SDK environment variable")
++endif
++
++# Default target, can be overridden by command line or environment
++RTE_TARGET ?= x86_64-native-linuxapp-gcc
++
++include $(RTE_SDK)/mk/rte.vars.mk
++
++CFLAGS += -O3
++CFLAGS += $(WERROR_FLAGS)
++
++include $(RTE_SDK)/mk/rte.extapp.mk
++
++endif
+diff --git a/examples/dynamic_mbuf_pool/main.c b/examples/dynamic_mbuf_pool/main.c
+new file mode 100644
+index 000000000..a568d7cec
+--- /dev/null
++++ b/examples/dynamic_mbuf_pool/main.c
+@@ -0,0 +1,92 @@
++/* SPDX-License-Identifier: BSD-3-Clause
++ * Copyright(c) 2010-2014 Intel Corporation
++ */
++
++#include <stdio.h>
++#include <string.h>
++#include <stdint.h>
++#include <errno.h>
++#include <sys/queue.h>
++#include <sys/types.h>
++#include <sys/stat.h>
++#include <fcntl.h>
++#include <unistd.h>
++
++#include <rte_memory.h>
++#include <rte_launch.h>
++#include <rte_eal.h>
++#include <rte_per_lcore.h>
++#include <rte_lcore.h>
++#include <rte_debug.h>
++#include <rte_memory.h>
++#include <rte_mbuf.h>
++#include <rte_memzone.h>
++
++#define HUGE_2M "/sys/kernel/mm/hugepages/hugepages-2048kB/free_hugepages"
++#define HUGE_1G "/sys/kernel/mm/hugepages/hugepages-1048576kB/free_hugepages"
++
++/* Parse the decimal counter stored in the file at path; -1 if unreadable. */
++static long int
++get_value(const char *path)
++{
++	char buf[1024];
++	ssize_t len;
++	int fd;
++
++	fd = open(path, O_RDONLY);
++	if (fd < 0)
++		return -1;
++
++	/* read one byte less than the buffer so it can be terminated */
++	len = read(fd, buf, sizeof(buf) - 1);
++	close(fd);
++	if (len <= 0)
++		return -1;
++
++	/* strtol() needs a NUL-terminated string; read() does not add one */
++	buf[len] = '\0';
++	return strtol(buf, NULL, 10);
++}
++
++static void
++print_free_hugepages(void)
++{
++ printf("2M: %ld\t\t1G: %ld\n", get_value(HUGE_2M), get_value(HUGE_1G));
++}
++
++/* Allocate (and never free) up to n mbufs, logging free hugepages as we go. */
++int
++main(int argc, char **argv)
++{
++	int i, ret;
++	int n = 512 * 1024;
++	int dynamic_size = 8 * 1024;
++	struct rte_mbuf *m;
++	struct rte_mempool *mp;
++
++	ret = rte_eal_init(argc, argv);
++	if (ret < 0)
++		rte_panic("Cannot init EAL\n");
++
++	mp = rte_pktmbuf_dynamic_pool_create("mbuf_pool", n,
++			64, 0, RTE_MBUF_DEFAULT_BUF_SIZE,
++			0, dynamic_size);
++	if (mp == NULL)
++		rte_panic("Failed to create mbuf mempool\n");
++
++	for (i = 0; i < n; i++) {
++		m = rte_pktmbuf_alloc(mp);
++		if (m == NULL)
++			break;
++
++		if ((i % dynamic_size) == 1) {
++			print_free_hugepages();
++			usleep(100 * 1000);
++		}
++	}
++
++	printf("have allocated %d mbufs\n", i);
++	rte_memzone_dump(stdout);
++
++	return 0;
++}
+diff --git a/examples/dynamic_mbuf_pool/meson.build b/examples/dynamic_mbuf_pool/meson.build
+new file mode 100644
+index 000000000..c34e11e36
+--- /dev/null
++++ b/examples/dynamic_mbuf_pool/meson.build
+@@ -0,0 +1,11 @@
++# SPDX-License-Identifier: BSD-3-Clause
++# Copyright(c) 2017 Intel Corporation
++
++# meson file, for building this example as part of a main DPDK build.
++#
++# To build this example as a standalone application with an already-installed
++# DPDK instance, use 'make'
++
++sources = files(
++ 'main.c'
++)
+diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c
+index 9790b4fb1..b70abd88c 100644
+--- a/lib/librte_mbuf/rte_mbuf.c
++++ b/lib/librte_mbuf/rte_mbuf.c
+@@ -167,6 +167,57 @@ rte_pktmbuf_pool_create(const char *name, unsigned int n,
+ data_room_size, socket_id, NULL);
+ }
+
++struct rte_mempool *
++rte_pktmbuf_dynamic_pool_create(const char *name, unsigned int n,
++ unsigned int cache_size, uint16_t priv_size,
++ uint16_t data_room_size, int socket_id, int dynamic_size)
++{
++ struct rte_mempool *mp;
++ struct rte_pktmbuf_pool_private mbp_priv;
++ const char *mp_ops_name;
++ unsigned elt_size;
++ int ret;
++
++ if (RTE_ALIGN(priv_size, RTE_MBUF_PRIV_ALIGN) != priv_size) {
++ RTE_LOG(ERR, MBUF, "mbuf priv_size=%u is not aligned\n",
++ priv_size);
++ rte_errno = EINVAL;
++ return NULL;
++ }
++ elt_size = sizeof(struct rte_mbuf) + (unsigned)priv_size +
++ (unsigned)data_room_size;
++ mbp_priv.mbuf_data_room_size = data_room_size;
++ mbp_priv.mbuf_priv_size = priv_size;
++
++ mp = rte_mempool_create_empty(name, n, elt_size, cache_size,
++ sizeof(struct rte_pktmbuf_pool_private),
++ socket_id, MEMPOOL_F_DYNAMIC);
++ if (mp == NULL)
++ return NULL;
++
++ mp_ops_name = rte_mbuf_best_mempool_ops();
++ ret = rte_mempool_set_ops_byname(mp, mp_ops_name, NULL);
++ if (ret != 0) {
++ RTE_LOG(ERR, MBUF, "error setting mempool handler\n");
++ rte_mempool_free(mp);
++ rte_errno = -ret;
++ return NULL;
++ }
++ rte_pktmbuf_pool_init(mp, &mbp_priv);
++
++ rte_mempool_set_dynamic_size(mp, dynamic_size);
++ rte_mempool_set_dynamic_cb(mp, rte_pktmbuf_init);
++
++ ret = rte_mempool_populate_default(mp);
++ if (ret < 0) {
++ rte_mempool_free(mp);
++ rte_errno = -ret;
++ return NULL;
++ }
++
++ return mp;
++}
++
+ /* do some sanity checks on a mbuf: panic if it fails */
+ void
+ rte_mbuf_sanity_check(const struct rte_mbuf *m, int is_header)
+diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
+index 3dbc6695e..5a2d81605 100644
+--- a/lib/librte_mbuf/rte_mbuf.h
++++ b/lib/librte_mbuf/rte_mbuf.h
+@@ -1183,6 +1183,11 @@ rte_pktmbuf_pool_create(const char *name, unsigned n,
+ unsigned cache_size, uint16_t priv_size, uint16_t data_room_size,
+ int socket_id);
+
++struct rte_mempool *
++rte_pktmbuf_dynamic_pool_create(const char *name, unsigned int n,
++ unsigned int cache_size, uint16_t priv_size,
++ uint16_t data_room_size, int socket_id, int dynamic_size);
++
+ /**
+ * Create a mbuf pool with a given mempool ops name
+ *
+diff --git a/lib/librte_mbuf/rte_mbuf_version.map b/lib/librte_mbuf/rte_mbuf_version.map
+index cae68db8d..d6d25af95 100644
+--- a/lib/librte_mbuf/rte_mbuf_version.map
++++ b/lib/librte_mbuf/rte_mbuf_version.map
+@@ -44,4 +44,10 @@ DPDK_18.08 {
+ rte_mbuf_set_user_mempool_ops;
+ rte_mbuf_user_mempool_ops;
+ rte_pktmbuf_pool_create_by_ops;
+-} DPDK_16.11;
++} DPDK_18.11;
++
++DPDK_18.11 {
++ global:
++
++ rte_pktmbuf_dynamic_pool_create;
++} DPDK_18.12;
+--
+2.17.1
+
diff --git a/dpdk/dpdk-v18.11_patches/0008-mempool-prioritize-constructor.patch b/dpdk/dpdk-v18.11_patches/0008-mempool-prioritize-constructor.patch
new file mode 100644
index 0000000..c941443
--- /dev/null
+++ b/dpdk/dpdk-v18.11_patches/0008-mempool-prioritize-constructor.patch
@@ -0,0 +1,30 @@
+From cd36895a4a7bfc342915b42e3856bd233452f0bd Mon Sep 17 00:00:00 2001
+From: Jianfeng Tan <henry.tjf@antfin.com>
+Date: Fri, 13 Jul 2018 15:25:22 +0800
+Subject: [PATCH 1/9] mempool: prioritize constructor
+
+---
+ lib/librte_mempool/rte_mempool.h | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
+index 7c9cd9a2f..bdc32d583 100644
+--- a/lib/librte_mempool/rte_mempool.h
++++ b/lib/librte_mempool/rte_mempool.h
+@@ -833,10 +833,10 @@ int rte_mempool_register_ops(const struct rte_mempool_ops *ops);
+ * more than RTE_MEMPOOL_MAX_OPS_IDX is registered.
+ */
+ #define MEMPOOL_REGISTER_OPS(ops) \
+- void mp_hdlr_init_##ops(void); \
+- void __attribute__((constructor, used)) mp_hdlr_init_##ops(void)\
++ static void __attribute__((constructor(101), used)) \
++ mp_hdlr_init_##ops(void) \
+ { \
+- rte_mempool_register_ops(&ops); \
++ rte_mempool_register_ops(&ops); \
+ }
+
+ /**
+--
+2.17.1
+
diff --git a/dpdk/dpdk-v18.11_patches/0009-net-virtio-fill-desc-limit.patch b/dpdk/dpdk-v18.11_patches/0009-net-virtio-fill-desc-limit.patch
new file mode 100644
index 0000000..146ea88
--- /dev/null
+++ b/dpdk/dpdk-v18.11_patches/0009-net-virtio-fill-desc-limit.patch
@@ -0,0 +1,42 @@
+commit 470acd1b108f20ae12b1216c9f6157b78655bcc7
+Author: Jianfeng Tan <henry.tjf@antfin.com>
+Date: Wed Dec 12 02:14:03 2018 +0000
+
+ net/virtio: fill desc limit
+
+ We shall fill desc limit accordingly, or APIs, such as
+ rte_eth_dev_adjust_nb_rx_tx_desc, will not give correct desc
+ information.
+
+ Signed-off-by: Jianfeng Tan <henry.tjf@antfin.com>
+
+diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c
+index dbfa6865c..d369d5ce8 100644
+--- a/drivers/net/virtio/virtio_ethdev.c
++++ b/drivers/net/virtio/virtio_ethdev.c
+@@ -2172,6 +2172,7 @@ virtio_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
+ {
+ uint64_t tso_mask, host_features;
+ struct virtio_hw *hw = dev->data->dev_private;
++ struct virtqueue *vq;
+
+ dev_info->speed_capa = ETH_LINK_SPEED_10G; /* fake value */
+
+@@ -2209,6 +2210,17 @@ virtio_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
+ (1ULL << VIRTIO_NET_F_HOST_TSO6);
+ if ((host_features & tso_mask) == tso_mask)
+ dev_info->tx_offload_capa |= DEV_TX_OFFLOAD_TCP_TSO;
++
++
++ if (hw->vqs) {
++ vq = hw->vqs[VTNET_SQ_RQ_QUEUE_IDX];
++ dev_info->rx_desc_lim.nb_max = vq->vq_nentries;
++ dev_info->rx_desc_lim.nb_min = 256;
++
++ vq = hw->vqs[VTNET_SQ_TQ_QUEUE_IDX];
++ dev_info->tx_desc_lim.nb_max = vq->vq_nentries;
++ dev_info->tx_desc_lim.nb_min = 256;
++ }
+ }
+
+ /*
diff --git a/examples/Makefile b/examples/Makefile
index cf13574..9ef8d85 100644
--- a/examples/Makefile
+++ b/examples/Makefile
@@ -21,6 +21,6 @@ endif
include $(RTE_SDK)/mk/rte.vars.mk
-DIRS-y += l4fwd
+#DIRS-y += l4fwd
include $(TLDK_ROOT)/mk/tle.subdir.mk
diff --git a/examples/l4fwd/main.c b/examples/l4fwd/main.c
index 9396403..2e16479 100644
--- a/examples/l4fwd/main.c
+++ b/examples/l4fwd/main.c
@@ -68,7 +68,6 @@ static char proto_name[3][10] = {"udp", "tcp", ""};
static const struct rte_eth_conf port_conf_default = {
.rxmode = {
- .offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
},
};
diff --git a/examples/l4fwd/port.h b/examples/l4fwd/port.h
index a154844..67ca19a 100644
--- a/examples/l4fwd/port.h
+++ b/examples/l4fwd/port.h
@@ -177,21 +177,10 @@ port_init(struct netbe_port *uprt, uint32_t proto)
}
port_conf = port_conf_default;
- if ((uprt->rx_offload & RX_CSUM_OFFLOAD) != 0) {
- RTE_LOG(ERR, USER1, "%s(%u): enabling RX csum offload;\n",
- __func__, uprt->id);
- port_conf.rxmode.offloads |= uprt->rx_offload & RX_CSUM_OFFLOAD;
- }
- port_conf.rxmode.max_rx_pkt_len = uprt->mtu + ETHER_CRC_LEN;
- if (port_conf.rxmode.max_rx_pkt_len > ETHER_MAX_LEN)
- port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_JUMBO_FRAME;
-
rc = update_rss_conf(uprt, &dev_info, &port_conf, proto);
if (rc != 0)
return rc;
- port_conf.txmode.offloads = uprt->tx_offload;
-
rc = rte_eth_dev_configure(uprt->id, uprt->nb_lcore, uprt->nb_lcore,
&port_conf);
RTE_LOG(NOTICE, USER1,
diff --git a/lib/Makefile b/lib/Makefile
index 6317af9..9bbe159 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -25,5 +25,6 @@ DIRS-y += libtle_misc
DIRS-y += libtle_dring
DIRS-y += libtle_timer
DIRS-y += libtle_l4p
+DIRS-y += libtle_glue
include $(TLDK_ROOT)/mk/tle.subdir.mk
diff --git a/lib/libtle_glue/Makefile b/lib/libtle_glue/Makefile
new file mode 100644
index 0000000..13ceb82
--- /dev/null
+++ b/lib/libtle_glue/Makefile
@@ -0,0 +1,62 @@
+# Copyright (c) 2018 Ant Financial Services Group.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+ifeq ($(RTE_SDK),)
+$(error "Please define RTE_SDK environment variable")
+endif
+
+# Default target, can be overwritten by command line or environment
+RTE_TARGET ?= x86_64-native-linuxapp-gcc
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# library name
+LIB = libtle_glue.a
+
+CFLAGS += -O3
+CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR)
+
+EXPORT_MAP := tle_glue_version.map
+
+LIBABIVER := 1
+
+# source files
+SRCS-y += fd.c
+SRCS-y += ctx.c
+SRCS-y += arp.c
+SRCS-y += icmp.c
+SRCS-y += rxcb.c
+SRCS-y += port.c
+SRCS-y += sym.c
+SRCS-y += init.c
+SRCS-y += be.c
+SRCS-y += epoll.c
+SRCS-y += socket.c
+SRCS-y += rxtx.c
+SRCS-y += poll.c
+SRCS-y += util.c
+SRCS-y += tcp.c
+SRCS-y += udp.c
+SRCS-y += select.c
+
+ifeq ($(PACKETDRILL),y)
+SRCS-y += packetdrill.c
+endif
+
+# install this header file
+SYMLINK-y-include += tle_glue.h
+
+# this lib dependencies
+DEPDIRS-y += lib/libtle_l4p
+
+include $(TLDK_ROOT)/mk/tle.lib.mk
diff --git a/lib/libtle_glue/arp.c b/lib/libtle_glue/arp.c
new file mode 100644
index 0000000..9b13d9e
--- /dev/null
+++ b/lib/libtle_glue/arp.c
@@ -0,0 +1,935 @@
+/*
+ * Copyright (c) 2019 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/icmp6.h>
+
+#include <rte_ethdev.h>
+#include <rte_arp.h>
+#include <rte_ip.h>
+#include <rte_hash.h>
+#include <rte_byteorder.h>
+
+#include "log.h"
+#include "ctx.h"
+#include "internal.h"
+#include "tle_timer.h"
+#include "util.h"
+#include "ndp.h"
+#include "gateway.h"
+
+/* Number of leading bytes of ipv6_multi_mask copied by set_multicast_ipv6(). */
+#define IPV6_MULTI_MASK_LEN 13
+
+/* ff02::1 -- ndp_recv() compares the destination of received NA messages against it. */
+const struct in6_addr ipv6_all_multi = {{{
+ 0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01
+}}};
+
+/*
+ * ff02:: prefix whose first IPV6_MULTI_MASK_LEN bytes overwrite the start of
+ * the NS destination address in arp6_send_request().
+ * NOTE(review): RFC 4291 defines the solicited-node prefix as
+ * ff02::1:ff00:0/104 (bytes 11 and 12 would be 0x01 and 0xff); here they are
+ * zero -- confirm whether this is intentional.
+ */
+const struct in6_addr ipv6_multi_mask = {{{
+ 0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+}}};
+
+/*
+ * Fill addr with the Ethernet multicast address 33:33:xx:xx:xx:xx, where
+ * the four trailing bytes are the last four bytes of ip6_addr.
+ */
+static inline void
+set_multicast_mac_v6(struct ether_addr *addr, const struct in6_addr *ip6_addr)
+{
+	uint8_t *mac = addr->addr_bytes;
+
+	mac[0] = 0x33;
+	mac[1] = 0x33;
+	rte_memcpy(&mac[2], &ip6_addr->s6_addr[12], 4);
+}
+
+/*
+ * Overwrite the first IPV6_MULTI_MASK_LEN bytes of ipv6 with the matching
+ * bytes of ipv6_multi_mask; the remaining bytes are left untouched.
+ */
+static inline void
+set_multicast_ipv6(uint8_t ipv6[16])
+{
+	uint32_t k;
+
+	for (k = 0; k < IPV6_MULTI_MASK_LEN; k++)
+		ipv6[k] = ipv6_multi_mask.s6_addr[k];
+}
+
+/* Set addr to the Ethernet broadcast address ff:ff:ff:ff:ff:ff. */
+static inline void
+set_broadcast_addr(struct ether_addr *addr)
+{
+	memset(addr->addr_bytes, 0xFF, sizeof(addr->addr_bytes));
+}
+
+/*
+ * Tell whether pkt is an IPv4 packet whose next hop, as returned by
+ * ipv4_gateway_lookup() for its destination address, equals addr.
+ */
+static inline bool
+match_addr(struct glue_ctx *ctx, struct rte_mbuf *pkt, const struct in_addr *addr)
+{
+	struct ipv4_hdr *hdr;
+	const struct in_addr *next_hop;
+
+	hdr = rte_pktmbuf_mtod_offset(pkt, struct ipv4_hdr *, pkt->l2_len);
+	if ((hdr->version_ihl >> 4) != 4)
+		return false;
+
+	next_hop = ipv4_gateway_lookup(ctx, (struct in_addr *)&hdr->dst_addr);
+	return next_hop->s_addr == addr->s_addr;
+}
+
+/*
+ * Tell whether pkt is an IPv6 packet whose next hop, as returned by
+ * ipv6_gateway_lookup() for its destination address, equals addr.
+ *
+ * The IP version is the high nibble of the first header byte. The previous
+ * test masked that byte away (vtc_flow & 0xffffff00), so it could never
+ * equal 6 and no IPv6 packet ever matched.
+ */
+static inline bool
+match_addr6(struct glue_ctx *ctx, struct rte_mbuf *pkt,
+	    const struct in6_addr *addr)
+{
+	struct ipv6_hdr *ip6h;
+	const struct in6_addr *gw;
+
+	ip6h = rte_pktmbuf_mtod_offset(pkt, struct ipv6_hdr *, pkt->l2_len);
+	if ((*(const uint8_t *)ip6h >> 4) != 6)
+		return false;
+
+	gw = ipv6_gateway_lookup(ctx, (struct in6_addr *)&ip6h->dst_addr);
+	if (memcmp(gw, addr, sizeof(struct in6_addr)) != 0)
+		return false;
+
+	return true;
+}
+
+/*
+ * Transmit nb packets on the port/queue of ctx and free every packet the
+ * driver did not accept. prefix is only used for the trace message.
+ */
+static inline void
+send_pkts(struct glue_ctx *ctx, struct rte_mbuf **pkts, uint16_t nb,
+	  const char *prefix)
+{
+	uint16_t nb_tx, idx;
+
+	nb_tx = rte_eth_tx_burst(ctx->port_id, ctx->queue_id, pkts, nb);
+
+	idx = nb_tx;
+	while (idx < nb) {
+		rte_pktmbuf_free(pkts[idx]);
+		idx++;
+	}
+
+	RTE_SET_USED(prefix);
+	TRACE("%s, send %u/%u pkts", prefix, nb_tx, nb);
+}
+
+/*
+ * Send every packet parked on ctx->arp_wait whose next hop is addr, now that
+ * its MAC (e_addr) is known. Matching packets are unlinked from the list,
+ * get e_addr written as destination MAC and are transmitted in bursts of at
+ * most MAX_PKTS_BURST.
+ *
+ * af selects the matcher (AF_INET -> match_addr, AF_INET6 -> match_addr6).
+ */
+static void
+flush_arp_wait(int af, struct glue_ctx *ctx, const void *addr,
+	       struct ether_addr *e_addr)
+{
+	struct rte_mbuf *pkt, *next, *pre, *pkts[MAX_PKTS_BURST];
+	struct ether_hdr *eth;
+	uint32_t nb_pkts;
+
+	pre = NULL;
+	nb_pkts = 0;
+	for (pkt = ctx->arp_wait; pkt != NULL; pkt = next) {
+		/*
+		 * Fetch the link before anything else: once the burst is
+		 * full, pkt is handed to send_pkts() and must not be
+		 * dereferenced again.
+		 */
+		next = pkt->next_pkt;
+
+		if ((af == AF_INET &&
+		     !match_addr(ctx, pkt, (const struct in_addr *)addr)) ||
+		    (af == AF_INET6 &&
+		     !match_addr6(ctx, pkt, (const struct in6_addr *)addr))) {
+			pre = pkt;
+			continue;
+		}
+
+		/* unlink pkt; pre stays the last packet kept on the list */
+		if (pre == NULL)
+			ctx->arp_wait = next;
+		else
+			pre->next_pkt = next;
+
+		eth = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+		ether_addr_copy(e_addr, &eth->d_addr);
+		pkts[nb_pkts++] = pkt;
+		if (nb_pkts == MAX_PKTS_BURST) {
+			send_pkts(ctx, pkts, nb_pkts, "ARP learned");
+			nb_pkts = 0;
+		}
+	}
+	if (nb_pkts)
+		send_pkts(ctx, pkts, nb_pkts, "ARP learned");
+}
+
+/*
+ * Fill dst with the L2/L3 header template used to reach addr: Ethernet
+ * header (source = ctx->mac, destination = e_addr or broadcast when e_addr
+ * is NULL) followed by an IPv4 header with destination, version/IHL, TTL 64
+ * and protocol TCP preset. MTU depends on whether addr is a loopback address.
+ */
+static inline void
+ipv4_dst_set(struct glue_ctx *ctx, struct tle_dest *dst,
+	     const struct in_addr *addr, struct ether_addr *e_addr)
+{
+	struct ether_hdr *l2 = (struct ether_hdr *)dst->hdr;
+	struct ipv4_hdr *l3 = (struct ipv4_hdr *)(l2 + 1);
+
+	dst->mtu = is_ipv4_loopback_addr(addr->s_addr, ctx) ?
+		MTU_LOOPBACK : MTU_NORMAL;
+	dst->l2_len = sizeof(*l2);
+	dst->l3_len = sizeof(*l3);
+	dst->head_mp = get_mempool_by_socket(0); /* fix me */
+
+	/* Ethernet part */
+	ether_addr_copy(&ctx->mac, &l2->s_addr);
+	if (e_addr != NULL)
+		ether_addr_copy(e_addr, &l2->d_addr);
+	else
+		set_broadcast_addr(&l2->d_addr);
+	l2->ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4);
+
+	/* IPv4 part */
+	l3->dst_addr = addr->s_addr;
+	l3->version_ihl = (4 << 4) | (sizeof(*l3) / IPV4_IHL_MULTIPLIER);
+	l3->time_to_live = 64;
+	l3->next_proto_id = IPPROTO_TCP;
+}
+
+/*
+ * Fill dst with the L2/L3 header template used to reach addr: Ethernet
+ * header (source = ctx->mac, destination = e_addr or broadcast when e_addr
+ * is NULL) followed by an IPv6 header with destination, version, hop limit
+ * 255 and next header TCP preset. MTU depends on whether addr is loopback.
+ */
+static inline void
+ipv6_dst_set(struct glue_ctx *ctx, struct tle_dest *dst,
+	     const struct in6_addr *addr, struct ether_addr *e_addr)
+{
+	struct ether_hdr *l2 = (struct ether_hdr *)dst->hdr;
+	struct ipv6_hdr *l3 = (struct ipv6_hdr *)(l2 + 1);
+
+	dst->mtu = is_ipv6_loopback_addr(addr, ctx) ?
+		MTU_LOOPBACK : MTU_NORMAL;
+	dst->l2_len = sizeof(*l2);
+	dst->l3_len = sizeof(*l3);
+	dst->head_mp = get_mempool_by_socket(0); /* fix me */
+
+	/* Ethernet part */
+	ether_addr_copy(&ctx->mac, &l2->s_addr);
+	if (e_addr != NULL)
+		ether_addr_copy(e_addr, &l2->d_addr);
+	else
+		set_broadcast_addr(&l2->d_addr);
+	l2->ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv6);
+
+	/* IPv6 part */
+	rte_memcpy(l3->dst_addr, addr, sizeof(struct in6_addr));
+	l3->vtc_flow = 6 << 4;
+	l3->hop_limits = 255;
+	l3->proto = IPPROTO_TCP;
+}
+
+/*
+ * Start a timer for entry on the ARP timer wheel of ctx and evaluate to the
+ * handle returned by tle_timer_start(). Arguments are parenthesised so that
+ * arbitrary expressions can be passed safely.
+ */
+#define arp_timer(ctx, entry, interval) \
+	tle_timer_start((ctx)->arp_tmw, (entry), (interval))
+
+/*
+ * Add or refresh the IPv4 neighbour entry for addr in ctx.
+ *
+ * When addr is already present in ctx->arp_hash, the stored destination MAC
+ * is overwritten with e_addr, the entry timer is restarted with
+ * ARP_ENTRY_EXPIRE, and, if the previous MAC was the broadcast address (the
+ * entry was still unresolved), the packets parked on ctx->arp_wait for this
+ * address are flushed.
+ *
+ * Otherwise a new entry is appended at index ctx->arp4_num. A NULL e_addr
+ * creates an unresolved entry (broadcast MAC, ARP_REQUEST_EXPIRE timer,
+ * req_time = 1); a non-NULL e_addr creates a resolved one.
+ * Panics when the hash insertion fails.
+ */
+void
+ipv4_dst_add(struct glue_ctx *ctx, const struct in_addr *addr,
+ struct ether_addr *e_addr)
+{
+ struct arp_entry *entry;
+ struct tle_dest *dst;
+ struct ether_hdr *eth;
+ uint64_t idx;
+ bool check_wait;
+ int rc;
+
+ rc = rte_hash_lookup_data(ctx->arp_hash, addr, (void**)&idx);
+ if (rc >= 0) {
+ entry = &ctx->arp4[idx];
+ dst = &entry->dst;
+ eth = (struct ether_hdr *)dst->hdr;
+ /* a broadcast MAC marks an entry that was still waiting for a reply */
+ check_wait = is_broadcast_ether_addr(&eth->d_addr);
+
+ /* update arp entry, reset timer */
+ /* NOTE(review): e_addr is dereferenced here; this path assumes callers
+  * never pass NULL for an address that already has an entry -- confirm */
+ ether_addr_copy(e_addr, &eth->d_addr);
+ print_arp(AF_INET, addr, &eth->d_addr, "UPDATE");
+ if(entry->timer != NULL)
+ tle_timer_stop(ctx->arp_tmw, entry->timer);
+ entry->timer = arp_timer(ctx, entry, ARP_ENTRY_EXPIRE);
+ entry->inuse = 0;
+ entry->req_time = 0;
+
+ if(check_wait)
+ flush_arp_wait(AF_INET, ctx, addr, e_addr);
+
+ return;
+ }
+
+ /* NOTE(review): no check of arp4_num against the capacity of ctx->arp4
+  * is visible in this function -- confirm it is bounded elsewhere */
+ idx = ctx->arp4_num;
+ entry = &ctx->arp4[idx];
+ dst = &entry->dst;
+
+ ipv4_dst_set(ctx, dst, addr, e_addr);
+ if (e_addr == NULL) {
+ entry->timer = arp_timer(ctx, entry, ARP_REQUEST_EXPIRE);
+ entry->req_time = 1;
+ } else {
+ entry->timer = arp_timer(ctx, entry, ARP_ENTRY_EXPIRE);
+ entry->inuse = 0;
+ }
+
+ /* the hash maps the address to the index of its entry in ctx->arp4 */
+ rc = rte_hash_add_key_data(ctx->arp_hash, addr, (void *)idx);
+ if (rc < 0)
+ rte_panic("Failed to add ARP entry");
+
+ ctx->arp4_num++;
+ eth = (struct ether_hdr *)dst->hdr;
+ print_arp(AF_INET, addr, &eth->d_addr, "ADD");
+}
+
+/*
+ * Add or refresh the IPv6 neighbour entry for addr in ctx.
+ *
+ * When addr is already present in ctx->arp6_hash, the stored destination MAC
+ * is overwritten with e_addr, the entry timer is restarted with
+ * ARP_ENTRY_EXPIRE, and, if the previous MAC was the broadcast address (the
+ * entry was still unresolved), the packets parked on ctx->arp_wait for this
+ * address are flushed.
+ *
+ * Otherwise a new entry is appended at index ctx->arp6_num. A NULL e_addr
+ * creates an unresolved entry (broadcast MAC, ARP_REQUEST_EXPIRE timer,
+ * req_time = 1); a non-NULL e_addr creates a resolved one.
+ * Panics when the hash insertion fails.
+ */
+void
+ipv6_dst_add(struct glue_ctx *ctx, const struct in6_addr *addr,
+ struct ether_addr *e_addr)
+{
+ struct arp_entry* entry;
+ struct tle_dest *dst;
+ struct ether_hdr *eth;
+ uint64_t idx;
+ bool check_wait;
+ int rc;
+
+ rc = rte_hash_lookup_data(ctx->arp6_hash, addr, (void**)&idx);
+ if (rc >= 0) {
+ entry = &ctx->arp6[idx];
+ dst = &entry->dst;
+ eth = (struct ether_hdr *)dst->hdr;
+ /* a broadcast MAC marks an entry that was still waiting for a reply */
+ check_wait = is_broadcast_ether_addr(&eth->d_addr);
+
+ /* update arp entry, reset timer */
+ /* NOTE(review): e_addr is dereferenced here; this path assumes callers
+  * never pass NULL for an address that already has an entry -- confirm */
+ ether_addr_copy(e_addr, &eth->d_addr);
+ print_arp(AF_INET6, addr, &eth->d_addr, "UPDATE");
+ if(entry->timer != NULL)
+ tle_timer_stop(ctx->arp_tmw, entry->timer);
+ entry->timer = arp_timer(ctx, entry, ARP_ENTRY_EXPIRE);
+ entry->inuse = 0;
+ entry->req_time = 0;
+
+ if(check_wait)
+ flush_arp_wait(AF_INET6, ctx, addr, e_addr);
+
+ return;
+ }
+
+ /* NOTE(review): no check of arp6_num against the capacity of ctx->arp6
+  * is visible in this function -- confirm it is bounded elsewhere */
+ idx = ctx->arp6_num;
+ entry = &ctx->arp6[idx];
+ dst = &entry->dst;
+
+ ipv6_dst_set(ctx, dst, addr, e_addr);
+ if (e_addr == NULL) {
+ entry->timer = arp_timer(ctx, entry, ARP_REQUEST_EXPIRE);
+ entry->req_time = 1;
+ } else {
+ entry->timer = arp_timer(ctx, entry, ARP_ENTRY_EXPIRE);
+ entry->inuse = 0;
+ }
+
+ /* the hash maps the address to the index of its entry in ctx->arp6 */
+ rc = rte_hash_add_key_data(ctx->arp6_hash, addr, (void *)idx);
+ if (rc < 0)
+ rte_panic("Failed to add ARP6 entry");
+
+ eth = (struct ether_hdr *)dst->hdr;
+ print_arp(AF_INET6, addr, &eth->d_addr, "ADD");
+ ctx->arp6_num++;
+}
+
+/* Return 1 when ip is a key of hash table h, 0 otherwise. */
+static inline int
+arp_ip_exist(const struct rte_hash *h, const void *ip)
+{
+	if (rte_hash_lookup(h, ip) < 0)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Handle a received ICMPv6 neighbour discovery message.
+ *
+ * m      received packet; it is always consumed (sent back as a reply or
+ *        freed), so the function always returns NULL
+ * l2len  length of the Ethernet header
+ * l3len  length of the IPv6 header
+ *
+ * A Neighbor Solicitation whose target is ctx->ipv6 teaches us the sender's
+ * MAC and is turned in place into a Neighbor Advertisement reply. Any other
+ * message is treated as a Neighbor Advertisement and may update the
+ * neighbour table.
+ */
+struct rte_mbuf *
+ndp_recv(struct glue_ctx *ctx, struct rte_mbuf *m,
+	 uint32_t l2len, uint32_t l3len)
+{
+	struct ether_hdr *eth_h;
+	struct ipv6_hdr *ipv6_h;
+	struct nd_neighbor_solicit *ns_h;
+	struct nd_opt_hdr *opth;
+	struct ether_addr *lladdr;
+	uint16_t payload_len;
+
+	eth_h = rte_pktmbuf_mtod(m, struct ether_hdr *);
+	ipv6_h = rte_pktmbuf_mtod_offset(m, struct ipv6_hdr *, l2len);
+	ns_h = rte_pktmbuf_mtod_offset(m, struct nd_neighbor_solicit *,
+				       l2len + l3len);
+
+	/*
+	 * payload_len travels in network byte order. Both branches below read
+	 * a link-layer address option located right behind the fixed NS/NA
+	 * header, so the payload must be long enough to contain it.
+	 */
+	payload_len = rte_be_to_cpu_16(ipv6_h->payload_len);
+	opth = (struct nd_opt_hdr *)(ns_h + 1);
+	lladdr = (struct ether_addr *)(opth + 1);
+	if (payload_len < sizeof(*ns_h) + sizeof(*opth) + sizeof(*lladdr))
+		goto drop;
+
+	/* We only learn mac when:
+	 * 1. Normal NS for my ip, whose TargetAddr is me
+	 * 2. Normal NA to my ip, whose DstIpv6 is me
+	 * 3. Unsolicited NA, and we already have an entry for that IP
+	 */
+
+	/* NS message */
+	if (ns_h->nd_ns_hdr.icmp6_type == ND_NEIGHBOR_SOLICIT) {
+		/* Duplicate Address Detection NS is not supported yet */
+		if (IN6_IS_ADDR_UNSPECIFIED(ipv6_h->src_addr))
+			goto drop;
+
+		if (memcmp(&ns_h->nd_ns_target, &ctx->ipv6, sizeof(ctx->ipv6)))
+			goto drop;
+
+		/* NS message, target is my ipv6 addr: learn the sender */
+		ipv6_dst_add(ctx, (struct in6_addr *)ipv6_h->src_addr, lladdr);
+
+		/* rewrite the request in place into the NA reply */
+		ether_addr_copy(&ctx->mac, &eth_h->s_addr);
+		ether_addr_copy(lladdr, &eth_h->d_addr);
+
+		rte_memcpy(ipv6_h->dst_addr, ipv6_h->src_addr,
+			   sizeof(struct in6_addr));
+		rte_memcpy(ipv6_h->src_addr, &ctx->ipv6,
+			   sizeof(struct in6_addr));
+
+		ns_h->nd_ns_hdr.icmp6_type = ND_NEIGHBOR_ADVERT;
+		ns_h->nd_ns_hdr.icmp6_dataun.icmp6_un_data8[0] = 0x60;
+		ns_h->nd_ns_hdr.icmp6_cksum = 0;
+
+		opth->nd_opt_type = ND_OPT_TARGET_LINKLAYER_ADDR;
+		ether_addr_copy(&ctx->mac, lladdr);
+
+		/* checksum is computed last, over the final message content */
+		ns_h->nd_ns_hdr.icmp6_cksum = rte_ipv6_udptcp_cksum(ipv6_h, ns_h);
+
+		if (m->pkt_len < ETHER_MIN_LEN)
+			rte_pktmbuf_append(m, ETHER_MIN_LEN - m->pkt_len);
+
+		send_pkts(ctx, &m, 1, "NDP NA reply");
+		return NULL;
+	}
+
+	/* NA message */
+	if (memcmp(ipv6_h->dst_addr, &ctx->ipv6, sizeof(ctx->ipv6)) == 0 ||
+	    (memcmp(ipv6_h->dst_addr, &ipv6_all_multi, sizeof(ctx->ipv6)) == 0 &&
+	     arp_ip_exist(ctx->arp6_hash, &ns_h->nd_ns_target)))
+		ipv6_dst_add(ctx, &ns_h->nd_ns_target, lladdr);
+
+drop:
+	rte_pktmbuf_free(m);
+	return NULL;
+}
+
+/*
+ * Handle a received ARP packet.
+ *
+ * m      received packet; it is always consumed (sent back as a reply or
+ *        freed), so the function always returns NULL
+ * l2len  offset of the ARP header inside the packet
+ *
+ * Only Ethernet/IPv4 ARP is processed. The sender's MAC is learned when the
+ * target IP is ctx->ipv4, or when the target IP is INADDR_ANY and the sender
+ * IP already has an entry. A request for ctx->ipv4 is rewritten in place
+ * into a reply and transmitted.
+ * NOTE(review): no packet-length check is visible before the ARP header is
+ * read -- confirm the caller validates it.
+ */
+struct rte_mbuf *
+arp_recv(struct glue_ctx *ctx, struct rte_mbuf *m, uint32_t l2len)
+{
+ struct ether_hdr *eth;
+ struct arp_hdr *ahdr;
+ struct arp_ipv4 *adata;
+ uint32_t tip;
+
+ eth = rte_pktmbuf_mtod(m, struct ether_hdr *);
+ ahdr = rte_pktmbuf_mtod_offset(m, struct arp_hdr *, l2len);
+
+ /* constants are byte-swapped to compare against the network-order fields */
+ if (ahdr->arp_hrd != rte_be_to_cpu_16(ARP_HRD_ETHER) ||
+ ahdr->arp_pro != rte_be_to_cpu_16(ETHER_TYPE_IPv4))
+ goto drop;
+
+ adata = &ahdr->arp_data;
+ tip = adata->arp_tip;
+
+ /* We only learn mac when:
+ * 1. tip is me, or
+ * 2. this is a RARP, and we already have an entry for that IP
+ */
+ if (tip == ctx->ipv4 ||
+ (tip == INADDR_ANY && arp_ip_exist(ctx->arp_hash, &adata->arp_sip)))
+ ipv4_dst_add(ctx, (struct in_addr *)&adata->arp_sip,
+ &adata->arp_sha);
+
+ /* We only do ARP reply when:
+ * 1. tip is me.
+ */
+ if (ahdr->arp_op == rte_be_to_cpu_16(ARP_OP_REQUEST) &&
+ tip == ctx->ipv4) {
+ /* turn the request around: swap L2 addresses and ARP sender/target */
+ eth->d_addr = eth->s_addr;
+ eth->s_addr = ctx->mac;
+ ahdr->arp_op = rte_cpu_to_be_16(ARP_OP_REPLY);
+
+ adata->arp_tip = adata->arp_sip;
+ adata->arp_sip = tip;
+
+ adata->arp_tha = adata->arp_sha;
+ adata->arp_sha = ctx->mac;
+ /* pad short frames up to ETHER_MIN_LEN */
+ if (m->pkt_len < ETHER_MIN_LEN)
+ rte_pktmbuf_append(m, ETHER_MIN_LEN - m->pkt_len);
+ send_pkts(ctx, &m, 1, "ARP reply");
+ return NULL;
+ }
+drop:
+ rte_pktmbuf_free(m);
+ return NULL;
+}
+
+/*
+ * Build and transmit a Neighbor Solicitation for addr.
+ *
+ * The frame is: Ethernet header (multicast MAC derived from addr), IPv6
+ * header (destination = addr with its leading bytes replaced by
+ * set_multicast_ipv6()), NS header targeting addr, and a source link-layer
+ * address option carrying ctx->mac. Panics when no mbuf can be allocated.
+ */
+static void
+arp6_send_request(struct glue_ctx *ctx, const struct in6_addr *addr)
+{
+	struct rte_mempool *mp = get_mempool_by_socket(0); /* fix me */
+	struct ether_hdr *eth;
+	struct ipv6_hdr *ip6h;
+	struct nd_neighbor_solicit *nsh;
+	struct nd_opt_hdr *opth;
+	struct ether_addr *sll_addr;
+	struct rte_mbuf *m;
+
+	m = rte_pktmbuf_alloc(mp);
+	if (m == NULL)
+		rte_panic("Failed to alloc mbuf for ndp ns request");
+
+	eth = (struct ether_hdr *)rte_pktmbuf_append(m, sizeof(*eth));
+	ether_addr_copy(&ctx->mac, &eth->s_addr);
+	set_multicast_mac_v6(&eth->d_addr, addr);
+	eth->ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv6);
+
+	ip6h = (struct ipv6_hdr *)rte_pktmbuf_append(m, sizeof(*ip6h));
+	ip6h->vtc_flow = 6 << 4;
+	/*
+	 * payload_len must be in network byte order: it goes on the wire as
+	 * is and rte_ipv6_udptcp_cksum() below derives the number of bytes
+	 * to checksum from it.
+	 */
+	ip6h->payload_len = rte_cpu_to_be_16(sizeof(struct nd_neighbor_solicit) +
+					     sizeof(struct nd_opt_hdr) +
+					     sizeof(struct ether_addr));
+	ip6h->proto = IPPROTO_ICMPV6;
+	ip6h->hop_limits = 255;
+	rte_memcpy(ip6h->src_addr, &ctx->ipv6, sizeof(struct in6_addr));
+	rte_memcpy(ip6h->dst_addr, addr, sizeof(struct in6_addr));
+	set_multicast_ipv6(ip6h->dst_addr);
+
+	nsh = (struct nd_neighbor_solicit *)rte_pktmbuf_append(m, sizeof(*nsh));
+	nsh->nd_ns_hdr.icmp6_type = ND_NEIGHBOR_SOLICIT;
+	nsh->nd_ns_hdr.icmp6_code = 0;
+	nsh->nd_ns_hdr.icmp6_cksum = 0;
+	nsh->nd_ns_hdr.icmp6_dataun.icmp6_un_data32[0] = 0;
+	rte_memcpy(&nsh->nd_ns_target, addr, sizeof(struct in6_addr));
+
+	opth = (struct nd_opt_hdr *)rte_pktmbuf_append(m, sizeof(*opth));
+	opth->nd_opt_type = ND_OPT_SOURCE_LINKLAYER_ADDR;
+	opth->nd_opt_len = 1; /* option length in units of 8 bytes */
+
+	sll_addr = (struct ether_addr *)rte_pktmbuf_append(m, sizeof(*sll_addr));
+	ether_addr_copy(&ctx->mac, sll_addr);
+
+	/* computed last, once the whole ICMPv6 message is in place */
+	nsh->nd_ns_hdr.icmp6_cksum = rte_ipv6_udptcp_cksum(ip6h, nsh);
+
+	send_pkts(ctx, &m, 1, "ARP6 request");
+}
+
+/*
+ * Build and broadcast an ARP request asking for the MAC of addr.
+ * The frame is zero-padded up to ETHER_MIN_LEN. Panics when no mbuf can be
+ * allocated.
+ */
+static void
+arp_send_request(struct glue_ctx *ctx, const struct in_addr *addr)
+{
+	struct rte_mempool *mp = get_mempool_by_socket(0); /* fix me */
+	struct ether_hdr *eth;
+	struct arp_hdr *ahdr;
+	struct arp_ipv4 *adata;
+	struct rte_mbuf *m;
+	uint16_t pad_len;
+	char *pad;
+
+	m = rte_pktmbuf_alloc(mp);
+	if (m == NULL)
+		rte_panic("Failed to alloc mbuf for arp request");
+
+	eth = (struct ether_hdr *)rte_pktmbuf_append(m, sizeof(*eth));
+	ether_addr_copy(&ctx->mac, &eth->s_addr);
+	set_broadcast_addr(&eth->d_addr);
+	eth->ether_type = rte_cpu_to_be_16(ETHER_TYPE_ARP);
+
+	/* header fields are written host -> network order */
+	ahdr = (struct arp_hdr *)rte_pktmbuf_append(m, sizeof(*ahdr));
+	ahdr->arp_hrd = rte_cpu_to_be_16(ARP_HRD_ETHER);
+	ahdr->arp_pro = rte_cpu_to_be_16(ETHER_TYPE_IPv4);
+	ahdr->arp_hln = sizeof(struct ether_addr);
+	ahdr->arp_pln = sizeof(*addr);
+	ahdr->arp_op = rte_cpu_to_be_16(ARP_OP_REQUEST);
+
+	adata = &ahdr->arp_data;
+	ether_addr_copy(&ctx->mac, &adata->arp_sha);
+	adata->arp_sip = ctx->ipv4;
+	set_broadcast_addr(&adata->arp_tha);
+	adata->arp_tip = addr->s_addr;
+
+	pad_len = ETHER_MIN_LEN - sizeof(*eth) - sizeof(*ahdr);
+	pad = rte_pktmbuf_append(m, pad_len);
+	memset(pad, 0, pad_len);
+
+	send_pkts(ctx, &m, 1, "ARP request");
+}
+
+#define addr2ipv4(addr) (&((const struct sockaddr_in *)addr)->sin_addr)
+#define addr2ipv6(addr) (&((const struct sockaddr_in6 *)addr)->sin6_addr)
+/*
+ * Send an ARP request (AF_INET) or a Neighbor Solicitation (any other
+ * family) for the next hop of addr, unless that next hop already has an
+ * entry in the corresponding hash table.
+ */
+void
+mac_check(struct glue_ctx *ctx, const struct sockaddr *addr)
+{
+	const struct in_addr *hop4;
+	const struct in6_addr *hop6;
+
+	if (addr->sa_family == AF_INET) {
+		hop4 = ipv4_gateway_lookup(ctx, addr2ipv4(addr));
+		if (rte_hash_lookup(ctx->arp_hash, hop4) < 0)
+			arp_send_request(ctx, hop4);
+		return;
+	}
+
+	hop6 = ipv6_gateway_lookup(ctx, addr2ipv6(addr));
+	if (rte_hash_lookup(ctx->arp6_hash, hop6) < 0)
+		arp6_send_request(ctx, hop6);
+}
+
+/*
+ * Try to copy the IPv4 neighbour entry for addr from another context.
+ *
+ * Every context in ctx_array other than ctx is searched; the first one that
+ * knows addr has its destination MAC added to ctx through ipv4_dst_add().
+ * Returns 0 when an entry was inherited, -1 when no context knows addr.
+ *
+ * The index is advanced only by the loop header; the previous code also
+ * incremented it in the body and therefore skipped every second context.
+ */
+static int
+arp_inherit(struct glue_ctx *ctx, const struct in_addr *addr)
+{
+	struct glue_ctx *next;
+	struct tle_dest *dst;
+	struct ether_hdr *eth;
+	uint64_t idx;
+	uint16_t i;
+	int rc;
+
+	for (i = 0; i < nb_ctx; i++) {
+		next = &ctx_array[i];
+		if (next == ctx)
+			continue;
+
+		rc = rte_hash_lookup_data(next->arp_hash, addr, (void **)&idx);
+		if (rc < 0)
+			continue;
+
+		dst = &next->arp4[idx].dst;
+		eth = (struct ether_hdr *)dst->hdr;
+		ipv4_dst_add(ctx, addr, &eth->d_addr);
+		return 0;
+	}
+
+	return -1;
+}
+
+/*
+ * Try to copy the IPv6 neighbour entry for addr from another context.
+ *
+ * Every context in ctx_array other than ctx is searched; the first one that
+ * knows addr has its destination MAC added to ctx through ipv6_dst_add().
+ * Returns 0 when an entry was inherited, -1 when no context knows addr.
+ *
+ * The index is advanced only by the loop header; the previous code also
+ * incremented it in the body and therefore skipped every second context.
+ */
+static int
+arp6_inherit(struct glue_ctx *ctx, const struct in6_addr *addr)
+{
+	struct glue_ctx *next;
+	struct ether_hdr *eth;
+	struct tle_dest *dst;
+	uint64_t idx;
+	uint16_t i;
+	int rc;
+
+	for (i = 0; i < nb_ctx; i++) {
+		next = &ctx_array[i];
+		if (next == ctx)
+			continue;
+
+		rc = rte_hash_lookup_data(next->arp6_hash, addr, (void **)&idx);
+		if (rc < 0)
+			continue;
+
+		dst = &next->arp6[idx].dst;
+		eth = (struct ether_hdr *)dst->hdr;
+		ipv6_dst_add(ctx, addr, &eth->d_addr);
+		return 0;
+	}
+
+	return -1;
+}
+
+/*
+ * Number of bytes of a struct tle_dest that are in use: everything up to
+ * hdr plus the L2 and L3 headers stored in hdr. The argument is
+ * parenthesised so that any pointer expression can be passed.
+ */
+#define len_dest(dst) \
+	(offsetof(struct tle_dest, hdr) + (dst)->l2_len + (dst)->l3_len)
+
+/*
+ * Fill res with the destination information needed to send to addr.
+ *
+ * data   the struct glue_ctx to look the address up in
+ * proto  written into the IPv6 header of res; IPPROTO_TCP selects the TCP
+ *        device, anything else the UDP device
+ *
+ * Loopback addresses use the loopback template and devices. A known
+ * neighbour is marked in use and its template copied; an unknown one gets a
+ * fresh template with a broadcast MAC. Returns the (non-negative) result of
+ * the hash lookup on a hit and 0 in every other case.
+ */
+int
+arp_ipv6_dst_lookup(void *data, const struct in6_addr *addr,
+		    struct tle_dest *res, int proto)
+{
+	struct glue_ctx *ctx = data;
+	struct arp_entry *hit;
+	struct tle_dest *src;
+	struct ipv6_hdr *l3;
+	uint64_t pos;
+	int32_t ret;
+	bool is_tcp = (proto == IPPROTO_TCP);
+
+	if (is_ipv6_loopback_addr(addr, ctx)) {
+		src = &ctx->lb_dst_v6;
+		rte_memcpy(res, src, len_dest(src));
+		res->dev = is_tcp ? ctx->lb_tcp_dev : ctx->lb_udp_dev;
+		ret = 0;
+	} else {
+		ret = rte_hash_lookup_data(ctx->arp6_hash, addr, (void **)&pos);
+		if (ret < 0) {
+			/* unknown neighbour: hand out a broadcast template */
+			memset(res, 0, sizeof(*res));
+			ipv6_dst_set(ctx, res, addr, NULL);
+			ret = 0;
+		} else {
+			hit = &ctx->arp6[pos];
+			if (!hit->inuse)
+				hit->inuse = 1;
+			src = &hit->dst;
+			rte_memcpy(res, src, len_dest(src));
+		}
+		res->dev = is_tcp ? ctx->tcp_dev : ctx->udp_dev;
+	}
+
+	l3 = (struct ipv6_hdr *)&res->hdr[res->l2_len];
+	l3->proto = proto;
+	return ret;
+}
+
+/*
+ * Fill res with the destination information needed to send to addr.
+ *
+ * data   the struct glue_ctx to look the address up in
+ * proto  written into the IPv4 header of res; IPPROTO_TCP selects the TCP
+ *        device, anything else the UDP device
+ *
+ * Loopback addresses use the loopback template and devices. A known
+ * neighbour is marked in use and its template copied; an unknown one gets a
+ * fresh template with a broadcast MAC. Returns the (non-negative) result of
+ * the hash lookup on a hit and 0 in every other case.
+ */
+int
+arp_ipv4_dst_lookup(void *data, const struct in_addr *addr,
+		    struct tle_dest *res, int proto)
+{
+	struct glue_ctx *ctx = data;
+	struct arp_entry *hit;
+	struct tle_dest *src;
+	struct ipv4_hdr *l3;
+	uint64_t pos;
+	int32_t ret;
+	bool is_tcp = (proto == IPPROTO_TCP);
+
+	if (is_ipv4_loopback_addr(addr->s_addr, ctx)) {
+		src = &ctx->lb_dst;
+		rte_memcpy(res, src, len_dest(src));
+		res->dev = is_tcp ? ctx->lb_tcp_dev : ctx->lb_udp_dev;
+		ret = 0;
+	} else {
+		ret = rte_hash_lookup_data(ctx->arp_hash, addr, (void **)&pos);
+		if (ret < 0) {
+			/* unknown neighbour: hand out a broadcast template */
+			memset(res, 0, sizeof(*res));
+			ipv4_dst_set(ctx, res, addr, NULL);
+			ret = 0;
+		} else {
+			hit = &ctx->arp4[pos];
+			if (!hit->inuse)
+				hit->inuse = 1;
+			src = &hit->dst;
+			rte_memcpy(res, src, len_dest(src));
+		}
+		res->dev = is_tcp ? ctx->tcp_dev : ctx->udp_dev;
+	}
+
+	l3 = (struct ipv4_hdr *)&res->hdr[res->l2_len];
+	l3->next_proto_id = proto;
+	return ret;
+}
+
+/*
+ * Make sure the outgoing packet m carries a usable destination MAC.
+ *
+ * Returns 0 when the frame can be sent: its destination MAC was not the
+ * broadcast address to begin with, or it was filled in from a resolved
+ * neighbour entry. Returns -1 when the next hop is still unresolved; in
+ * that case an ARP request / Neighbor Solicitation has been sent and, if
+ * needed, an unresolved entry has been created.
+ */
+int
+mac_fill(struct glue_ctx *ctx, struct rte_mbuf *m)
+{
+ int32_t rc;
+ uint64_t idx;
+ uint8_t ipver;
+ struct arp_entry* entry;
+ struct ether_addr *dst, *dst1;
+ struct ipv4_hdr *ipv4_hdr;
+ struct ipv6_hdr *ipv6_hdr;
+ const struct in_addr *addr4 = NULL;
+ const struct in6_addr *addr6 = NULL;
+
+ /* the destination MAC is the first field of the Ethernet header */
+ dst = rte_pktmbuf_mtod(m, struct ether_addr *);
+ if (!is_broadcast_ether_addr(dst))
+ return 0;
+
+ /* both header views alias the same bytes; ipver tells which one applies */
+ ipv4_hdr = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *, m->l2_len);
+ ipv6_hdr = (struct ipv6_hdr*)ipv4_hdr;
+ ipver = ipv4_hdr->version_ihl >> 4;
+
+retry:
+ /* look up the next hop (gateway lookup result), not the final destination */
+ if (ipver == 4) {
+ addr4 = (struct in_addr *)&ipv4_hdr->dst_addr;
+ addr4 = ipv4_gateway_lookup(ctx, addr4);
+ rc = rte_hash_lookup_data(ctx->arp_hash, addr4, (void **)&idx);
+ if (rc >= 0)
+ entry = &ctx->arp4[idx];
+ } else {
+ addr6 = (struct in6_addr *)ipv6_hdr->dst_addr;
+ addr6 = ipv6_gateway_lookup(ctx, addr6);
+ rc = rte_hash_lookup_data(ctx->arp6_hash, addr6, (void **)&idx);
+ if (rc >= 0)
+ entry = &ctx->arp6[idx];
+ }
+
+ if (rc >= 0) {
+ dst1 = (struct ether_addr *)entry->dst.hdr;
+ if (!is_broadcast_ether_addr(dst1)) {
+ /* resolved entry: copy its MAC into the frame */
+ ether_addr_copy(dst1 , dst);
+ return 0;
+ }
+
+ /* entry exists but is unresolved: ask again and re-arm its timer */
+ if (ipver == 4)
+ arp_send_request(ctx, addr4);
+ else
+ arp6_send_request(ctx, addr6);
+ entry->req_time++;
+ if (entry->timer != NULL)
+ tle_timer_stop(ctx->arp_tmw, entry->timer);
+ entry->timer = arp_timer(ctx, entry, ARP_REQUEST_EXPIRE);
+ } else {
+ /* no entry: first try to inherit one from another context, then
+  * fall back to creating an unresolved entry and sending a request */
+ if (ipver == 4) {
+ if (arp_inherit(ctx, addr4) == 0)
+ goto retry;
+ ipv4_dst_add(ctx, addr4, NULL);
+ arp_send_request(ctx, addr4);
+ } else {
+ if (arp6_inherit(ctx, addr6) == 0)
+ goto retry;
+ ipv6_dst_add(ctx, addr6, NULL);
+ arp6_send_request(ctx, addr6);
+ }
+ }
+
+ return -1;
+}
+
+/*
+ * Return a pointer to the IPv4 destination address stored in the header
+ * template of entry e (the IPv4 header starts l2_len bytes into dst.hdr).
+ */
+static inline const struct in_addr *
+get_addr_from_entry(struct arp_entry *e)
+{
+	const struct ipv4_hdr *l3;
+
+	l3 = (struct ipv4_hdr *)(e->dst.hdr + e->dst.l2_len);
+	return (const struct in_addr *)&l3->dst_addr;
+}
+
+/*
+ * Return a pointer to the IPv6 destination address stored in the header
+ * template of entry e (the IPv6 header starts l2_len bytes into dst.hdr).
+ */
+static inline const struct in6_addr *
+get_addr6_from_entry(struct arp_entry *e)
+{
+	const struct ipv6_hdr *l3;
+
+	l3 = (struct ipv6_hdr *)(e->dst.hdr + e->dst.l2_len);
+	return (const struct in6_addr *)l3->dst_addr;
+}
+
+/*
+ * Free every packet parked on ctx->arp_wait whose next hop is addr.
+ * af selects the matcher (AF_INET -> match_addr, AF_INET6 -> match_addr6).
+ */
+static void
+drop_arp_wait(int af, struct glue_ctx *ctx, const void *addr)
+{
+	struct rte_mbuf *pkt, *next, *pre;
+
+	for (pre = NULL, pkt = ctx->arp_wait; pkt != NULL; pkt = next) {
+		/* fetch the link before pkt is possibly freed below */
+		next = pkt->next_pkt;
+
+		if ((af == AF_INET &&
+		     !match_addr(ctx, pkt, (const struct in_addr *)addr)) ||
+		    (af == AF_INET6 &&
+		     !match_addr6(ctx, pkt, (const struct in6_addr *)addr))) {
+			pre = pkt;
+			continue;
+		}
+
+		/* unlink pkt; pre stays the last packet kept on the list */
+		if (pre == NULL)
+			ctx->arp_wait = next;
+		else
+			pre->next_pkt = next;
+
+		rte_pktmbuf_free(pkt);
+	}
+}
+
+/*
+ * Remove neighbour entry e from the IPv4 (af == AF_INET) or IPv6 table of
+ * ctx.
+ *
+ * The tables are kept contiguous: the last entry is moved into the freed
+ * slot, its hash data is updated to the new index and its timer restarted;
+ * the last slot is then cleared and the entry count decremented. When the
+ * entry ran out of request retries, packets waiting for it are dropped.
+ */
+static void
+arp_entry_del(struct glue_ctx *ctx, int af, struct arp_entry *e)
+{
+ const void *addr;
+ struct arp_entry *t;
+ uint32_t idx, last_idx;
+ const struct rte_hash *h;
+
+ /* addr points INTO e's header template, see the memcpy below */
+ if (af == AF_INET) {
+ addr = get_addr_from_entry(e);
+ t = ctx->arp4;
+ h = ctx->arp_hash;
+ last_idx = ctx->arp4_num - 1;
+ } else {
+ addr = get_addr6_from_entry(e);
+ t = ctx->arp6;
+ h = ctx->arp6_hash;
+ last_idx = ctx->arp6_num - 1;
+ }
+
+ idx = e - t;
+ if (idx > last_idx) /* entry has been moved */
+ return;
+
+ print_arp(af, addr, (struct ether_addr *)e->dst.hdr, "DELETE");
+
+ if (e->req_time > ARP_MAX_REQ_TIMES)
+ drop_arp_wait(af, ctx, addr);
+
+ rte_hash_del_key(h, addr);
+
+ if (idx < last_idx) {
+ /* replace current entry with last entry */
+ rte_memcpy(e, t + last_idx, sizeof(*e));
+ /* addr aliases e, so it now holds the moved entry's address: this
+  * call re-points that address at its new index */
+ rte_hash_add_key_data(h, addr, (void *)(uintptr_t)idx);
+ /* the old timer references the last slot; restart one for the new slot */
+ tle_timer_stop(ctx->arp_tmw, t[last_idx].timer);
+ if (e->req_time > 0)
+ e->timer = arp_timer(ctx, e, ARP_REQUEST_EXPIRE);
+ else {
+ e->timer = arp_timer(ctx, e, ARP_ENTRY_EXPIRE);
+ e->inuse = 0;
+ }
+ }
+
+ /* we always delete the last entry to keep it contiguous */
+ t[last_idx].timer = NULL;
+ t[last_idx].inuse = 0;
+ t[last_idx].req_time = 0;
+ if (af == AF_INET)
+ ctx->arp4_num--;
+ else
+ ctx->arp6_num--;
+}
+
+/*
+ * Advance the ARP timer wheel of ctx and process up to ARP_PROCESS_MAX
+ * expired neighbour entries.
+ *
+ * An entry that was used since its last refresh, or that is unresolved and
+ * has retries left, triggers a new ARP request / Neighbor Solicitation and
+ * is re-armed with ARP_REQUEST_EXPIRE. Every other entry is deleted.
+ */
+void
+mac_timeout(struct glue_ctx *ctx)
+{
+#define ARP_PROCESS_MAX 32
+	struct arp_entry *expired[ARP_PROCESS_MAX];
+	struct tle_timer_wheel *wheel = ctx->arp_tmw;
+	struct arp_entry *cur;
+	const struct ipv4_hdr *l3;
+	uint32_t nb, k;
+	bool is_v4, probe;
+
+	tle_timer_expire(wheel, rte_get_tsc_cycles() >> ctx->cycles_ms_shift);
+	nb = tle_timer_get_expired_bulk(wheel, (void **)expired,
+					ARP_PROCESS_MAX);
+
+	for (k = 0; k < nb; k++) {
+		cur = expired[k];
+		cur->timer = NULL;
+
+		/* the IP version nibble of the template tells the family */
+		l3 = (struct ipv4_hdr *)(cur->dst.hdr + cur->dst.l2_len);
+		is_v4 = (l3->version_ihl >> 4) == 4;
+		probe = cur->inuse ||
+			(cur->req_time > 0 &&
+			 cur->req_time <= ARP_MAX_REQ_TIMES);
+
+		if (!probe) {
+			arp_entry_del(ctx, is_v4 ? AF_INET : AF_INET6, cur);
+			continue;
+		}
+
+		if (is_v4)
+			arp_send_request(ctx, get_addr_from_entry(cur));
+		else
+			arp6_send_request(ctx, get_addr6_from_entry(cur));
+
+		cur->timer = arp_timer(ctx, cur, ARP_REQUEST_EXPIRE);
+		cur->inuse = 0;
+		cur->req_time++;
+	}
+}
diff --git a/lib/libtle_glue/be.c b/lib/libtle_glue/be.c
new file mode 100644
index 0000000..7e2227e
--- /dev/null
+++ b/lib/libtle_glue/be.c
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <rte_ethdev.h>
+#include <rte_ip.h>
+
+#include <tle_tcp.h>
+#include <tle_udp.h>
+
+#include "config.h"
+#include "log.h"
+#include "util.h"
+#include "internal.h"
+
+/*
+ * Copy one mbuf segment from src into dst: first the mbuf fields from
+ * data_off to the end of the structure, then data_len bytes of packet data.
+ * dst ends up with a reference count of 1 and IND_ATTACHED_MBUF cleared.
+ */
+static inline void
+rte_pktmbuf_copy_seg(struct rte_mbuf *dst, struct rte_mbuf* src)
+{
+	const size_t skip = offsetof(struct rte_mbuf, data_off);
+	char *to = (char *)dst + skip;
+	char *from = (char *)src + skip;
+
+	rte_memcpy(to, from, sizeof(struct rte_mbuf) - skip);
+	rte_mbuf_refcnt_set(dst, 1);
+	dst->ol_flags &= ~IND_ATTACHED_MBUF;
+
+	/* data pointers are taken after the field copy, as dst now carries
+	 * the data offset of src */
+	rte_memcpy(rte_pktmbuf_mtod(dst, void*), rte_pktmbuf_mtod(src, void*),
+		   src->data_len);
+}
+
+/*
+ * Make a deep copy of the (possibly chained) packet md using mbufs from mp.
+ * Returns the head of the new chain, or NULL when an mbuf allocation fails
+ * (segments copied so far are freed).
+ */
+static inline struct rte_mbuf*
+rte_pktmbuf_copy(struct rte_mbuf *md, struct rte_mempool* mp)
+{
+ struct rte_mbuf *mc, *mi, **prev;
+ uint32_t pktlen;
+ uint16_t nseg;
+
+ if (unlikely ((mc = rte_pktmbuf_alloc(mp)) == NULL))
+ return NULL;
+
+ mi = mc;
+ prev = &mi->next;
+ pktlen = md->pkt_len;
+ nseg = 0;
+
+ /* one iteration per source segment; prev always addresses the next
+  * pointer that must receive the following copy */
+ do {
+ nseg++;
+ rte_pktmbuf_copy_seg(mi, md);
+ /* the segment copy also copied md's next pointer; relink to the copy */
+ *prev = mi;
+ prev = &mi->next;
+ } while ((md = md->next) != NULL &&
+ (mi = rte_pktmbuf_alloc(mp)) != NULL);
+
+ /* terminate the chain and fix up the head's per-packet fields */
+ *prev = NULL;
+ mc->nb_segs = nseg;
+ mc->pkt_len = pktlen;
+
+ /* Allocation of new indirect segment failed */
+ if (unlikely(mi == NULL)) {
+ rte_pktmbuf_free(mc);
+ return NULL;
+ }
+
+ __rte_mbuf_sanity_check(mc, 1);
+ return mc;
+}
+
+/*
+ * Hand n received packets to the TCP and UDP stacks.
+ *
+ * pkts           received packets; all of them are consumed
+ * from_loopback  non-zero when the packets come from the loopback port: each
+ *                one is replaced by a private copy flagged as checksum-good
+ *                and the loopback devices are used
+ *
+ * Packets are stamped with the current time, sorted by L4 type and passed
+ * to tle_tcp_rx_bulk() / tle_udp_rx_bulk(); everything else, and everything
+ * the stacks reject, is freed.
+ *
+ * Returns the number of packets accepted by the stacks. The previous
+ * expression (jt + ju - jd) also subtracted the non-TCP/UDP packets and
+ * could therefore go negative.
+ */
+static inline int
+process_rx_pkts(struct glue_ctx *ctx, struct rte_mbuf *pkts[],
+		uint32_t n, uint8_t from_loopback)
+{
+	uint32_t i, j, k, jt, ju, jd, accepted;
+	struct rte_mbuf *tcp[MAX_PKTS_BURST];
+	struct rte_mbuf *udp[MAX_PKTS_BURST];
+	struct rte_mbuf *drop[MAX_PKTS_BURST];
+	int32_t rc[MAX_PKTS_BURST];
+	struct tle_dev *tcp_dev, *udp_dev;
+	struct rte_mempool *mp;
+	struct rte_mbuf *tmp;
+	uint64_t ts;
+
+	if (n == 0)
+		return 0;
+
+	if (unlikely(from_loopback)) {
+		tcp_dev = ctx->lb_tcp_dev;
+		udp_dev = ctx->lb_udp_dev;
+		mp = pkts[0]->pool;
+		for (i = 0; i < n; i++) {
+			tmp = rte_pktmbuf_copy(pkts[i], mp);
+			if (tmp != NULL) {
+				rte_pktmbuf_free(pkts[i]);
+				pkts[i] = tmp;
+				pkts[i]->ol_flags |= PKT_RX_IP_CKSUM_GOOD;
+				pkts[i]->ol_flags |= PKT_RX_L4_CKSUM_GOOD;
+			} else {
+				/* out of mbufs: drop this and all remaining
+				 * packets, keep only those already copied */
+				k = i;
+				for (; i < n; i++)
+					rte_pktmbuf_free(pkts[i]);
+				n = k;
+			}
+		}
+	} else {
+		tcp_dev = ctx->tcp_dev;
+		udp_dev = ctx->udp_dev;
+	}
+
+	ts = rte_get_tsc_cycles() >> (ctx->cycles_ms_shift - 10);
+
+	for (j = 0, jt = 0, ju = 0, jd = 0; j < n; j++) {
+		pkts[j]->timestamp = ts;
+		switch (pkts[j]->packet_type & RTE_PTYPE_L4_MASK) {
+		case RTE_PTYPE_L4_TCP:
+			tcp[jt++] = pkts[j];
+			break;
+		case RTE_PTYPE_L4_UDP:
+			udp[ju++] = pkts[j];
+			break;
+		case RTE_PTYPE_L4_ICMP:
+			/* TODO */
+			/* fallthrough */
+		case RTE_PTYPE_L4_FRAG:
+			/* TODO */
+			/* fallthrough */
+		default:
+			drop[jd++] = pkts[j];
+		}
+	}
+
+	accepted = 0;
+
+	if (jt > 0) {
+		/* rejected packets are appended to drop[] by the stack */
+		k = tle_tcp_rx_bulk(tcp_dev, tcp, drop + jd, rc, jt);
+		jd += jt - k;
+		accepted += k;
+
+		TRACE("(port=%u, queue=%u), %u/%u (TCP) pkts are received",
+		      ctx->port_id, ctx->queue_id, k, n);
+	}
+
+	if (ju > 0) {
+		k = tle_udp_rx_bulk(udp_dev, udp, drop + jd, rc, ju);
+		jd += ju - k;
+		accepted += k;
+
+		TRACE("(port=%u, queue=%u), %u/%u (UDP) pkts are received",
+		      ctx->port_id, ctx->queue_id, k, n);
+	}
+
+	for (j = 0; j < jd; j++)
+		rte_pktmbuf_free(drop[j]);
+
+	return accepted;
+}
+
+/*
+ * Receive one burst of at most MAX_PKTS_BURST packets from the port/queue
+ * of ctx and feed it to process_rx_pkts(); returns that function's result.
+ */
+static inline int
+be_rx(struct glue_ctx *ctx)
+{
+	struct rte_mbuf *burst[MAX_PKTS_BURST];
+	uint32_t nb_rx;
+
+	nb_rx = rte_eth_rx_burst(ctx->port_id, ctx->queue_id, burst,
+				 RTE_DIM(burst));
+
+	return process_rx_pkts(ctx, burst, nb_rx, 0);
+}
+
+/*
+ * Run the TCP state machine and transmit pending packets of ctx.
+ *
+ * Loopback traffic is sent to the loopback port, read back immediately and
+ * fed to process_rx_pkts(). Regular traffic goes through mac_fill(): packets
+ * with a resolved destination MAC are transmitted, the others are parked on
+ * ctx->arp_wait until the neighbour is resolved.
+ *
+ * Returns the number of packets taken from the stacks, loopback included.
+ * The count was previously lost (0 returned) when only loopback traffic was
+ * pending.
+ */
+int
+be_tx(struct glue_ctx *ctx)
+{
+	uint32_t n, j, k, s, ret;
+	const uint16_t max_pkts = MAX_PKTS_BURST;
+	struct rte_mbuf *pkts[max_pkts];
+	struct rte_mbuf *_pkts[max_pkts];
+	uint16_t port_id = ctx->port_id;
+	uint16_t queue_id = ctx->queue_id;
+
+	ret = 0;
+	tle_tcp_process(ctx->tcp_ctx, TCP_MAX_PROCESS);
+
+	n = tle_tcp_tx_bulk(ctx->lb_tcp_dev, pkts, max_pkts);
+	n += tle_udp_tx_bulk(ctx->lb_udp_dev, pkts + n, max_pkts - n);
+	if (n > 0) {
+		ret += n;
+		k = rte_eth_tx_burst(ctx->lb_port_id, 0, pkts, n);
+		/* packets the loopback port did not take are still ours */
+		for (j = k; j != n; j++)
+			rte_pktmbuf_free(pkts[j]);
+
+		/* loopback device could receive after transmit immediately */
+		n = rte_eth_rx_burst(ctx->lb_port_id, 0, pkts, RTE_DIM(pkts));
+		process_rx_pkts(ctx, pkts, n, 1);
+
+		/* wake up look-aside backend */
+		wake_lookaside_backend(ctx);
+	}
+
+	n = tle_tcp_tx_bulk(ctx->tcp_dev, pkts, max_pkts);
+	n += tle_udp_tx_bulk(ctx->udp_dev, pkts + n, max_pkts - n);
+	if (n == 0)
+		return ret;
+
+	ret += n;
+	s = 0;
+	for (j = 0; j != n; j++) {
+		if (mac_fill(ctx, pkts[j]) == 0) {
+			PKT_DUMP(pkts[j]);
+			_pkts[s++] = pkts[j];
+			continue;
+		}
+
+		/* destination MAC unknown: park the packet on the wait list */
+		pkts[j]->next_pkt = ctx->arp_wait;
+		ctx->arp_wait = pkts[j];
+	}
+
+	/* For virtio-user/vhost-kernel test case, it's normal that vhost
+	 * kthread cannot catch up with packets generation speed in stack.
+	 * Shall we drop those packets immediately or retry some times to
+	 * keep those packets? We find dropping packets here is not a good
+	 * idea, which leads to lots of retrans and inefficiency of vhost
+	 * kthread. Even below code does not work well:
+	 *
+	 * for (k = 0, retry = 0; k < s && retry < 10000; retry++)
+	 *	k += rte_eth_tx_burst(port_id, queue_id, _pkts + k, s - k);
+	 *
+	 * So we choose to blockingly send out packets. The loop only ends
+	 * once all s packets are accepted, so nothing is left to free.
+	 */
+	k = 0;
+	while (k < s)
+		k += rte_eth_tx_burst(port_id, queue_id, _pkts + k, s - k);
+
+	TRACE("(port=%u, queue=%u), %u/%u pkts are sent",
+	      port_id, queue_id, k, s);
+
+	return ret;
+}
+
+/*
+ * One backend iteration for ctx: receive, run neighbour timers, transmit.
+ * Returns the sum of the be_rx() and be_tx() results, or 0 without doing
+ * anything when the stack has been stopped.
+ */
+int
+be_process(struct glue_ctx *ctx)
+{
+	int work = 0;
+
+	if (likely(!stopped)) {
+		work = be_rx(ctx);
+		mac_timeout(ctx);
+		work += be_tx(ctx);
+	}
+
+	return work;
+}
diff --git a/lib/libtle_glue/config.h b/lib/libtle_glue/config.h
new file mode 100644
index 0000000..976495e
--- /dev/null
+++ b/lib/libtle_glue/config.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _TLE_GLUE_CONFIG_H_
+#define _TLE_GLUE_CONFIG_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Per-context stream limits.  MAX/MIN/DELTA are copied into
+ * tle_ctx_param.max_streams/min_streams/delta_streams by
+ * proto_ctx_create().  The product is parenthesised so the macro keeps
+ * its value when it is used next to '/', '%' or '-' in an expression.
+ */
+#define MAX_STREAMS_PER_CORE (64 * 1024)
+#define MIN_STREAMS_PER_CORE 16
+#define DELTA_STREAMS 64
+/* Geometry of the IP reassembly table created in glue_ctx_init(). */
+#define FRAG_BUCKET 8
+#define FRAG_ENTRIES_PER_BUCKET 8
+/* Number of entries in each of the per-context IPv4 and IPv6 ARP tables. */
+#define MAX_ARP_ENTRY (1 << 10)
+
+/* RCV buffer & SND buffer
+ * This is not a real rcv/snd buffer implementation. The numbers below are
+ * the number of slots used to store mbufs of sent or received data. Each
+ * slot can contain a single mbuf with a size of (1500B or 2048B) or a
+ * chained mbuf with size <= 64KB.
+ *
+ * TODO: add real snd/rcv buffer
+ */
+#define MAX_RECV_BUFS_PER_STREAM 256
+#define MAX_SEND_BUFS_PER_STREAM 256
+
+/* Upper bound on the number of glue contexts (the size of ctx_array in
+ * ctx.c). A build with LOOK_ASIDE_BACKEND is limited to a single context.
+ */
+#ifdef LOOK_ASIDE_BACKEND
+#define MAX_NB_CTX 1
+#else
+#define MAX_NB_CTX 16
+#endif
+
+#define MAX_MBUFS 0x80000
+/* This should be calculated as:
+ * MAX_NB_CTX * MAX_STREAMS_PER_CORE *
+ * (MAX_RECV_BUFS_PER_STREAM + MAX_SEND_BUFS_PER_STREAM)
+ * but is currently a fixed value.
+ */
+
+/* NOTE(review): presumably the growth step of the dynamic mbuf mempool;
+ * the user of this macro is not visible here - confirm.
+ */
+#define MBUF_DYNAMIC_SIZE 0x800
+
+/* NOTE(review): presumably the per-lcore mempool cache size - confirm. */
+#define MBUF_PERCORE_CACHE 32
+
+/* NOTE(review): presumably the rx/tx burst array size (32 packets) - confirm. */
+#define MAX_PKTS_BURST 0x20
+
+/* Passed as the second argument of tle_tcp_process() by the backend TX path. */
+#define TCP_MAX_PROCESS 32
+
+/* ARP timing. ARP_REQUEST_EXPIRE is in ms; ARP_ENTRY_EXPIRE presumably uses
+ * the same unit (TODO confirm against arp.c).
+ */
+#define ARP_ENTRY_EXPIRE 60000U
+#define ARP_REQUEST_EXPIRE 1000U /* ms */
+#define ARP_MAX_REQ_TIMES 5
+
+/* MTU values for the normal and the loopback device. */
+#define MTU_NORMAL 1500
+#define MTU_LOOPBACK 65535
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*_TLE_GLUE_CONFIG_H_ */
diff --git a/lib/libtle_glue/ctx.c b/lib/libtle_glue/ctx.c
new file mode 100644
index 0000000..dc78f39
--- /dev/null
+++ b/lib/libtle_glue/ctx.c
@@ -0,0 +1,535 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <stdlib.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+#include <rte_malloc.h>
+#include <rte_random.h>
+#include <rte_cycles.h>
+#include <rte_ethdev.h>
+#include <rte_hash.h>
+#include <rte_spinlock.h>
+
+#include "config.h"
+#include "ctx.h"
+#include "log.h"
+#include "util.h"
+#include "internal.h"
+#include "gateway.h"
+#include "tle_timer.h"
+
+/* Per-lcore pointer to the context bound to the calling thread; stays NULL
+ * until a thread binds itself (see get_ctx() for the fallback).
+ */
+RTE_DEFINE_PER_LCORE(struct glue_ctx *, glue_ctx);
+
+/* Number of entries of ctx_array handed out so far by glue_ctx_alloc(). */
+int nb_ctx;
+/* Static storage for all contexts; a context id is an index into it. */
+struct glue_ctx ctx_array[MAX_NB_CTX];
+/* The first context; it is the one that owns the loopback devices. */
+struct glue_ctx *default_ctx = &ctx_array[0];
+
+/*
+ * IPv4 destination lookup hook installed for the TCP tle_ctx: the address
+ * is first mapped by ipv4_gateway_lookup() and the result is resolved by
+ * arp_ipv4_dst_lookup() for IPPROTO_TCP.  `data` is the hook's opaque
+ * argument and is forwarded unchanged.
+ */
+static int
+ipv4_dst_lookup_tcp(void *data, const struct in_addr *addr,
+	struct tle_dest *res)
+{
+	const struct in_addr *target = ipv4_gateway_lookup(data, addr);
+
+	return arp_ipv4_dst_lookup(data, target, res, IPPROTO_TCP);
+}
+
+/*
+ * IPv4 destination lookup hook installed for the UDP tle_ctx: the address
+ * is first mapped by ipv4_gateway_lookup() and the result is resolved by
+ * arp_ipv4_dst_lookup() for IPPROTO_UDP.  `data` is the hook's opaque
+ * argument and is forwarded unchanged.
+ */
+static int
+ipv4_dst_lookup_udp(void *data, const struct in_addr *addr,
+	struct tle_dest *res)
+{
+	const struct in_addr *target = ipv4_gateway_lookup(data, addr);
+
+	return arp_ipv4_dst_lookup(data, target, res, IPPROTO_UDP);
+}
+
+/*
+ * IPv6 destination lookup hook installed for the TCP tle_ctx: the address
+ * is first mapped by ipv6_gateway_lookup() and the result is resolved by
+ * arp_ipv6_dst_lookup() for IPPROTO_TCP.  `data` is the hook's opaque
+ * argument and is forwarded unchanged.
+ */
+static int
+ipv6_dst_lookup_tcp(void *data, const struct in6_addr *addr,
+	struct tle_dest *res)
+{
+	const struct in6_addr *target = ipv6_gateway_lookup(data, addr);
+
+	return arp_ipv6_dst_lookup(data, target, res, IPPROTO_TCP);
+}
+
+/*
+ * IPv6 destination lookup hook installed for the UDP tle_ctx: the address
+ * is first mapped by ipv6_gateway_lookup() and the result is resolved by
+ * arp_ipv6_dst_lookup() for IPPROTO_UDP.  `data` is the hook's opaque
+ * argument and is forwarded unchanged.
+ */
+static int
+ipv6_dst_lookup_udp(void *data, const struct in6_addr *addr,
+	struct tle_dest *res)
+{
+	const struct in6_addr *target = ipv6_gateway_lookup(data, addr);
+
+	return arp_ipv6_dst_lookup(data, target, res, IPPROTO_UDP);
+}
+
+/*
+ * Create the tle_ctx for one L4 protocol.
+ *
+ * socket_id: NUMA socket handed to tle_ctx_create().
+ * proto:     TLE_PROTO_TCP or TLE_PROTO_UDP; any other value panics.
+ * data:      opaque pointer stored as lookup4_data/lookup6_data and given
+ *            back to the destination lookup hooks.
+ *
+ * Returns whatever tle_ctx_create() returns (the callers treat NULL as
+ * failure).
+ */
+static struct tle_ctx *
+proto_ctx_create(uint32_t socket_id, uint32_t proto, void *data)
+{
+	struct tle_ctx_param cprm;
+
+	if (proto != TLE_PROTO_TCP && proto != TLE_PROTO_UDP)
+		rte_panic("Invalid proto [%u]\n", proto);
+
+	/* Start from all-zero so that any parameter not assigned below has a
+	 * defined value instead of stack garbage.
+	 */
+	memset(&cprm, 0, sizeof(cprm));
+
+	cprm.socket_id = socket_id;
+	cprm.proto = proto;
+	cprm.max_streams = MAX_STREAMS_PER_CORE;
+	cprm.min_streams = MIN_STREAMS_PER_CORE;
+	cprm.delta_streams = DELTA_STREAMS;
+	cprm.max_stream_rbufs = MAX_RECV_BUFS_PER_STREAM;
+	cprm.max_stream_sbufs = MAX_SEND_BUFS_PER_STREAM;
+	if (proto == TLE_PROTO_TCP) {
+		cprm.lookup4 = ipv4_dst_lookup_tcp;
+		cprm.lookup6 = ipv6_dst_lookup_tcp;
+	} else {
+		cprm.lookup4 = ipv4_dst_lookup_udp;
+		cprm.lookup6 = ipv6_dst_lookup_udp;
+	}
+	cprm.lookup4_data = data;
+	cprm.lookup6_data = data;
+#ifdef LOOK_ASIDE_BACKEND
+	cprm.flags = 0;
+#else
+	cprm.flags = TLE_CTX_FLAG_ST; /* ctx will be used by single thread*/
+#endif
+	cprm.send_bulk_size = 0; /* 32 if 0 */
+	cprm.hash_alg = TLE_SIPHASH;
+	/* fresh random key for the stream hash on every context */
+	cprm.secret_key.u64[0] = rte_rand();
+	cprm.secret_key.u64[1] = rte_rand();
+	cprm.icw = 0; /**< congestion window, default is 2*MSS if 0. */
+	cprm.timewait = 1; /* TLE_TCP_TIMEWAIT_DEFAULT */
+
+	return tle_ctx_create(&cprm);
+}
+
+/*
+ * Create the three event queues of a context (error, rx and tx) on the
+ * given socket.  No events are pre-allocated.  Panics if any queue cannot
+ * be created; otherwise always returns 0.
+ */
+static int
+evq_init(struct glue_ctx *ctx, uint32_t socket_id)
+{
+	struct tle_evq_param prm;
+
+	memset(&prm, 0, sizeof(prm));
+	prm.socket_id = socket_id;
+	prm.max_events = 0; /* We don't pre-allocate any event */
+
+	if ((ctx->ereq = tle_evq_create(&prm)) == NULL)
+		rte_panic("Cannot create ereq");
+
+	if ((ctx->rxeq = tle_evq_create(&prm)) == NULL)
+		rte_panic("Cannot create rxeq");
+
+	if ((ctx->txeq = tle_evq_create(&prm)) == NULL)
+		rte_panic("Cannot create txeq");
+
+	return 0;
+}
+
+/*
+ * Create the TLDK side of a glue context: one tle_ctx per protocol (TCP and
+ * UDP), one tle_dev per tle_ctx bound to the context's own addresses, two
+ * additional loopback tle_devs when ctx is the default context, and finally
+ * the event queues. Every failure panics.
+ *
+ * ctx->ipv4 and ctx->ipv6 must already be filled in, since they become the
+ * local addresses of the devices.
+ */
+static void
+tle_ctx_init(struct glue_ctx *ctx, uint32_t socket_id)
+{
+ struct tle_dev_param dprm;
+ struct rte_eth_dev_info dev_info;
+ uint16_t port_id = 0; /* currently only use one port */
+
+ ctx->tcp_ctx = proto_ctx_create(socket_id, TLE_PROTO_TCP, ctx);
+ if (ctx->tcp_ctx == NULL)
+ rte_panic("Cannot create tle_ctx for tcp");
+
+ ctx->udp_ctx = proto_ctx_create(socket_id, TLE_PROTO_UDP, ctx);
+ if (ctx->udp_ctx == NULL)
+ rte_panic("Cannot create tle_ctx for udp");
+
+ memset(&dprm, 0, sizeof(dprm));
+
+ /* offloading check and set */
+ /* only keep the requested offloads that the port reports as capable */
+ rte_eth_dev_info_get(port_id, &dev_info);
+ dprm.rx_offload = dev_info.rx_offload_capa & rx_offload;
+ dprm.tx_offload = dev_info.tx_offload_capa & tx_offload;
+
+ dprm.local_addr4.s_addr = ctx->ipv4;
+ rte_memcpy(&dprm.local_addr6, &ctx->ipv6, sizeof(struct in6_addr));
+ /* no blocked ports for either address family */
+ dprm.bl4.nb_port = 0;
+ dprm.bl4.port = NULL;
+ dprm.bl6.nb_port = 0;
+ dprm.bl6.port = NULL;
+
+ ctx->tcp_dev = tle_add_dev(ctx->tcp_ctx, &dprm);
+ if (ctx->tcp_dev == NULL)
+ rte_panic("add tle_dev for tcp failed: %u", rte_errno);
+
+ ctx->udp_dev = tle_add_dev(ctx->udp_ctx, &dprm);
+ if (ctx->udp_dev == NULL)
+ rte_panic("add tle_dev for udp failed: %u", rte_errno);
+
+ /* Only the default context gets loopback devices. dprm is reused: the
+ * offloads are taken as requested (not masked by port capabilities) and
+ * the local addresses are replaced by the loopback addresses.
+ */
+ if (ctx == default_ctx) {
+ dprm.rx_offload = rx_offload;
+ dprm.tx_offload = tx_offload;
+ dprm.local_addr4.s_addr = htonl(INADDR_LOOPBACK);
+ rte_memcpy(&dprm.local_addr6, &in6addr_loopback,
+ sizeof(struct in6_addr));
+
+ ctx->lb_tcp_dev = tle_add_dev(ctx->tcp_ctx, &dprm);
+ if (ctx->lb_tcp_dev == NULL)
+ rte_panic("failed to add loopback tcp dev: %u\n",
+ rte_errno);
+
+ ctx->lb_udp_dev = tle_add_dev(ctx->udp_ctx, &dprm);
+ if (ctx->lb_udp_dev == NULL)
+ rte_panic("failed to add loopback udp dev: %u\n",
+ rte_errno);
+ }
+
+ evq_init(ctx, socket_id);
+}
+
+/*
+ * Read the local IPv4 address from the DPDK_IP environment variable,
+ * falling back to DPDK_IP_DEF when it is unset.  Returns the s_addr value
+ * produced by inet_aton(); panics if the string is not a valid address.
+ */
+static uint32_t
+get_ip(void)
+{
+	struct in_addr parsed;
+	const char *text = getenv(DPDK_IP);
+
+	if (text != NULL) {
+		GLUE_LOG(INFO, "will use the IP %s", text);
+	} else {
+		text = DPDK_IP_DEF;
+		GLUE_LOG(INFO, "will use the default IP %s", DPDK_IP_DEF);
+	}
+
+	if (!inet_aton(text, &parsed))
+		rte_panic("Invalid addr from env DPDK_IP: %s", text);
+
+	return parsed.s_addr;
+}
+
+/*
+ * Read the IPv4 prefix length from the DPDK_IP_MASK environment variable,
+ * falling back to DPDK_IP_MASK_DEF when it is unset.
+ *
+ * The value must be a plain decimal number in the range 0..32.  Anything
+ * else panics, in the same way an unparsable address does, instead of
+ * being silently truncated to 8 bits.
+ */
+static uint8_t
+get_ip_mask(void)
+{
+	long len;
+	char *end;
+	const char *mask_str = getenv(DPDK_IP_MASK);
+
+	if (mask_str == NULL) {
+		mask_str = DPDK_IP_MASK_DEF;
+		GLUE_LOG(INFO, "will use the default IP Mask %s", DPDK_IP_MASK_DEF);
+	} else
+		GLUE_LOG(INFO, "will use the IP Mask %s", mask_str);
+
+	/* strtol() saturates on overflow, so the range check also covers it */
+	len = strtol(mask_str, &end, 10);
+	if (end == mask_str || *end != '\0' || len < 0 || len > 32)
+		rte_panic("Invalid mask length from env DPDK_IP_MASK: %s",
+			mask_str);
+
+	return (uint8_t)len;
+}
+
+/*
+ * Read the IPv4 gateway from the DPDK_IP_GATEWAY environment variable,
+ * falling back to DPDK_IP_GATEWAY_DEF when it is unset.  Returns the
+ * s_addr value produced by inet_aton(); panics if the string is not a
+ * valid address.
+ */
+static uint32_t
+get_ip_gate(void)
+{
+	struct in_addr parsed;
+	const char *text = getenv(DPDK_IP_GATEWAY);
+
+	if (text != NULL) {
+		GLUE_LOG(INFO, "will use the IP gateway %s", text);
+	} else {
+		text = DPDK_IP_GATEWAY_DEF;
+		GLUE_LOG(INFO, "will use the default IP gateway %s",
+			DPDK_IP_GATEWAY_DEF);
+	}
+
+	if (!inet_aton(text, &parsed))
+		rte_panic("Invalid addr from env DPDK_IP_GATEWAY: %s", text);
+
+	return parsed.s_addr;
+}
+
+/*
+ * Read the local IPv6 address from the DPDK_IPV6 environment variable,
+ * falling back to DPDK_IPV6_DEF when it is unset.
+ *
+ * Returns a pointer to function-static storage that is overwritten by the
+ * next call, so callers must copy the value.  Panics if the string cannot
+ * be converted.
+ */
+static struct in6_addr*
+get_ipv6(void)
+{
+	static struct in6_addr addr;
+	const char *ip_str = getenv(DPDK_IPV6);
+
+	if (ip_str == NULL) {
+		ip_str = DPDK_IPV6_DEF;
+		GLUE_LOG(INFO, "will use the default IP(V6) %s", DPDK_IPV6_DEF);
+	} else
+		GLUE_LOG(INFO, "will use the IP(V6) %s", ip_str);
+
+	/* inet_pton() returns 1 on success only; 0 and -1 are both failures */
+	if (inet_pton(AF_INET6, ip_str, &addr) != 1)
+		rte_panic("Invalid addr from env DPDK_IPV6: %s", ip_str);
+
+	return &addr;
+}
+
+/*
+ * Read the IPv6 prefix length from the DPDK_IPV6_MASK environment
+ * variable, falling back to DPDK_IPV6_MASK_DEF when it is unset.
+ *
+ * The value must be a plain decimal number in the range 0..128.  Anything
+ * else panics, in the same way an unparsable address does, instead of
+ * being silently truncated to 8 bits.
+ */
+static uint8_t
+get_ipv6_mask(void)
+{
+	long len;
+	char *end;
+	const char *mask_str = getenv(DPDK_IPV6_MASK);
+
+	if (mask_str == NULL) {
+		mask_str = DPDK_IPV6_MASK_DEF;
+		GLUE_LOG(INFO, "will use the default IPV6 Mask %s",
+			DPDK_IPV6_MASK_DEF);
+	} else
+		GLUE_LOG(INFO, "will use the IPV6 Mask %s", mask_str);
+
+	/* strtol() saturates on overflow, so the range check also covers it */
+	len = strtol(mask_str, &end, 10);
+	if (end == mask_str || *end != '\0' || len < 0 || len > 128)
+		rte_panic("Invalid mask length from env DPDK_IPV6_MASK: %s",
+			mask_str);
+
+	return (uint8_t)len;
+}
+
+/*
+ * Read the IPv6 gateway from the DPDK_IPV6_GATEWAY environment variable,
+ * falling back to DPDK_IPV6_GATEWAY_DEF when it is unset.
+ *
+ * Returns a pointer to function-static storage that is overwritten by the
+ * next call, so callers must copy the value.  Panics if the string cannot
+ * be converted.
+ */
+static struct in6_addr*
+get_ipv6_gate(void)
+{
+	static struct in6_addr addr;
+	const char *ip_str = getenv(DPDK_IPV6_GATEWAY);
+
+	if (ip_str == NULL) {
+		ip_str = DPDK_IPV6_GATEWAY_DEF;
+		GLUE_LOG(INFO, "will use the default IP(V6) gateway %s",
+			DPDK_IPV6_GATEWAY_DEF);
+	} else
+		GLUE_LOG(INFO, "will use the IP(V6) gateway %s", ip_str);
+
+	/* inet_pton() returns 1 on success only; 0 and -1 are both failures */
+	if (inet_pton(AF_INET6, ip_str, &addr) != 1)
+		rte_panic("Invalid addr from env DPDK_IPV6_GATEWAY: %s", ip_str);
+
+	return &addr;
+}
+
+/*
+ * IPv4 loopback switch: on by default, turned off only when the
+ * DPDK_LO4_ENABLED environment variable is exactly "0".
+ */
+static bool
+lo4_enabled(void)
+{
+	const char *val = getenv("DPDK_LO4_ENABLED");
+
+	return val == NULL || strcmp(val, "0") != 0;
+}
+
+/*
+ * IPv6 loopback switch: off by default, turned on only when the
+ * DPDK_LO6_ENABLED environment variable is exactly "1".
+ */
+static bool
+lo6_enabled(void)
+{
+	const char *val = getenv("DPDK_LO6_ENABLED");
+
+	return val != NULL && strcmp(val, "1") == 0;
+}
+
+/*
+ * Pre-build the two destination templates used for loopback traffic:
+ * ctx->lb_dst (IPv4) and ctx->lb_dst_v6 (IPv6).  Each one gets an MTU of
+ * 65535, a mempool for headers, an Ethernet header with both MAC addresses
+ * zeroed, and an L3 header addressed to the loopback address with TCP as
+ * the next protocol.
+ */
+static void
+loopback_dst_init(struct glue_ctx *ctx)
+{
+	struct tle_dest *d4 = &ctx->lb_dst;
+	struct tle_dest *d6 = &ctx->lb_dst_v6;
+	struct ether_hdr *l2;
+	struct ipv4_hdr *v4;
+	struct ipv6_hdr *v6;
+
+	/* ---- IPv4 template ---- */
+	d4->mtu = 65535;
+	d4->l2_len = sizeof(*l2);
+	d4->head_mp = get_mempool_by_socket(0); /* fix me */
+
+	l2 = (struct ether_hdr *)d4->hdr;
+	/* d_addr and s_addr are adjacent: clear both in one go */
+	memset(l2, 0, 2 * sizeof(l2->d_addr));
+	l2->ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4);
+
+	d4->l3_len = sizeof(*v4);
+	v4 = (struct ipv4_hdr *)(l2 + 1);
+	v4->dst_addr = htonl(INADDR_LOOPBACK);
+	v4->version_ihl = (4 << 4) | (sizeof(*v4) / IPV4_IHL_MULTIPLIER);
+	v4->time_to_live = 64;
+	v4->next_proto_id = IPPROTO_TCP;
+
+	/* ---- IPv6 template ---- */
+	d6->mtu = 65535;
+	d6->l2_len = sizeof(*l2);
+	d6->head_mp = get_mempool_by_socket(0); /* fix me */
+
+	l2 = (struct ether_hdr *)d6->hdr;
+	memset(l2, 0, 2 * sizeof(l2->d_addr));
+	l2->ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv6);
+
+	d6->l3_len = sizeof(*v6);
+	v6 = (struct ipv6_hdr *)(l2 + 1);
+	rte_memcpy(v6->dst_addr, &in6addr_loopback, sizeof(struct in6_addr));
+	v6->vtc_flow = 6 << 4;
+	v6->hop_limits = 255;
+	v6->proto = IPPROTO_TCP;
+}
+
+/*
+ * Create the two neighbour hash tables of a context: ctx->arp_hash keyed
+ * by struct in_addr and ctx->arp6_hash keyed by struct in6_addr.  Both are
+ * sized for 2 * MAX_ARP_ENTRY entries, live on socket_id, and are named
+ * after ctx->queue_id.  Panics if either table cannot be created.
+ */
+static void
+arp_hash_init(struct glue_ctx *ctx, unsigned socket_id)
+{
+	char name[RTE_HASH_NAMESIZE];
+	struct rte_hash_parameters prm;
+
+	/* settings shared by both tables; only name and key size differ */
+	memset(&prm, 0, sizeof(prm));
+	prm.name = name;
+	prm.entries = MAX_ARP_ENTRY * 2;
+	prm.socket_id = socket_id;
+
+	/* init ipv4 arp hash */
+	snprintf(name, sizeof(name), "arp_hash_4@ctx%u", ctx->queue_id);
+	prm.key_len = sizeof(struct in_addr);
+	ctx->arp_hash = rte_hash_create(&prm);
+	if (ctx->arp_hash == NULL)
+		rte_panic("Failed to init hashtable for ARP");
+
+	/* init ipv6 arp hash */
+	snprintf(name, sizeof(name), "arp_hash_6@ctx%u", ctx->queue_id);
+	prm.key_len = sizeof(struct in6_addr);
+	ctx->arp6_hash = rte_hash_create(&prm);
+	if (ctx->arp6_hash == NULL)
+		rte_panic("Failed to init hashtable for ARP6");
+}
+
+/*
+ * Current timestamp in (approximate) milliseconds: the TSC cycle counter
+ * shifted right by mshift.  See tcp_get_tms().
+ */
+static inline uint64_t
+arp_get_tms(uint32_t mshift)
+{
+	return rte_get_tsc_cycles() >> mshift;
+}
+
+/*
+ * Create the timer wheel used for ARP timeouts and store it in
+ * ctx->arp_tmw.  The wheel has room for MAX_ARP_ENTRY + 8 timers and is
+ * started at the current arp_get_tms() time, so ctx->cycles_ms_shift must
+ * be set before this is called.  Panics on failure.
+ */
+static void
+arp_timer_init(struct glue_ctx *ctx, unsigned socket_id)
+{
+	struct tle_timer_wheel_args twprm;
+
+	/* zero first so no field is ever passed down uninitialised */
+	memset(&twprm, 0, sizeof(twprm));
+	twprm.tick_size = 1000U;
+	twprm.max_timer = MAX_ARP_ENTRY + 8;
+	twprm.socket_id = socket_id;
+
+	ctx->arp_tmw = tle_timer_create(&twprm,
+		arp_get_tms(ctx->cycles_ms_shift));
+	if (ctx->arp_tmw == NULL)
+		rte_panic("Failed to init timer wheel for ARP");
+}
+
+/*
+ * Initialise the glue-level (non-TLDK) part of a context: ARP tables,
+ * port/queue ids, addresses read from the environment, the cycles-to-ms
+ * shift, the ARP hash tables and timer wheel, and the IP reassembly table.
+ * Every allocation failure panics.
+ *
+ * Must be called after nb_ctx has been incremented for this context, since
+ * queue_id is derived from it. The order of the steps matters: the ARP hash
+ * names use queue_id and the ARP timer uses cycles_ms_shift.
+ */
+static void
+glue_ctx_init(struct glue_ctx *ctx, uint32_t socket_id)
+{
+ uint64_t ms;
+
+ ctx->arp4 = rte_zmalloc_socket(NULL,
+ sizeof(struct arp_entry) * MAX_ARP_ENTRY,
+ RTE_CACHE_LINE_SIZE, socket_id);
+ ctx->arp6 = rte_zmalloc_socket(NULL,
+ sizeof(struct arp_entry) * MAX_ARP_ENTRY,
+ RTE_CACHE_LINE_SIZE, socket_id);
+ if (!ctx->arp4 || !ctx->arp6)
+ rte_panic("Failed to allocate arp table");
+
+ /* single port; each context uses the queue matching its index */
+ ctx->port_id = 0;
+ ctx->queue_id = nb_ctx - 1;
+ ctx->ipv4 = get_ip();
+ ctx->ipv4_ml = get_ip_mask();
+ ctx->ipv4_gw.s_addr = get_ip_gate();
+ ctx->lo4_enabled = lo4_enabled();
+ /* get_ipv6()/get_ipv6_gate() return static storage, hence the copies */
+ rte_memcpy(&ctx->ipv6, get_ipv6(), sizeof(struct in6_addr));
+ ctx->ipv6_ml = get_ipv6_mask();
+ rte_memcpy(&ctx->ipv6_gw, get_ipv6_gate(), sizeof(struct in6_addr));
+ ctx->lo6_enabled = lo6_enabled();
+
+ /* calculate closest shift to convert from cycles to ms (approximate) */
+ /* ms = cycles per millisecond (rounded up); the shift is the index of
+ * its highest set bit, i.e. floor(log2(ms)).
+ */
+ ms = (rte_get_tsc_hz() + MS_PER_S - 1) / MS_PER_S;
+ ctx->cycles_ms_shift = sizeof(ms) * CHAR_BIT - __builtin_clzll(ms) - 1;
+
+ arp_hash_init(ctx, socket_id);
+ arp_timer_init(ctx, socket_id);
+ ctx->arp_wait = NULL;
+
+ /* fragments are kept for at most rte_get_tsc_hz() cycles (one second) */
+ ctx->frag_tbl = rte_ip_frag_table_create(FRAG_BUCKET,
+ FRAG_ENTRIES_PER_BUCKET,
+ FRAG_BUCKET * FRAG_ENTRIES_PER_BUCKET,
+ rte_get_tsc_hz(),
+ socket_id);
+ if (ctx->frag_tbl == NULL)
+ rte_panic("Failed to create ip defrag table");
+
+ /* point the calling thread's MIB at this context's counters */
+ PERCPU_MIB = &ctx->mib;
+}
+
+/* Allocation sequence state, protected by ctx_lock:
+ * 0 - nothing allocated yet;
+ * 1 - the default context exists but has not been handed to a caller;
+ * 2 - the default context has been handed out, further calls create
+ * new contexts.
+ */
+static int ctx_seq;
+static rte_spinlock_t ctx_lock = RTE_SPINLOCK_INITIALIZER;
+
+/*
+ * Allocate (or hand out) a glue context and return its index in ctx_array.
+ *
+ * The first call creates and fully initialises ctx_array[0]. The second call
+ * returns that same default context without creating anything. Every later
+ * call creates a new context, reconfigures the port for the new number of
+ * queues, and panics once MAX_NB_CTX contexts exist. Only the default context
+ * gets the loopback destination templates and the loopback port.
+ *
+ * The whole function runs under ctx_lock.
+ */
+uint8_t
+glue_ctx_alloc(void)
+{
+ uint32_t socket_id;
+ struct glue_ctx *ctx;
+
+ /* fix me: we need a finer-grained lock */
+ rte_spinlock_lock(&ctx_lock);
+
+ GLUE_LOG(INFO, "allocate ctx: %d", ctx_seq);
+ if (ctx_seq == 0)
+ /* Called from constructor init() */
+ ctx_seq = 1;
+ else if (ctx_seq == 1) {
+ /* Called from first epoll_create() or poll() */
+ ctx_seq = 2;
+ ctx = default_ctx;
+ goto unlock;
+ }
+
+ if (nb_ctx >= MAX_NB_CTX)
+ rte_panic("Exceed the max number of ctx");
+
+ /* nb_ctx is bumped before glue_ctx_init(), which derives queue_id from it */
+ ctx = &ctx_array[nb_ctx++];
+ GLUE_LOG(INFO, "%u ctx allocated, and will init", nb_ctx);
+
+ socket_id = get_socket_id();
+
+ glue_ctx_init(ctx, socket_id);
+
+ /* reconfigure the "physical" port whenever # of ctx changes */
+ port_reconfig();
+
+ if (ctx == default_ctx) {
+ loopback_dst_init(ctx);
+
+ ctx->lb_port_id = create_loopback(socket_id);
+ GLUE_LOG(INFO, "loopback port_id: %u", ctx->lb_port_id);
+ }
+
+ rte_eth_macaddr_get(ctx->port_id, &ctx->mac);
+
+ tle_ctx_init(ctx, socket_id);
+
+unlock:
+ rte_spinlock_unlock(&ctx_lock);
+ /* index of the context within ctx_array, narrowed to uint8_t */
+ return ctx - ctx_array;
+}
+
+/*
+ * Give a context back.  Only one case is supported: exactly one context
+ * exists and it is currently handed out; it is then marked as available
+ * again so the next glue_ctx_alloc() call returns it.  Every other
+ * situation panics.  The ctx argument itself is not used.
+ */
+void
+glue_ctx_free(struct glue_ctx *ctx __rte_unused)
+{
+	if (nb_ctx != 1 || ctx_seq != 2)
+		rte_panic("close epoll fd on running is not supported\n");
+
+	GLUE_LOG(INFO, "free ctx");
+	ctx_seq = 1;
+}
+
+/*
+ * Find the context that owns a (port, queue) pair.  Port 1 is treated as
+ * the loopback port and always maps to the default context.  Returns NULL
+ * when no allocated context matches.
+ */
+struct glue_ctx *
+glue_ctx_lookup(uint16_t port_id, uint16_t queue_id)
+{
+	struct glue_ctx *cur;
+	struct glue_ctx *end = ctx_array + nb_ctx;
+
+	if (port_id == 1) /* loopback */
+		return default_ctx;
+
+	for (cur = ctx_array; cur < end; cur++) {
+		if (cur->port_id != port_id)
+			continue;
+		if (cur->queue_id == queue_id)
+			return cur;
+	}
+
+	return NULL;
+}
diff --git a/lib/libtle_glue/ctx.h b/lib/libtle_glue/ctx.h
new file mode 100644
index 0000000..e78b68f
--- /dev/null
+++ b/lib/libtle_glue/ctx.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _TLE_GLUE_SOCK_H_
+#define _TLE_GLUE_SOCK_H_
+
+#include <stdbool.h>
+#include <pthread.h>
+
+#include <rte_memzone.h>
+#include <rte_mempool.h>
+#include <rte_ether.h>
+#include <rte_ip_frag.h>
+
+#include <tle_ctx.h>
+#include <tle_event.h>
+#include <tle_stats.h>
+
+#include <sys/queue.h>
+
+#include "config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Names of the environment variables that configure the addresses of the
+ * stack, each followed by the default (*_DEF) used when the variable is
+ * unset. The *_MASK values are prefix lengths in decimal.
+ */
+#define DPDK_IP "DPDK_IP"
+#define DPDK_IP_DEF "0.0.0.0"
+#define DPDK_IP_MASK "DPDK_IP_MASK"
+#define DPDK_IP_MASK_DEF "16"
+#define DPDK_IP_GATEWAY "DPDK_IP_GATEWAY"
+#define DPDK_IP_GATEWAY_DEF "0.0.0.0"
+#define DPDK_IPV6 "DPDK_IPV6"
+#define DPDK_IPV6_DEF "::"
+#define DPDK_IPV6_MASK "DPDK_IPV6_MASK"
+#define DPDK_IPV6_MASK_DEF "64"
+#define DPDK_IPV6_GATEWAY "DPDK_IPV6_GATEWAY"
+#define DPDK_IPV6_GATEWAY_DEF "::"
+
+/* One slot of a per-context neighbour (ARP/NDP) table. */
+struct arp_entry {
+ /* destination template (L2/L3 header, MTU, mempool) for this neighbour */
+ struct tle_dest dst;
+ /* NOTE(review): presumably non-zero while the slot is occupied - confirm in arp.c */
+ uint8_t inuse;
+ /* NOTE(review): presumably counts requests sent so far (cf. ARP_MAX_REQ_TIMES) - confirm */
+ uint8_t req_time;
+ /* NOTE(review): presumably the handle of the entry's timer in arp_tmw - confirm */
+ void* timer;
+};
+
+/*
+ * Per-context state of the glue layer. Instances live in ctx_array and are
+ * created by glue_ctx_alloc().
+ */
+struct glue_ctx {
+ /* TLDK contexts and devices; the lb_* devices are only created for the
+ * default context.
+ */
+ struct tle_ctx *tcp_ctx;
+ struct tle_dev *tcp_dev;
+ struct tle_dev *lb_tcp_dev;
+ struct tle_ctx *udp_ctx;
+ struct tle_dev *udp_dev;
+ struct tle_dev *lb_udp_dev;
+
+ /* event queues: error, receive and transmit */
+ struct tle_evq *ereq;
+ struct tle_evq *rxeq;
+ struct tle_evq *txeq;
+
+ /* ethdev port/queue served by this context, and the loopback port */
+ uint16_t port_id;
+ uint16_t queue_id;
+ uint16_t lb_port_id;
+
+ /* prefix lengths of the IPv4 and IPv6 addresses */
+ struct {
+ uint8_t ipv4_ml;
+ uint8_t ipv6_ml;
+ };
+
+ /* MAC address of port_id */
+ struct ether_addr mac;
+ /* singly linked list (via next_pkt) of mbufs waiting for address resolution */
+ struct rte_mbuf *arp_wait;
+ struct tle_timer_wheel *arp_tmw;
+ uint32_t cycles_ms_shift; /* to convert from cycles to ms */
+
+ /* IPv4 configuration and neighbour table */
+ struct {
+ /* local address as produced by inet_aton() (network byte order) */
+ uint32_t ipv4;
+ struct in_addr ipv4_gw;
+ bool lo4_enabled;
+
+ uint32_t arp4_num;
+ /* array of MAX_ARP_ENTRY entries */
+ struct arp_entry *arp4;
+ struct rte_hash *arp_hash;
+ };
+
+ /* IPv6 configuration and neighbour table */
+ struct {
+ struct in6_addr ipv6;
+ struct in6_addr ipv6_gw;
+ bool lo6_enabled;
+
+ uint32_t arp6_num;
+ /* array of MAX_ARP_ENTRY entries */
+ struct arp_entry *arp6;
+ struct rte_hash *arp6_hash;
+ };
+
+ /* IP reassembly state */
+ struct {
+ rte_spinlock_t frag_lock;
+ struct rte_ip_frag_tbl *frag_tbl;
+ struct rte_ip_frag_death_row frag_dr;
+ };
+
+ /* destination templates for IPv4 and IPv6 loopback traffic */
+ struct tle_dest lb_dst;
+ struct tle_dest lb_dst_v6;
+
+ /* statistics counters of this context */
+ struct tle_mib mib;
+} __rte_cache_aligned;
+
+/* Defined in ctx.c: number of contexts in use, the default (first) context,
+ * and the backing array that context ids index into.
+ */
+extern int nb_ctx;
+extern struct glue_ctx *default_ctx;
+extern struct glue_ctx ctx_array[MAX_NB_CTX];
+
+/* Per-lcore pointer to the context bound to the calling thread. */
+RTE_DECLARE_PER_LCORE(struct glue_ctx *, glue_ctx);
+
+/*
+ * Context of the calling thread: the per-lcore binding when one has been
+ * set, the default context otherwise.
+ */
+static inline struct glue_ctx *
+get_ctx(void)
+{
+	struct glue_ctx *bound = RTE_PER_LCORE(glue_ctx);
+
+	return bound ? bound : default_ctx;
+}
+
+/* Id of the calling thread's context, i.e. its index within ctx_array. */
+static inline uint8_t
+get_cid(void)
+{
+	struct glue_ctx *cur = get_ctx();
+
+	return cur - ctx_array;
+}
+
+/* Allocate or hand out a context; returns its index in ctx_array. */
+uint8_t glue_ctx_alloc(void);
+
+/* Context owning (port_id, queue_id), or NULL when there is none. */
+struct glue_ctx * glue_ctx_lookup(uint16_t port_id, uint16_t queue_id);
+
+/* Release a context obtained from glue_ctx_alloc(). */
+void glue_ctx_free(struct glue_ctx *ctx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _TLE_GLUE_SOCK_H_ */
diff --git a/lib/libtle_glue/epoll.c b/lib/libtle_glue/epoll.c
new file mode 100644
index 0000000..1c8751b
--- /dev/null
+++ b/lib/libtle_glue/epoll.c
@@ -0,0 +1,577 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <errno.h>
+
+#include <rte_common.h>
+#include <rte_spinlock.h>
+#include <rte_malloc.h>
+#include <rte_ethdev.h>
+#include <rte_atomic.h>
+#include <rte_eal_interrupts.h>
+
+#include "fd.h"
+#include "ctx.h"
+#include "sym.h"
+#include "log.h"
+#include "util.h"
+#include "sock.h"
+#include "internal.h"
+#include "tle_glue.h"
+#include "../libtle_l4p/udp_stream.h"
+#include "../libtle_l4p/tcp_stream.h"
+
+/* epoll user data that marks the rx-queue interrupt fd, so that it can be
+ * told apart from the fds registered by the application.
+ */
+#define EPOLL_DATA_SPECIAL 0xFFFFFFFFFFFFFF01
+
+/* We don't use rte_eth_dev_rx_intr_ctl_q as it has its
+ * own way to specify event.data
+ */
+/*
+ * Apply epoll operation `op` for the interrupt fd of rx queue
+ * (port_id, queue_id) on the epoll instance efd. The fd is registered
+ * edge-triggered with EPOLL_DATA_SPECIAL as its user data.
+ *
+ * When rx is non-zero the fd is read once first to clear a pending
+ * notification.
+ *
+ * Returns -ENODEV for an invalid port, -EINVAL for an invalid queue,
+ * -ENOTSUP when the device has no interrupt handle, -EPERM when it has no
+ * interrupt vector, otherwise the result of k_epoll_ctl().
+ */
+static int
+dev_rx_intr_ctl_q(uint16_t port_id, uint16_t queue_id, int efd, int op, int rx)
+{
+ int fd, ret;
+ uint32_t vec, efd_idx;
+ struct rte_eth_dev *dev;
+ struct rte_intr_handle *intr_handle;
+ static struct epoll_event ev = {
+ .events = EPOLLIN | EPOLLPRI | EPOLLET,
+ .data = {
+ .u64 = EPOLL_DATA_SPECIAL,
+ },
+ };
+ char buf[32];
+
+ RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV);
+
+ dev = &rte_eth_devices[port_id];
+ if (queue_id >= dev->data->nb_rx_queues)
+ return -EINVAL;
+
+ if (!dev->intr_handle)
+ return -ENOTSUP;
+
+ intr_handle = dev->intr_handle;
+ if (!intr_handle->intr_vec)
+ return -EPERM;
+
+ vec = intr_handle->intr_vec[queue_id];
+
+ /* vectors at or above RTE_INTR_VEC_RXTX_OFFSET are stored with that offset */
+ efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ?
+ (vec - RTE_INTR_VEC_RXTX_OFFSET) : vec;
+
+ fd = intr_handle->efds[efd_idx];
+
+ if (rx) {
+ /* almost all devices use eventfd, we shall read out */
+ /* the result is deliberately ignored: this is best effort */
+ ret = read(fd, buf, sizeof(uint64_t));
+ RTE_SET_USED(ret);
+ }
+
+ return k_epoll_ctl(efd, op, fd, &ev);
+}
+
+/*
+ * epoll_create() replacement.
+ *
+ * Before the userspace fd table is initialised the call goes straight to
+ * the kernel.  Afterwards a userspace fd is taken, a context is obtained
+ * from glue_ctx_alloc(), and a kernel "shadow" epoll fd is created that
+ * watches the rx-queue interrupt of that context (and later any kernel
+ * fds added through epoll_ctl()).
+ *
+ * Returns the new epoll fd, or -1 with errno = EMFILE when no userspace
+ * fd is left.  Failures while setting up the shadow fd panic.
+ */
+int
+PRE(epoll_create)(int size)
+{
+	int epfd;
+	struct sock *ep;
+
+	if (!fd_table_initialized)
+		return k_epoll_create(size);
+
+	epfd = get_unused_fd();
+	if (epfd == -1) {
+		errno = EMFILE;
+		return -1;
+	}
+
+	ep = fd2sock(epfd);
+	ep->cid = glue_ctx_alloc();
+
+	ep->shadow_efd = k_epoll_create(1);
+	if (ep->shadow_efd < 0)
+		rte_panic("Failed to create shadow efd");
+
+	if (dev_rx_intr_ctl_q(CTX(ep)->port_id, CTX(ep)->queue_id,
+			ep->shadow_efd, RTE_INTR_EVENT_ADD, 0) < 0)
+		rte_panic("Failed to epoll_ctl rxq interrupt fd");
+
+	/* mark the slot as an epoll instance */
+	ep->epoll = 1;
+
+	return epfd;
+}
+
+/*
+ * epoll_create1() replacement: forwards to the epoll_create() replacement
+ * with a size of 1. The flags argument is ignored, so EPOLL_CLOEXEC has no
+ * effect and invalid flags are not rejected.
+ */
+int
+PRE(epoll_create1)(int flags __rte_unused)
+{
+ return PRE(epoll_create)(1);
+}
+
+/*
+ * epoll_ctl() replacement.
+ *
+ * - kernel epfd: only kernel fds may be added (anything else panics); the
+ *   call is passed to the kernel unchanged.
+ * - userspace epfd + kernel fd: the fd is handled by the shadow kernel
+ *   epoll fd of the epoll instance.
+ * - userspace epfd + userspace fd: the interest is recorded in the
+ *   socket's `event` field.  Both must belong to the same context,
+ *   otherwise the function panics.
+ *
+ * Returns 0 on success, -1 with errno set to EEXIST (ADD on a registered
+ * fd), ENOENT (MOD/DEL on an unregistered fd), EFAULT (NULL event for
+ * ADD/MOD) or EINVAL (unknown op).
+ */
+int
+PRE(epoll_ctl)(int epfd, int op, int fd, struct epoll_event *event)
+{
+	struct sock *so_ep;
+	struct sock *so;
+
+	if (is_kernel_fd(epfd)) {
+		if (!is_kernel_fd(fd))
+			rte_panic("kernel epoll (%d) on an userspace fd: %d",
+				epfd, fd);
+
+		return k_epoll_ctl(epfd, op, fd, event);
+	}
+
+	so_ep = fd2sock(epfd);
+
+	if (is_kernel_fd(fd)) {
+		/* Use a shadow epoll fd for possible kernel I/O events. */
+		return k_epoll_ctl(so_ep->shadow_efd, op, fd, event);
+	}
+
+	so = fd2sock(fd);
+
+	if (unlikely(so->cid != so_ep->cid))
+		rte_panic("Different ctx %d and %d for epoll fd and socket fd",
+			so_ep->cid, so->cid);
+
+	/* EPOLL_CTL_DEL ignores `event` and callers may pass NULL for it;
+	 * every other operation needs a readable event.
+	 */
+	if (unlikely(event == NULL && op != EPOLL_CTL_DEL)) {
+		errno = EFAULT;
+		return -1;
+	}
+
+	GLUE_DEBUG("epoll_ctl: op = %x, fd = %d, event = %x",
+		op, fd, event != NULL ? event->events : 0);
+	switch (op) {
+	case EPOLL_CTL_ADD:
+		/* a non-zero event mask means "already registered" */
+		if (so->event.events) {
+			errno = EEXIST;
+			return -1;
+		}
+
+#ifdef LOOK_ASIDE_BACKEND
+		if (event->events & EPOLLIN)
+			tle_event_active(&so->rxev, TLE_SEV_DOWN);
+		if (event->events & EPOLLOUT)
+			tle_event_active(&so->txev, TLE_SEV_DOWN);
+#endif
+		so->event = *event;
+
+		break;
+	case EPOLL_CTL_MOD:
+		if (so->event.events == 0) {
+			errno = ENOENT;
+			return -1;
+		}
+
+#ifdef LOOK_ASIDE_BACKEND
+		if (event->events & EPOLLIN)
+			tle_event_active(&so->rxev, TLE_SEV_DOWN);
+		else
+			tle_event_idle(&so->rxev);
+		if (event->events & EPOLLOUT)
+			tle_event_active(&so->txev, TLE_SEV_DOWN);
+		else
+			tle_event_idle(&so->txev);
+#endif
+		so->event = *event;
+		break;
+	case EPOLL_CTL_DEL:
+		if (so->event.events == 0) {
+			errno = ENOENT;
+			return -1;
+		}
+
+#ifdef LOOK_ASIDE_BACKEND
+		if (so->event.events & EPOLLIN)
+			tle_event_idle(&so->rxev);
+		if (so->event.events & EPOLLOUT)
+			tle_event_idle(&so->txev);
+#endif
+		so->event.events = 0;
+		break;
+	default:
+		errno = EINVAL;
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Take up to num armed events off evq whose socket is interested in `event`.
+ *
+ * evq: event queue to scan.
+ * evd: output array; receives the `data` pointer (a struct sock *) of
+ * every event taken.
+ * num: capacity of evd.
+ * event: epoll event type this queue stands for (EPOLLIN, EPOLLOUT or
+ * EPOLLHUP).
+ *
+ * An event is taken when its socket has a non-zero registered mask that
+ * contains `event`; EPOLLHUP is always treated as registered. Events that do
+ * not match stay armed. Returns the number of entries written to evd.
+ */
+static inline int32_t
+tle_evq_fetch(struct tle_evq *evq, const void *evd[],
+ uint32_t num, uint32_t event)
+{
+ uint32_t i, k;
+ uint32_t polled;
+ struct tle_event *ev;
+ struct tle_event *next;
+
+ /* unlocked fast path: nothing armed */
+ if (evq->nb_armed == 0)
+ return 0;
+
+ rte_compiler_barrier();
+
+ rte_spinlock_lock(&evq->lock);
+ ev = TAILQ_FIRST(&evq->armed);
+ for (i = 0, k = 0; i != evq->nb_armed; i++) {
+ /* fetch the successor first: ev may be unlinked below */
+ next = TAILQ_NEXT(ev, ql);
+ polled = ((const struct sock *)ev->data)->event.events;
+ /* Always report EPOLLHUP, see man epoll_ctl(2) */
+ if (polled && ((polled | EPOLLHUP) & event)) {
+ evd[k++] = ev->data;
+ TAILQ_REMOVE(&evq->armed, ev, ql);
+ /* don't down erev; and assign NULL to data means this
+ * ev is already removed from the queue, refer to
+ * tle_event_idle_err().
+ */
+ if (event != EPOLLHUP)
+ ev->state = TLE_SEV_DOWN;
+ else
+ ev->data = NULL;
+ }
+ if (k == num)
+ break;
+ ev = next;
+ }
+ /* nb_armed is only adjusted once, after the scan, so the loop bound above
+ * stays the original queue length
+ */
+ evq->nb_armed -= k;
+ rte_spinlock_unlock(&evq->lock);
+ return k;
+}
+
+/*
+ * Convert armed events of queue q into epoll_event records.
+ *
+ * q: event queue to drain.
+ * event: epoll event type the queue stands for (EPOLLOUT, EPOLLIN or
+ * EPOLLHUP).
+ * events: output array with room for maxevents records.
+ * maxevents: capacity of events; it also sizes a variable-length array on
+ * the stack, so it must be greater than zero.
+ *
+ * Each record carries the user data registered for the socket. Events of the
+ * same socket that are pending on queues drained later are merged into the
+ * record instead of being reported separately. Returns the number of records
+ * written.
+ */
+static int
+evq_drain(struct tle_evq *q, uint32_t event,
+ struct epoll_event *events, int maxevents)
+{
+ uint32_t i, n;
+ struct sock *socks[maxevents];
+
+ n = tle_evq_fetch(q, (const void **)(uintptr_t)socks, maxevents, event);
+ for (i = 0; i < n; ++i) {
+ events[i].events = event;
+ events[i].data = socks[i]->event.data;
+
+ /* when EPOLLHUP happens, also return EPOLLIN and EPOLLOUT
+ * if they are registered. So as to emulate behaviour of linux
+ * kernel.
+ * Some applications (e.g. redis) need these events to determine
+ * following works.
+ */
+ if (event & EPOLLHUP)
+ events[i].events |= (socks[i]->event.events &
+ (EPOLLIN | EPOLLOUT));
+
+ /* if multiple events of single socket are triggered,
+ * return single event with multiple event types rather than
+ * multiple events.
+ *
+ * we drain evq in order of EPOLLOUT -> EPOLLIN -> EPOLLHUP,
+ * so only need to check event in evq that has not been drained.
+ */
+ switch (event) {
+ case EPOLLOUT:
+ if ((socks[i]->event.events & EPOLLIN) &&
+ tle_event_state(&socks[i]->rxev) == TLE_SEV_UP) {
+ tle_event_down(&socks[i]->rxev);
+ events[i].events |= EPOLLIN;
+ }
+ /* fallthrough */
+ case EPOLLIN:
+ if (tle_event_state(&socks[i]->erev) == TLE_SEV_UP) {
+ /* re-check under the queue lock before unlinking erev by hand;
+ * data == NULL marks it as already removed
+ */
+ rte_spinlock_lock(&socks[i]->erev.head->lock);
+ if (socks[i]->erev.data != NULL &&
+ tle_event_state(&socks[i]->erev) == TLE_SEV_UP) {
+ TAILQ_REMOVE(&socks[i]->erev.head->armed,
+ &socks[i]->erev, ql);
+ socks[i]->erev.head->nb_armed--;
+ socks[i]->erev.data = NULL;
+ }
+ rte_spinlock_unlock(&socks[i]->erev.head->lock);
+ events[i].events |= EPOLLHUP;
+ }
+ }
+
+ /* note: logs the queue's event type, not the merged events[i].events */
+ GLUE_DEBUG("event for fd = %d, event = %x",
+ socks[i]->event.data.fd, event);
+ }
+ return n;
+}
+
+#ifdef LOOK_ASIDE_BACKEND
+/* NOTE(review): not used in this function; presumably read by
+ * sleep_with_lock()/wake_lookaside_backend() - confirm.
+ */
+rte_atomic32_t flag_sleep;
+
+/*
+ * Look-aside backend variant: enable the rx interrupt, run one backend
+ * iteration and, only if it did no work, sleep on efd for up to `timeout`.
+ *
+ * events, maxevents and rx are unused; in particular *rx is never written.
+ * Kernel events are not reported in this mode, so the result is always 0.
+ */
+int
+epoll_kernel_wait(struct glue_ctx *ctx, int efd,
+ struct epoll_event *events,
+ int maxevents, int timeout, int *rx)
+{
+ struct epoll_event event;
+ uint16_t port_id = ctx->port_id;
+ uint16_t queue_id = ctx->queue_id;
+
+ RTE_SET_USED(events);
+ RTE_SET_USED(maxevents);
+ RTE_SET_USED(rx);
+
+ rte_eth_dev_rx_intr_enable(port_id, queue_id);
+
+ /* TODO: timeout shall be limited by the latest tcp timer */
+
+ if (be_process(ctx) > 0) /* use this way to avoid concurrency */ {
+ /* Do nothing */
+ } else
+ sleep_with_lock(efd, &event, 1, timeout);
+
+ rte_eth_dev_rx_intr_disable(port_id, queue_id);
+ /* We don't have kernel events for report, so just return zero */
+ return 0;
+}
+#else
+/*
+ * Sleep in the kernel until an rx interrupt, a kernel I/O event or the
+ * timeout occurs.
+ *
+ * ctx:       context whose rx queue interrupt is armed while sleeping.
+ * efd:       kernel epoll fd to wait on, or -1 to use a temporary one
+ *            that only watches the rx queue.
+ * events:    output array for kernel I/O events (maxevents entries).
+ * timeout:   epoll timeout in ms, passed on unchanged.
+ * rx:        set to non-zero when packets may be pending (rx interrupt
+ *            seen, or the backend did work instead of sleeping), else 0.
+ *
+ * The rx interrupt is reported by the kernel like any other event, so one
+ * extra slot is requested for it and it is filtered out of the result.
+ * Returns the number of kernel events stored in `events`, 0 if there are
+ * none, or a negative value on error.
+ */
+int
+epoll_kernel_wait(struct glue_ctx *ctx, int efd,
+	struct epoll_event *events,
+	int maxevents, int timeout, int *rx)
+{
+	int i, rc;
+	int flag_tmp = 0;
+	uint16_t port_id = ctx->port_id;
+	uint16_t queue_id = ctx->queue_id;
+#define LEAST_EVENTS 8
+	struct epoll_event s_events[LEAST_EVENTS];
+	struct epoll_event *r_events;
+	int r_maxevents;
+	int fastpath = 0;
+
+	*rx = 0;
+
+	if (efd == -1) {
+		flag_tmp = 1;
+		efd = k_epoll_create(1);
+		if (efd < 0)
+			rte_panic("Failed to create tmp efd");
+	}
+
+	if (stopped) {
+		rc = k_epoll_pwait(efd, events, maxevents, timeout, NULL);
+		goto check;
+	}
+
+	/* Small caller buffers are replaced by a local one with room for the
+	 * extra rxq event (maxevents + 1 <= LEAST_EVENTS).
+	 */
+	if (maxevents < LEAST_EVENTS) {
+		r_events = s_events;
+		r_maxevents = maxevents + 1;
+	} else {
+		r_events = events;
+		r_maxevents = maxevents;
+	}
+
+	if (flag_tmp &&
+	    dev_rx_intr_ctl_q(port_id, queue_id, efd, RTE_INTR_EVENT_ADD, 0) < 0)
+		/* TODO: fall back to busy polling */
+		rte_panic("Failed to enable rxq interrupt");
+
+	rte_eth_dev_rx_intr_enable(port_id, queue_id);
+
+	/* TODO: timeout shall be limited by the latest tcp timer */
+
+	if (timeout != 0 && be_process(ctx) > 0) {
+		/* use this way to avoid concurrency */
+		rc = 0;
+		fastpath = 1;
+	} else
+		rc = sleep_with_lock(efd, r_events, r_maxevents, timeout);
+
+	rte_eth_dev_rx_intr_disable(port_id, queue_id);
+
+	/* filter out rxq event */
+	for (i = 0; i < rc; ++i) {
+		if (r_events[i].data.u64 != EPOLL_DATA_SPECIAL)
+			continue;
+
+		*rx = true;
+		/* close the gap; source and destination overlap, so this
+		 * has to be memmove() rather than memcpy()
+		 */
+		if (i + 1 < rc)
+			memmove(&r_events[i], &r_events[i + 1],
+				(rc - i - 1) * sizeof(*r_events));
+		rc -= 1;
+		break;
+	}
+
+	/* One more event than the caller asked for was requested above. If
+	 * none of them was the rxq event, the surplus one cannot be handed
+	 * over without overrunning `events`.
+	 */
+	if (rc > maxevents)
+		rc = maxevents;
+
+	if (rc > 0 && maxevents < LEAST_EVENTS)
+		memcpy(events, r_events, rc * sizeof(*events));
+
+	if (flag_tmp)
+		dev_rx_intr_ctl_q(port_id, queue_id, efd,
+			RTE_INTR_EVENT_DEL, *rx);
+
+	if (fastpath)
+		*rx = true;
+check:
+	if (flag_tmp)
+		close(efd);
+
+	return rc;
+}
+#endif
+
+/* As long as there are packets to process we don't sleep; we poll for a
+ * number of idle iterations before going to sleep.
+ *
+ * TODO: change to wait for a period of time?
+ */
+#define IDLE_ITERATIONS 5
+
+/*
+ * Common wait loop behind the epoll/poll replacements.
+ *
+ * ctx: context whose event queues are drained.
+ * events: output array with room for maxevents records.
+ * timeout: 0 returns after the polling phase; any other value allows
+ * sleeping in epoll_kernel_wait().
+ * shadow_efd: kernel epoll fd to sleep on (passed to epoll_kernel_wait()).
+ *
+ * Returns the number of records written to events, 0 when timeout is 0 and
+ * nothing is ready, or a negative value reported by epoll_kernel_wait().
+ *
+ * NOTE(review): when epoll_kernel_wait() returns 0 the loop starts over, so a
+ * finite non-zero timeout does not appear to end the call while no event
+ * becomes ready - confirm against sleep_with_lock(). The rx output of
+ * epoll_kernel_wait() is not used here.
+ */
+int
+poll_common(struct glue_ctx *ctx, struct epoll_event *events,
+ int maxevents, int timeout, int shadow_efd)
+{
+ int rx;
+ int total = 0;
+ int idle = IDLE_ITERATIONS;
+
+again:
+ /* We will start with send, then recv, and last err queue, as we want
+ * to serve existing connections first, then new connections, and
+ * lastly, the failed connections.
+ */
+
+ /* 0. send evq */
+ total += evq_drain(ctx->txeq, EPOLLOUT,
+ events + total, maxevents-total);
+ if (total == maxevents)
+ return total;
+
+ /* 1. recv evq */
+ total += evq_drain(ctx->rxeq, EPOLLIN,
+ events + total, maxevents-total);
+ if (total == maxevents)
+ return total;
+
+ /* 2. err evq */
+ total += evq_drain(ctx->ereq, EPOLLHUP,
+ events + total, maxevents-total);
+
+ if (total > 0)
+ return total;
+
+ /* Nothing ready: run the backend. The idle budget is refilled whenever
+ * the backend did some work.
+ */
+ if (idle > 0) {
+ if (be_process(ctx) == 0)
+ idle--;
+ else
+ idle = IDLE_ITERATIONS;
+ goto again;
+ }
+
+ if (timeout == 0)
+ return 0;
+
+ /* Setup rxq interrupt mode, and check kernel I/O events */
+ total = epoll_kernel_wait(ctx, shadow_efd, events,
+ maxevents, timeout, &rx);
+
+ /* Kernel I/O events are available (total > 0) or
+ * an error was reported (total < 0).
+ */
+ if (total != 0)
+ return total;
+
+ /* Check userspace I/O events */
+ idle = IDLE_ITERATIONS;
+ be_process(ctx);
+ goto again;
+}
+
+/*
+ * epoll_wait() replacement.
+ *
+ * A kernel epfd is passed to the kernel unchanged.  For a userspace epfd
+ * the calling thread is bound to the context of the epoll instance (first
+ * call only) and the common wait loop is run.
+ *
+ * Returns the number of ready events, 0 when nothing is ready, or -1 with
+ * errno = EINVAL when maxevents is not greater than zero.
+ */
+int
+PRE(epoll_wait)(int epfd, struct epoll_event *events,
+	int maxevents, int timeout)
+{
+	struct sock *so;
+
+	if (is_kernel_fd(epfd))
+		return k_epoll_pwait(epfd, events, maxevents, timeout, NULL);
+
+	/* Same contract as the kernel.  It also keeps a zero or negative
+	 * count away from the variable-length array in evq_drain().
+	 */
+	if (maxevents <= 0) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	so = fd2sock(epfd);
+
+	/* thread <> context binding happens here */
+	if (RTE_PER_LCORE(glue_ctx) == NULL)
+		RTE_PER_LCORE(glue_ctx) = CTX(so);
+
+	return poll_common(CTX(so), events, maxevents, timeout, so->shadow_efd);
+}
+
+/*
+ * epoll_pwait() replacement.  Signal masks are not supported: a non-NULL
+ * sigmask panics.  Otherwise this is the epoll_wait() replacement.
+ */
+int
+PRE(epoll_pwait)(int epfd, struct epoll_event *events,
+	int maxevents, int timeout, const sigset_t *sigmask)
+{
+	if (sigmask != NULL) {
+		rte_panic("epoll_pwait with signal is not supported");
+	}
+
+	/* Go through PRE() like epoll_create1() does, so that the glue
+	 * implementation is reached whatever prefix PRE() applies.
+	 */
+	return PRE(epoll_wait)(epfd, events, maxevents, timeout);
+}
+
+/*
+ * Report which of the requested events are currently ready on a userspace
+ * fd, without waiting.
+ *
+ * fd: userspace socket fd.
+ * events: mask of EPOLLIN/EPOLLOUT the caller is interested in.
+ *
+ * Returns a mask of ready events. EPOLLHUP (and EPOLLERR for a closed TCP
+ * stream) can be set even when not requested; 0 means nothing is ready.
+ */
+int
+fd_ready(int fd, int events)
+{
+ int ret = 0;
+ struct sock *so = fd2sock(fd);
+
+ /* no stream attached to the socket */
+ if (unlikely(!so->s)) {
+ if (tle_event_state(&so->erev) == TLE_SEV_UP)
+ /* socket has been shutdown */
+ return events | EPOLLHUP;
+ else /* socket is not set up yet */
+ return 0;
+ }
+
+ /* a closed TCP stream reports everything requested plus HUP and ERR */
+ if (unlikely(IS_TCP(so) &&
+ TCP_STREAM(so->s)->tcb.state == TCP_ST_CLOSED)) {
+ return events | EPOLLHUP | EPOLLERR;
+ }
+
+ if (tle_event_state(&so->erev) == TLE_SEV_UP)
+ ret |= EPOLLHUP;
+
+ /* readable: leftover data on the socket (rx_left) or a non-empty rx ring */
+ if (events & EPOLLIN) {
+ if (so->rx_left ||
+ (IS_TCP(so) && rte_ring_count(TCP_STREAM(so->s)->rx.q) > 0) ||
+ (IS_UDP(so) && rte_ring_count(UDP_STREAM(so->s)->rx.q) > 0))
+ ret |= EPOLLIN;
+ }
+
+ /* writable: TCP state at or beyond TCP_ST_ESTABLISHED with free space in
+ * the tx ring, or a non-empty tx.drb.r ring for UDP
+ */
+ if (events & EPOLLOUT) {
+ if ((IS_TCP(so) &&
+ TCP_STREAM(so->s)->tcb.state >= TCP_ST_ESTABLISHED &&
+ rte_ring_free_count(TCP_STREAM(so->s)->tx.q) > 0) ||
+ (IS_UDP(so) &&
+ rte_ring_count(UDP_STREAM(so->s)->tx.drb.r) > 0))
+ ret |= EPOLLOUT;
+ }
+
+ return ret;
+}
+
+/*
+ * Collect the SNMP counters of the whole stack into mibs[].
+ *
+ * The array is first loaded with default_mib and then the counters of
+ * every allocated context are added: the TCP_MIB_MAX TCP counters come
+ * first, directly followed by the UDP_MIB_MAX UDP counters.  mibs must be
+ * at least sizeof(default_mib) bytes large.
+ */
+void
+v_get_stats_snmp(unsigned long mibs[])
+{
+	int c, t, u;
+	unsigned long *udp_out = mibs + TCP_MIB_MAX;
+
+	memcpy(mibs, &default_mib, sizeof(default_mib));
+
+	for (c = 0; c < nb_ctx; ++c) {
+		for (t = 0; t < TCP_MIB_MAX; ++t)
+			mibs[t] += ctx_array[c].mib.tcp.mibs[t];
+
+		for (u = 0; u < UDP_MIB_MAX; ++u)
+			udp_out[u] += ctx_array[c].mib.udp.mibs[u];
+	}
+}
diff --git a/lib/libtle_glue/fd.c b/lib/libtle_glue/fd.c
new file mode 100644
index 0000000..cc855f9
--- /dev/null
+++ b/lib/libtle_glue/fd.c
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <string.h>
+
+#include "fd.h"
+#include "log.h"
+#include "util.h"
+#include "config.h"
+
+bool fd_table_initialized;
+
+struct fd_table fd_table = { .fd_base = INT_MAX, };
+
+/*
+ * Return the soft RLIMIT_NOFILE limit as an int.
+ *
+ * Falls back to GLUE_BASE_FD when getrlimit() fails. rlim_cur is an rlim_t
+ * and can be RLIM_INFINITY or simply larger than INT_MAX; narrowing such a
+ * value to int would produce a negative/garbage limit, so it is clamped to
+ * INT_MAX instead.
+ */
+static int
+get_ulimit_nofile(void)
+{
+	struct rlimit rlim;
+
+#define GLUE_BASE_FD 1024
+	if (getrlimit(RLIMIT_NOFILE, &rlim) < 0)
+		return GLUE_BASE_FD;
+
+	/* soft limit, rlim_max is the hard limit */
+	if (rlim.rlim_cur == RLIM_INFINITY || rlim.rlim_cur > (rlim_t)INT_MAX)
+		return INT_MAX;
+
+	return (int)rlim.rlim_cur;
+}
+
+/*
+ * Pick the fd range owned by the glue layer: half of the RLIMIT_NOFILE
+ * soft limit, capped at MAX_STREAMS_PER_CORE * 2 * MAX_NB_CTX, placed at
+ * the top of the fd space. Results are stored through fd_base/fd_num.
+ */
+static void
+fd_num_set(int *fd_base, int *fd_num)
+{
+	const int nofile = get_ulimit_nofile();
+	int count = nofile / 2;
+
+	/* fix me: alignment of power of two */
+	/* fix me: use dup2 to occupy these fds */
+	count = RTE_MIN(MAX_STREAMS_PER_CORE * 2 * MAX_NB_CTX, count);
+
+	*fd_num = count;
+	*fd_base = nofile - count;
+	GLUE_LOG(INFO, "fd_base = %d, fd_num = %d", *fd_base, *fd_num);
+}
+
+/*
+ * Mempool object callback: give the sock at index obj_idx its fd
+ * (obj_idx + fd_base) and record it in the fd table.
+ */
+static void
+add_fd(struct rte_mempool *mp __rte_unused, void *opaque __rte_unused,
+	void *obj, unsigned obj_idx)
+{
+	struct sock *so = obj;
+
+	so->fd = obj_idx + fd_table.fd_base;
+	fd_table.socks[obj_idx] = so;
+}
+
+/*
+ * Build the global fd table.
+ *
+ * Allocates the fd -> sock lookup array, creates the dynamic mempool the
+ * socks are taken from, publishes both in fd_table and finally populates
+ * the pool (add_fd() reads fd_table while objects are added, so the table
+ * must be filled in first). Any failure ends in rte_panic().
+ */
+void
+fd_init(void)
+{
+	int ret;
+	size_t sz;
+	uint32_t socket_id;
+	int fd_base, fd_num;
+	struct rte_mempool *mp = NULL;
+	char name[RTE_MEMPOOL_NAMESIZE];
+
+	socket_id = get_socket_id();
+
+	fd_num_set(&fd_base, &fd_num);
+
+	sz = sizeof(fd_table.socks[0]) * fd_num;
+	fd_table.socks = rte_zmalloc_socket("fdtable", sz,
+			RTE_CACHE_LINE_SIZE, socket_id);
+	if (fd_table.socks == NULL) {
+		GLUE_LOG(ERR, "Failed to malloc fd table");
+		goto err;
+	}
+
+	snprintf(name, RTE_MEMPOOL_NAMESIZE, "mp_fd_%d_%d", fd_base, fd_num);
+	mp = rte_mempool_create_empty(name, fd_num - 1, sizeof(struct sock),
+			32, 0, socket_id, MEMPOOL_F_DYNAMIC);
+	if (mp == NULL) {
+		GLUE_LOG(ERR, "Failed to create mp for fd table");
+		goto err;
+	}
+
+	/* sizeof yields a size_t, which needs %zu rather than %lu */
+	GLUE_LOG(INFO, "sizeof(struct sock): %zu, elt_size of fd table = %u",
+		sizeof(struct sock), mp->elt_size);
+
+	ret = rte_mempool_set_ops_byname(mp, "ring_mp_mc", NULL);
+	if (ret != 0) {
+		GLUE_LOG(ERR, "Failed to set mp ops: %d", ret);
+		goto err;
+	}
+
+	rte_mempool_set_dynamic_size(mp, 1024);
+	rte_mempool_set_dynamic_cb(mp, add_fd);
+
+	fd_table.mp = mp;
+	fd_table.fd_base = fd_base;
+	fd_table.fd_num = fd_num;
+
+	/* should populate after fd_table is set */
+	ret = rte_mempool_populate_default(mp);
+	if (ret < 0) {
+		GLUE_LOG(ERR, "Failed to populate mp: %d", ret);
+		goto err;
+	}
+
+	fd_table_initialized = true;
+
+	return;
+err:
+	rte_mempool_free(mp);
+	rte_panic("Failed to init fd_table");
+}
diff --git a/lib/libtle_glue/fd.h b/lib/libtle_glue/fd.h
new file mode 100644
index 0000000..d0ac4fe
--- /dev/null
+++ b/lib/libtle_glue/fd.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _TLE_GLUE_FD_H_
+#define _TLE_GLUE_FD_H_
+
+#include <stdbool.h>
+#include <sys/epoll.h>
+#include <fcntl.h>
+
+#include <rte_mempool.h>
+#include <rte_malloc.h>
+
+#include <tle_event.h>
+#include <tle_ctx.h>
+#include <tle_tcp.h>
+
+#include "log.h"
+#include "sock.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct fd_table {
+ int fd_base; /* The minimum fd, 64 aligned */
+ int fd_num; /* The number of fds, 64 aligned */
+ struct rte_mempool *mp; /* O(1) get and put */
+ struct sock **socks; /* indexed by fd - fd_base */
+};
+
+extern bool fd_table_initialized;
+extern struct fd_table fd_table;
+
+/* Look up the sock for a glue fd; the table is indexed by fd - fd_base. */
+static inline struct sock *
+fd2sock(int fd)
+{
+	const int slot = fd - fd_table.fd_base;
+
+	return fd_table.socks[slot];
+}
+
+/* Return the fd stored in the sock. */
+static inline int
+sock2fd(struct sock *so)
+{
+ return so->fd;
+}
+
+/*
+ * Take a sock from the fd mempool, mark it valid and return its fd.
+ * Returns -1 (after logging) when the pool has no object left.
+ */
+static inline int
+get_unused_fd(void)
+{
+	struct sock *so;
+	int rc = rte_mempool_get(fd_table.mp, (void **)&so);
+
+	if (unlikely(rc < 0)) {
+		GLUE_LOG(ERR, "FDs have been exhausted");
+		return -1;
+	}
+
+	so->valid = 1;
+	return so->fd;
+}
+
+/*
+ * Put an event into the TLE_SEV_IDLE state.
+ *
+ * Nothing is done when the event is already idle. Otherwise, with the
+ * owning queue's lock held, an event that is UP and carries data is first
+ * unlinked from the queue's armed list, then the state is set to idle.
+ */
+static inline void
+tle_event_idle_err(struct tle_event *ev)
+{
+ struct tle_evq *q;
+
+ if (ev->state == TLE_SEV_IDLE)
+ return;
+
+ q = ev->head;
+ rte_compiler_barrier();
+
+ rte_spinlock_lock(&q->lock);
+ /* only UP events with data are counted in the armed list */
+ if (ev->state == TLE_SEV_UP && ev->data) {
+ TAILQ_REMOVE(&q->armed, ev, ql);
+ q->nb_armed--;
+ }
+ ev->state = TLE_SEV_IDLE;
+ rte_spinlock_unlock(&q->lock);
+}
+
+/* Hand the sock behind fd back to the fd mempool. */
+static inline void
+put_free_fd(int fd)
+{
+	rte_mempool_put(fd_table.mp, fd2sock(fd));
+}
+
+/* True when fd lies below fd_table.fd_base, i.e. outside the glue fd table. */
+static inline bool
+is_kernel_fd(int fd)
+{
+ return fd < fd_table.fd_base;
+}
+
+void fd_init(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _TLE_GLUE_FD_H_ */
diff --git a/lib/libtle_glue/gateway.h b/lib/libtle_glue/gateway.h
new file mode 100644
index 0000000..29de6b1
--- /dev/null
+++ b/lib/libtle_glue/gateway.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2019 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _TLE_GATEWAY_H_
+#define _TLE_GATEWAY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * True when addr is the context's own IPv4 address or INADDR_LOOPBACK.
+ * addr is compared in network byte order.
+ */
+static inline bool
+is_ipv4_loopback_addr(in_addr_t addr, struct glue_ctx *ctx)
+{
+	return addr == ctx->ipv4 || addr == htonl(INADDR_LOOPBACK);
+}
+
+/*
+ * True when addr is the context's own IPv6 address, the IPv6 loopback
+ * address, or a v4-compatible / v4-mapped address whose last 32 bits are
+ * INADDR_LOOPBACK.
+ */
+static inline bool
+is_ipv6_loopback_addr(const struct in6_addr *addr, struct glue_ctx *ctx)
+{
+	if (memcmp(addr, &ctx->ipv6, sizeof(struct in6_addr)) == 0)
+		return true;
+
+	if (IN6_IS_ADDR_LOOPBACK(addr))
+		return true;
+
+	if (IN6_IS_ADDR_V4COMPAT(addr) || IN6_IS_ADDR_V4MAPPED(addr))
+		return addr->__in6_u.__u6_addr32[3] == htonl(INADDR_LOOPBACK);
+
+	return false;
+}
+
+/*
+ * Pick the next hop for an IPv4 destination.
+ *
+ * data is the glue context. The destination itself is returned when it is
+ * a loopback/own address or shares the context's prefix (ctx->ipv4 with
+ * ctx->ipv4_ml leading bits); otherwise the configured gateway is
+ * returned, or the destination again when no gateway is set.
+ *
+ * The prefix test is done with a netmask built in network byte order, so
+ * it is correct for prefix lengths that are not a multiple of 8 and never
+ * shifts by the full word width.
+ */
+static inline const struct in_addr *
+ipv4_gateway_lookup(void *data, const struct in_addr *addr)
+{
+	struct glue_ctx *ctx = data;
+	uint32_t ml, mask;
+
+	if (is_ipv4_loopback_addr(addr->s_addr, ctx))
+		return addr;
+
+	ml = ctx->ipv4_ml;
+	if (ml > 32)
+		ml = 32;
+
+	/*
+	 * NOTE(review): a prefix length of 0 is treated as "no on-link
+	 * prefix configured", so such traffic goes to the gateway. Confirm
+	 * whether /0 should instead mean that everything is on-link.
+	 */
+	if (ml != 0) {
+		mask = htonl(~(uint32_t)0 << (32 - ml));
+		if (((addr->s_addr ^ ctx->ipv4) & mask) == 0)
+			return addr;
+	}
+
+	if (ctx->ipv4_gw.s_addr != 0)
+		return &ctx->ipv4_gw;
+
+	return addr;
+}
+
+/*
+ * Pick the next hop for an IPv6 destination.
+ *
+ * data is the glue context. The destination itself is returned when it is
+ * a loopback/own address or shares the context's prefix (ctx->ipv6 with
+ * ctx->ipv6_ml leading bits); otherwise the configured gateway is
+ * returned, or the destination again when the gateway is unspecified.
+ *
+ * The prefix is compared byte by byte (whole bytes with memcmp, the
+ * remaining bits with a mask), which avoids unaligned 64-bit loads,
+ * full-width shifts and host byte order dependence.
+ */
+static inline const struct in6_addr *
+ipv6_gateway_lookup(void *data, const struct in6_addr *addr)
+{
+	struct glue_ctx *ctx = data;
+	const uint8_t *dst = (const uint8_t *)addr;
+	const uint8_t *own = (const uint8_t *)&ctx->ipv6;
+	unsigned int ml, nbytes, nbits;
+	uint8_t mask;
+
+	if (is_ipv6_loopback_addr(addr, ctx))
+		return addr;
+
+	ml = ctx->ipv6_ml;
+	if (ml > 128)
+		ml = 128;
+
+	/*
+	 * NOTE(review): a prefix length of 0 is treated as "no on-link
+	 * prefix configured", so such traffic goes to the gateway. Confirm
+	 * whether /0 should instead mean that everything is on-link.
+	 */
+	if (ml != 0) {
+		nbytes = ml / 8;
+		nbits = ml % 8;
+		if (memcmp(dst, own, nbytes) == 0) {
+			/* nbits == 0 also covers ml == 128 (no byte left) */
+			if (nbits == 0)
+				return addr;
+			mask = (uint8_t)(0xff << (8 - nbits));
+			if (((dst[nbytes] ^ own[nbytes]) & mask) == 0)
+				return addr;
+		}
+	}
+
+	if (!IN6_IS_ADDR_UNSPECIFIED(&ctx->ipv6_gw))
+		return &ctx->ipv6_gw;
+
+	return addr;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _TLE_GATEWAY_H_ */
diff --git a/lib/libtle_glue/icmp.c b/lib/libtle_glue/icmp.c
new file mode 100644
index 0000000..aba1c4b
--- /dev/null
+++ b/lib/libtle_glue/icmp.c
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <time.h>
+#include <netinet/icmp6.h>
+
+#include <rte_common.h>
+#include <rte_byteorder.h>
+#include <rte_ethdev.h>
+#include <rte_icmp.h>
+#include <rte_ip.h>
+
+#include "log.h"
+#include "ctx.h"
+#include "internal.h"
+
+#define ICMP_ECHOREPLY 0 /* Echo Reply */
+#define ICMP_ECHO 8 /* Echo Request */
+#define ICMP_TIMESTAMP 13 /* Timestamp Request */
+#define ICMP_TIMESTAMPREPLY 14 /* Timestamp Reply */
+
+/* Codes for TIME_EXCEEDED. */
+#define ICMP_EXC_TTL 0 /* TTL count exceeded */
+#define ICMP_EXC_FRAGTIME 1 /* Fragment Reass time exceeded */
+
+/* Parameters used to convert the timespec values */
+#define SECONDS_PER_DAY 86400L
+#define MSEC_PER_SEC 1000L
+#define USEC_PER_MSEC 1000L
+#define NSEC_PER_USEC 1000L
+#define NSEC_PER_MSEC (NSEC_PER_USEC * USEC_PER_MSEC)
+
+#define IS_IPV4_BCAST(x) ((x) == (uint32_t)0xFFFFFFFF)
+
+/*
+ * ICMP header followed by three 32-bit time words. icmp_recv() overlays
+ * this on a timestamp request and fills times[1] and times[2].
+ */
+struct icmp_pkt {
+ struct icmp_hdr icmp_h;
+ uint32_t times[3];
+};
+
+/* Remainder of ``dividend / divisor``, narrowed to 32 bits. */
+static inline uint32_t
+div_uint64_rem(uint64_t dividend, uint32_t divisor)
+{
+	const uint64_t rem = dividend % divisor;
+
+	return (uint32_t)rem;
+}
+
+/*
+ * Milliseconds elapsed since midnight (UTC), taken from CLOCK_REALTIME
+ * and returned in network byte order.
+ */
+static uint32_t
+current_timestamp(void)
+{
+	struct timespec now;
+	uint32_t day_secs;
+	uint32_t ms;
+
+	(void)clock_gettime(CLOCK_REALTIME, &now);
+
+	/* seconds into the current day, then scaled to milliseconds */
+	day_secs = div_uint64_rem(now.tv_sec, SECONDS_PER_DAY);
+	ms = day_secs * MSEC_PER_SEC +
+		(uint32_t)now.tv_nsec / NSEC_PER_MSEC;
+
+	return rte_cpu_to_be_32(ms);
+}
+
+/*
+ * Checksum an ICMP message made of the ICMP header plus data_len bytes of
+ * payload. The caller must have zeroed the checksum field beforehand.
+ */
+static uint16_t
+icmp_cksum(const struct icmp_hdr *icmp, uint32_t data_len)
+{
+	const uint16_t sum =
+		rte_raw_cksum(icmp, sizeof(struct icmp_hdr) + data_len);
+
+	if (sum == 0xffff)
+		return sum;
+
+	return ~sum;
+}
+
+/**
+ * Receive and handle an ICMP packet.
+ *
+ * Echo requests and timestamp requests (code 0) that are not addressed to
+ * a broadcast/multicast destination are turned into the matching reply in
+ * place and transmitted on the context's port/queue. Everything else is
+ * dropped. The mbuf is always consumed: it is either handed to the NIC or
+ * freed here.
+ *
+ * @param ctx
+ *   The pointer to the glue context.
+ * @param pkt
+ *   The pointer to the raw packet data.
+ * @param l2_len
+ *   The size of the l2 header.
+ * @param l3_len
+ *   The size of the l3 (IPv4) header.
+ * @return
+ *   MUST return NULL now. :-)
+ */
+struct rte_mbuf *
+icmp_recv(struct glue_ctx *ctx, struct rte_mbuf *pkt,
+	uint32_t l2_len, uint32_t l3_len)
+{
+	struct ether_addr eth_addr;
+	struct icmp_pkt *icmp_pkt;
+	struct ether_hdr *eth_h;
+	struct icmp_hdr *icmp_h;
+	struct ipv4_hdr *ip_h;
+	uint32_t ip_addr;
+	uint32_t cksum;
+	uint32_t icmp_off = l2_len + l3_len;
+	uint16_t pad_len;
+	char *pad;
+
+	/* the ICMP header must be present before any field of it is read */
+	if (rte_pktmbuf_data_len(pkt) < icmp_off + sizeof(struct icmp_hdr))
+		goto drop_pkt;
+
+	eth_h = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+	ip_h = (struct ipv4_hdr *) ((char *)eth_h + l2_len);
+
+	icmp_h = (struct icmp_hdr *)((char *)ip_h + l3_len);
+	if (icmp_h->icmp_type != IP_ICMP_ECHO_REQUEST &&
+			icmp_h->icmp_type != ICMP_TIMESTAMP)
+		goto drop_pkt;
+
+	icmp_pkt = (struct icmp_pkt *)icmp_h;
+
+	ether_addr_copy(&eth_h->s_addr, &eth_addr);
+	ether_addr_copy(&eth_h->d_addr, &eth_h->s_addr);
+	ether_addr_copy(&eth_addr, &eth_h->d_addr);
+
+	/*
+	 * Similar to Linux implementation, we silently drop the broadcast or
+	 * multicast ICMP packets.
+	 *
+	 * RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be
+	 *  silently ignored.
+	 * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently
+	 *  discarded if to broadcast/multicast.
+	 */
+	ip_addr = rte_be_to_cpu_32(ip_h->dst_addr);
+	if (IS_IPV4_MCAST(ip_addr) || IS_IPV4_BCAST(ip_addr))
+		goto drop_pkt;
+
+	ip_addr = ip_h->src_addr;
+	ip_h->src_addr = ip_h->dst_addr;
+	ip_h->dst_addr = ip_addr;
+
+	if (icmp_h->icmp_type == IP_ICMP_ECHO_REQUEST &&
+			icmp_h->icmp_code == 0) {
+
+		/* Must clear checksum field before calling the helper. */
+		ip_h->hdr_checksum = 0;
+		ip_h->hdr_checksum = rte_ipv4_cksum(ip_h);
+
+		icmp_h->icmp_type = IP_ICMP_ECHO_REPLY;
+		icmp_h->icmp_code = 0;
+
+		/*
+		 * Fix me: the data part of an ICMP echo request/reply
+		 * message is implementation specific, we don't know
+		 * how to verify or calculate the checksum.
+		 *
+		 * Need to see BSD or LINUX implementation.
+		 *
+		 * For now the checksum is adjusted incrementally for the
+		 * type change only; the payload is not touched.
+		 */
+		cksum = ~icmp_h->icmp_cksum & 0xffff;
+		cksum += ~rte_cpu_to_be_16(IP_ICMP_ECHO_REQUEST << 8) & 0xffff;
+		cksum += rte_cpu_to_be_16(IP_ICMP_ECHO_REPLY << 8);
+		cksum = (cksum & 0xffff) + (cksum >> 16);
+		cksum = (cksum & 0xffff) + (cksum >> 16);
+		icmp_h->icmp_cksum = ~cksum;
+
+	} else if (icmp_h->icmp_type == ICMP_TIMESTAMP &&
+			icmp_h->icmp_code == 0) {
+
+		/* the three time words are written below, so they must fit */
+		if (rte_pktmbuf_data_len(pkt) <
+				icmp_off + sizeof(struct icmp_pkt))
+			goto drop_pkt;
+
+		/*
+		 * RFC 1122: 3.2.2.8 MAY implement ICMP timestamp requests.
+		 * SHOULD be in the kernel for minimum random latency.
+		 * MUST be accurate to a few minutes.
+		 * MUST be updated at least at 15Hz.
+		 */
+		icmp_h->icmp_type = ICMP_TIMESTAMPREPLY;
+		icmp_h->icmp_code = 0;
+		icmp_pkt->times[1] = current_timestamp();
+		icmp_pkt->times[2] = icmp_pkt->times[1];
+
+		icmp_h->icmp_cksum = 0;
+		/* the data part of an ICMP timestamp reply is 12 bytes. */
+		icmp_h->icmp_cksum = icmp_cksum(icmp_h, 12);
+	} else
+		goto drop_pkt;
+
+	if (pkt->pkt_len < ETHER_MIN_LEN) {
+		pad_len = ETHER_MIN_LEN - pkt->pkt_len;
+		pad = rte_pktmbuf_append(pkt, pad_len);
+		/* do not send stale buffer contents as padding */
+		if (pad != NULL)
+			memset(pad, 0, pad_len);
+	}
+
+	/* a packet the NIC did not accept is still ours and must be freed */
+	if (rte_eth_tx_burst(ctx->port_id, ctx->queue_id, &pkt, 1) == 0)
+		goto drop_pkt;
+
+	GLUE_LOG(DEBUG, "Send ICMP echo reply OK");
+
+	return NULL;
+
+drop_pkt:
+	rte_pktmbuf_free(pkt);
+	return NULL;
+}
+
+/**
+ * Receive and handle an ICMPv6 packet.
+ *
+ * Neighbor solicitations/advertisements (code 0) are passed to
+ * ndp_recv(). An echo request addressed to the context's own IPv6 address
+ * is turned into an echo reply in place and transmitted; every other
+ * message is dropped. Apart from the NDP hand-off the mbuf is always
+ * consumed here: it is either handed to the NIC or freed.
+ *
+ * @param ctx
+ *   The pointer to the glue context.
+ * @param pkt
+ *   The pointer to the raw packet data.
+ * @param l2_len
+ *   The size of the l2 header.
+ * @param l3_len
+ *   The size of the l3 (IPv6) header.
+ * @return
+ *   NULL for everything handled here; NDP packets return whatever
+ *   ndp_recv() returns.
+ */
+struct rte_mbuf *
+icmp6_recv(struct glue_ctx *ctx, struct rte_mbuf *pkt,
+	uint32_t l2_len, uint32_t l3_len)
+{
+	struct ether_addr eth_addr;
+	struct ether_hdr *eth_h;
+	struct icmp6_hdr *icmp6_h;
+	struct ipv6_hdr *ipv6_h;
+	struct in6_addr ipv6_addr;
+	uint32_t cksum;
+	uint16_t pad_len;
+	char *pad;
+
+	/* the ICMPv6 header must be present before any field of it is read */
+	if (rte_pktmbuf_data_len(pkt) <
+			l2_len + l3_len + sizeof(struct icmp6_hdr))
+		goto drop_pkt;
+
+	eth_h = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+	ipv6_h = (struct ipv6_hdr *) ((char *)eth_h + l2_len);
+
+	icmp6_h = (struct icmp6_hdr *)((char *)ipv6_h + l3_len);
+
+	/* NDP pkt */
+	if ((icmp6_h->icmp6_type == ND_NEIGHBOR_SOLICIT ||
+			icmp6_h->icmp6_type == ND_NEIGHBOR_ADVERT) &&
+			icmp6_h->icmp6_code == 0)
+		return ndp_recv(ctx, pkt, l2_len, l3_len);
+
+	/* only support ECHO now, other types of pkts are dropped */
+	if ((icmp6_h->icmp6_type != ICMP6_ECHO_REQUEST &&
+			icmp6_h->icmp6_type != ICMP6_ECHO_REPLY) ||
+			icmp6_h->icmp6_code != 0)
+		goto drop_pkt;
+
+	ether_addr_copy(&eth_h->s_addr, &eth_addr);
+	ether_addr_copy(&eth_h->d_addr, &eth_h->s_addr);
+	ether_addr_copy(&eth_addr, &eth_h->d_addr);
+
+	/*
+	 * Now, we silently drop the anycast or multicast ICMP packets.
+	 * But it does not conform to RFC 4443. Maybe fix it later.
+	 *
+	 * RFC 4443: 4.2 An Echo Reply SHOULD be sent in response to an
+	 * Echo Request message sent to an IPv6 multicast or anycast address.
+	 * In this case, the source address of the reply MUST be a unicast
+	 * address belonging to the interface on which the Echo Request
+	 * message was received.
+	 */
+	switch (icmp6_h->icmp6_type) {
+	case ICMP6_ECHO_REQUEST:
+		if (memcmp(ipv6_h->dst_addr, &ctx->ipv6,
+				sizeof(struct in6_addr)) != 0)
+			goto drop_pkt;
+
+		rte_memcpy(&ipv6_addr, ipv6_h->src_addr,
+			sizeof(struct in6_addr));
+		rte_memcpy(ipv6_h->src_addr, ipv6_h->dst_addr,
+			sizeof(struct in6_addr));
+		rte_memcpy(ipv6_h->dst_addr, &ipv6_addr,
+			sizeof(struct in6_addr));
+
+		icmp6_h->icmp6_type = ICMP6_ECHO_REPLY;
+
+		/* adjust the checksum incrementally for the type change */
+		cksum = ~icmp6_h->icmp6_cksum & 0xffff;
+		cksum += ~rte_cpu_to_be_16(ICMP6_ECHO_REQUEST << 8) & 0xffff;
+		cksum += rte_cpu_to_be_16(ICMP6_ECHO_REPLY << 8);
+		cksum = (cksum & 0xffff) + (cksum >> 16);
+		cksum = (cksum & 0xffff) + (cksum >> 16);
+		icmp6_h->icmp6_cksum = ~cksum;
+
+		break;
+	default:
+		goto drop_pkt;
+	}
+
+	if (pkt->pkt_len < ETHER_MIN_LEN) {
+		pad_len = ETHER_MIN_LEN - pkt->pkt_len;
+		pad = rte_pktmbuf_append(pkt, pad_len);
+		/* do not send stale buffer contents as padding */
+		if (pad != NULL)
+			memset(pad, 0, pad_len);
+	}
+
+	/* a packet the NIC did not accept is still ours and must be freed */
+	if (rte_eth_tx_burst(ctx->port_id, ctx->queue_id, &pkt, 1) == 0)
+		goto drop_pkt;
+
+	GLUE_LOG(DEBUG, "Send ICMP echo reply OK");
+
+	return NULL;
+
+drop_pkt:
+	rte_pktmbuf_free(pkt);
+	return NULL;
+}
diff --git a/lib/libtle_glue/init.c b/lib/libtle_glue/init.c
new file mode 100644
index 0000000..d845ef8
--- /dev/null
+++ b/lib/libtle_glue/init.c
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <sched.h>
+#include <pthread.h>
+#include <stdlib.h>
+
+#include <rte_compat.h>
+#include <rte_common.h>
+#include <rte_debug.h>
+#include <rte_eal.h>
+
+#include "util.h"
+#include "fd.h"
+#include "ctx.h"
+#include "sym.h"
+#include "log.h"
+#include "internal.h"
+#include "tle_glue.h"
+
+/*
+ * First init stage: bring up the DPDK EAL with the given arguments, then
+ * build the fd table. A failing rte_eal_init() ends in rte_panic().
+ */
+void
+glue_init1(int argc, char **argv)
+{
+	int rc;
+
+	GLUE_LOG(INFO, "init: DPDK and fd table...");
+
+	rc = rte_eal_init(argc, argv);
+	if (rc < 0)
+		rte_panic("Failed to init DPDK");
+
+	fd_init();
+}
+
+/*
+ * Library constructor: builds an EAL command line from the environment
+ * and initializes the glue layer.
+ *
+ * Nothing happens unless at least one of DPDK_PARAMS, DPDK_NO_HUGE or
+ * DPDK_VNIC is set. The argument vector is made of the program name, a
+ * "-l <cpu>" pair for the first CPU of the calling thread's affinity mask
+ * (0 when the mask cannot be read), the space separated words of
+ * DPDK_PARAMS, "-m 2048 --no-huge" when DPDK_NO_HUGE is set, the
+ * DPDK_VNIC value plus "--no-pci" when DPDK_VNIC is set, and a final "--".
+ * The thread affinity saved at entry is restored at the end.
+ */
+static void __attribute__((constructor(1000)))
+glue_init(void)
+{
+	char *p;
+	char *params_copy, *saveptr = NULL;
+	int i, err, argc = 0;
+	char **argv = NULL, **argv_to_release = NULL;
+	char *vnic, *params, *no_huge;
+	cpu_set_t cpuset;
+	pthread_t tid = pthread_self();
+
+	symbol_init();
+
+#define DPDK_PARAMS "DPDK_PARAMS"
+	params = getenv(DPDK_PARAMS);
+#define DPDK_NO_HUGE "DPDK_NO_HUGE"
+	no_huge = getenv(DPDK_NO_HUGE);
+#define DPDK_VNIC "DPDK_VNIC"
+	vnic = getenv(DPDK_VNIC);
+
+	if (params == NULL && no_huge == NULL && vnic == NULL)
+		return;
+
+	argv = grow_argv(argv, argc, 1);
+	argv[argc++] = xstrdup("userspace-stack");
+
+	/* Get the main thread affinity */
+	CPU_ZERO(&cpuset);
+	err = pthread_getaffinity_np(tid, sizeof(cpu_set_t), &cpuset);
+	if (!err) {
+		/* only the first CPU of the mask is passed to the EAL */
+		for (i = 0; i < CPU_SETSIZE; i++) {
+			if (CPU_ISSET(i, &cpuset)) {
+				argv = grow_argv(argv, argc, 2);
+				argv[argc++] = xstrdup("-l");
+				argv[argc++] = xasprintf("%d", i);
+				break;
+			}
+		}
+	} else {
+		argv = grow_argv(argv, argc, 2);
+		argv[argc++] = xstrdup("-l");
+		argv[argc++] = xasprintf("0");
+	}
+
+	if (params) {
+		/*
+		 * Tokenize a private copy: the tokenizer writes NUL bytes
+		 * into its input, and the string returned by getenv() belongs
+		 * to the environment and must not be modified.
+		 */
+		params_copy = strdup(params);
+		if (params_copy == NULL)
+			rte_panic("Failed to copy DPDK_PARAMS");
+
+		for (p = strtok_r(params_copy, " ", &saveptr); p != NULL;
+				p = strtok_r(NULL, " ", &saveptr)) {
+			argv = grow_argv(argv, argc, 1);
+			argv[argc++] = xstrdup(p);
+		}
+
+		free(params_copy);
+	}
+
+	if (no_huge) {
+		argv = grow_argv(argv, argc, 3);
+		argv[argc++] = xstrdup("-m");
+		argv[argc++] = xstrdup("2048");
+		argv[argc++] = xstrdup("--no-huge");
+	}
+
+	if (vnic) {
+		argv = grow_argv(argv, argc, 2);
+		argv[argc++] = xstrdup(vnic);
+		argv[argc++] = xstrdup("--no-pci");
+	}
+
+	argv = grow_argv(argv, argc, 1);
+	argv[argc++] = xstrdup("--");
+
+	/* keep the original pointers in a second array for the release below */
+	argv_to_release = grow_argv(argv_to_release, 0, argc);
+	for (i = 0; i < argc; ++i)
+		argv_to_release[i] = argv[i];
+
+	glue_init1(argc, argv);
+
+	/* Alloc and setup this default ctx for any sockets operations before
+	 * thread/ctx binding which happens when epoll_wait.
+	 */
+	glue_ctx_alloc();
+
+	release_argv(argc, argv_to_release, argv);
+
+	/* Set back the affinity */
+	err = pthread_setaffinity_np(tid, sizeof(cpu_set_t), &cpuset);
+	if (err)
+		GLUE_LOG(ERR, "Failed to set back affinity");
+}
+
+/*
+ * Library destructor: kill every valid TCP stream found in the fd table,
+ * then keep running be_process() on each context until it returns 0.
+ */
+static void __attribute__((destructor))
+glue_uninit(void)
+{
+	const int first = fd_table.fd_base;
+	const int last = first + fd_table.fd_num;
+	struct glue_ctx *ctx;
+	struct sock *so;
+	int fd, i;
+
+	/* TODO: lets optimize it */
+	for (fd = first; fd < last; fd++) {
+		so = fd2sock(fd);
+		if (so && so->valid && IS_TCP(so))
+			tle_tcp_stream_kill(so->s);
+	}
+
+	for (i = 0; i < nb_ctx; ++i) {
+		ctx = glue_ctx_lookup(0, i);
+		while (be_process(ctx))
+			; /* empty */
+	}
+}
diff --git a/lib/libtle_glue/internal.h b/lib/libtle_glue/internal.h
new file mode 100644
index 0000000..91fe784
--- /dev/null
+++ b/lib/libtle_glue/internal.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _TLE_GLUE_INTERNAL_H_
+#define _TLE_GLUE_INTERNAL_H_
+
+#include <rte_mbuf.h>
+#include <rte_atomic.h>
+
+#include <tle_ctx.h>
+
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <sys/epoll.h>
+
+#include "ctx.h"
+#include "sym.h"
+#include <rte_mempool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int stopped;
+
+extern uint64_t rx_offload;
+extern uint64_t tx_offload;
+
+void port_reconfig(void);
+
+uint16_t create_loopback(uint32_t socket_id);
+
+struct rte_mempool * get_mempool_by_socket(int32_t socket_id);
+
+int be_process(struct glue_ctx *ctx);
+
+int be_tx(struct glue_ctx *ctx);
+
+struct rte_mbuf * arp_recv(struct glue_ctx *ctx,
+ struct rte_mbuf *m, uint32_t l2len);
+
+struct rte_mbuf * ndp_recv(struct glue_ctx *ctx,
+ struct rte_mbuf *m, uint32_t l2len, uint32_t l3len);
+
+
+void mac_check(struct glue_ctx *ctx, const struct sockaddr* addr);
+
+int arp_ipv4_dst_lookup(void *data, const struct in_addr *addr,
+ struct tle_dest *res, int proto);
+
+int arp_ipv6_dst_lookup(void *data, const struct in6_addr *addr,
+ struct tle_dest *res, int proto);
+
+int mac_fill(struct glue_ctx *ctx, struct rte_mbuf *m);
+
+void mac_timeout(struct glue_ctx *ctx);
+
+int setup_rx_cb(uint16_t port_id, uint16_t qid);
+
+int epoll_kernel_wait(struct glue_ctx *ctx, int efd,
+ struct epoll_event *events,
+ int maxevents, int timeout, int *rx);
+
+int poll_common(struct glue_ctx *ctx, struct epoll_event *events,
+ int maxevents, int timeout, int shadow_efd);
+
+int dev_rxq_wakeup(uint16_t port_id);
+
+struct rte_mbuf * icmp_recv(struct glue_ctx *ctx, struct rte_mbuf *pkt,
+ uint32_t l2len, uint32_t l3len);
+
+struct rte_mbuf * icmp6_recv(struct glue_ctx *ctx, struct rte_mbuf *pkt,
+ uint32_t l2len, uint32_t l3len);
+
+uint16_t typen_rx_callback(uint16_t port, uint16_t queue,
+ struct rte_mbuf *pkt[], uint16_t nb_pkts,
+ uint16_t max_pkts, void *user_param);
+
+void ipv4_dst_add(struct glue_ctx *ctx, const struct in_addr *addr,
+ struct ether_addr *e_addr);
+
+void ipv6_dst_add(struct glue_ctx *ctx, const struct in6_addr *addr,
+ struct ether_addr *e_addr);
+
+#ifdef LOOK_ASIDE_BACKEND
+extern rte_atomic32_t flag_sleep;
+
+/* Values stored in flag_sleep. */
+enum {
+ IOTHREAD_BUSY = 0, /* io thread is busy */
+ IOTHREAD_SLEEP, /* io thread is sleeping */
+ IOTHREAD_PREEMPT, /* io thread is preempted by another worker thread */
+};
+
+/*
+ * Block in k_epoll_pwait() while advertising IOTHREAD_SLEEP in flag_sleep.
+ *
+ * After the wait returns, spin until flag_sleep can be switched from
+ * IOTHREAD_SLEEP back to IOTHREAD_BUSY; the switch fails for as long as
+ * the flag holds another value (be_tx_with_lock() sets IOTHREAD_PREEMPT
+ * while it transmits). Returns the k_epoll_pwait() result.
+ */
+static inline int
+sleep_with_lock(int efd, struct epoll_event *events, int max, int to)
+{
+ int rc;
+
+ rte_atomic32_set(&flag_sleep, IOTHREAD_SLEEP);
+ rc = k_epoll_pwait(efd, events, max, to, NULL);
+ while (rte_atomic32_cmpset((volatile uint32_t *)&flag_sleep,
+ IOTHREAD_SLEEP, IOTHREAD_BUSY) == 0);
+
+ return rc;
+}
+
+/*
+ * Run be_tx() on ctx until it stops returning a positive value, but only
+ * if flag_sleep could be moved from IOTHREAD_SLEEP to IOTHREAD_PREEMPT.
+ * The flag is put back to IOTHREAD_SLEEP afterwards.
+ */
+static inline void
+be_tx_with_lock(struct glue_ctx *ctx)
+{
+ if (rte_atomic32_cmpset((volatile uint32_t *)&flag_sleep,
+ IOTHREAD_SLEEP, IOTHREAD_PREEMPT)) {
+ while (be_tx(ctx) > 0) {};
+ rte_atomic32_set(&flag_sleep, IOTHREAD_SLEEP);
+ }
+}
+
+/* Call dev_rxq_wakeup() on the context's port when flag_sleep reads IOTHREAD_PREEMPT. */
+static inline void
+wake_lookaside_backend(struct glue_ctx *ctx)
+{
+ if (rte_atomic32_read(&flag_sleep) == IOTHREAD_PREEMPT)
+ dev_rxq_wakeup(ctx->port_id);
+}
+
+/* True when flag_sleep currently reads IOTHREAD_SLEEP. */
+static inline bool
+io_thread_in_sleep(void)
+{
+ return rte_atomic32_read(&flag_sleep) == IOTHREAD_SLEEP;
+}
+#else
+#define sleep_with_lock k_epoll_wait
+#define be_tx_with_lock(ctx) do {} while(0)
+#define wake_lookaside_backend(ctx) do {} while(0)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _TLE_GLUE_INTERNAL_H_ */
diff --git a/lib/libtle_glue/log.h b/lib/libtle_glue/log.h
new file mode 100644
index 0000000..da31ea3
--- /dev/null
+++ b/lib/libtle_glue/log.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2019 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _GLUE_LOG_H_
+#define _GLUE_LOG_H_
+
+#include <arpa/inet.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <rte_vect.h>
+#include <rte_memcpy.h>
+#include <rte_spinlock.h>
+#include <rte_log.h>
+#include <rte_errno.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * logging related macros.
+ */
+
+#define GLUE_LOG(lvl, fmt, args...) RTE_LOG(lvl, USER1, fmt "\n", ##args)
+
+#define DUMMY_MACRO do {} while (0)
+
+#ifdef ENABLE_DEBUG
+#define GLUE_DEBUG(fmt, arg...) fprintf(stderr, fmt "\n", ##arg)
+#else
+#define GLUE_DEBUG(fmt, arg...) DUMMY_MACRO
+#endif
+
+#ifdef ENABLE_TRACE
+#define TRACE(fmt, arg...) fprintf(stderr, fmt "\n", ##arg)
+#define PKT_DUMP(p) rte_pktmbuf_dump(stderr, (p), 64)
+#else
+#define TRACE(fmt, arg...) DUMMY_MACRO
+#define PKT_DUMP(p) DUMMY_MACRO
+#endif
+
+#ifdef DEBUG_ARP
+/*
+ * Log one ARP/neighbor table action.
+ *
+ * af:     AF_INET or AF_INET6, selects how src is formatted.
+ * src:    the binary IPv4/IPv6 address.
+ * mac:    the link-layer address of the entry.
+ * action: text placed in front of the entry.
+ */
+static inline void
+print_arp(int af, const void *src, const struct ether_addr *mac,
+	const char *action)
+{
+	char str_ip[64];
+	char str_mac[32];
+
+	ether_format_addr(str_mac, sizeof(str_mac), mac);
+
+	/*
+	 * inet_ntop() takes the size of the destination buffer, not the size
+	 * of the binary address; on failure keep the buffer printable.
+	 */
+	if (inet_ntop(af, src, str_ip, sizeof(str_ip)) == NULL)
+		snprintf(str_ip, sizeof(str_ip), "<invalid>");
+
+	/* GLUE_LOG supplies the log type that a bare RTE_LOG call requires */
+	GLUE_LOG(INFO, "%s ARP entry: %s\tmac=%s", action, str_ip, str_mac);
+}
+#else
+#define print_arp(arg...) DUMMY_MACRO
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _GLUE_LOG_H_ */
diff --git a/lib/libtle_glue/ndp.h b/lib/libtle_glue/ndp.h
new file mode 100644
index 0000000..a61ff5b
--- /dev/null
+++ b/lib/libtle_glue/ndp.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2019 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _TLE_NDP_H_
+#define _TLE_NDP_H_
+
+#define ND_OPT_SOURCE_LINKLAYER_ADDR 1
+#define ND_OPT_TARGET_LINKLAYER_ADDR 2
+#define ND_OPT_PREFIX_INFORMATION 3
+#define ND_OPT_REDIRECTED_HEADER 4
+#define ND_OPT_MTU 5
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _TLE_NDP_H_ */
diff --git a/lib/libtle_glue/packetdrill.c b/lib/libtle_glue/packetdrill.c
new file mode 100644
index 0000000..79d1d52
--- /dev/null
+++ b/lib/libtle_glue/packetdrill.c
@@ -0,0 +1,544 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <stdarg.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <arpa/inet.h>
+
+#include "packetdrill.h"
+#include "tle_glue.h"
+#include "internal.h"
+#include "fd.h"
+
+#include <rte_arp.h>
+#include <rte_common.h>
+#include <rte_ethdev.h>
+#include <rte_ip.h>
+#include <rte_vhost.h>
+
+/* Device id of the most recently attached vhost device; set by new_device(). */
+static int vhost_vid;
+/* Virtqueue indices passed to the rte_vhost_* calls in this file. */
+enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
+/* Socket path registered with the vhost driver in vhost_init(). */
+static const char *sockname = "/tmp/sock0";
+
+/*
+ * vhost "new_device" callback: records @vid in vhost_vid and switches guest
+ * notifications off on both virtqueues.  Always returns 0.
+ */
+static int
+new_device(int vid)
+{
+	static const uint16_t queues[] = { VIRTIO_RXQ, VIRTIO_TXQ };
+	unsigned int q;
+
+	vhost_vid = vid;
+
+	/* Disable notifications. */
+	for (q = 0; q < RTE_DIM(queues); q++)
+		rte_vhost_enable_guest_notification(vid, queues[q], 0);
+
+	return 0;
+}
+
+/* vhost "destroy_device" callback: nothing to tear down, @vid is ignored. */
+static void
+destroy_device(int vid)
+{
+	(void)vid;
+}
+
+/* Callback table handed to rte_vhost_driver_callback_register() in vhost_init(). */
+static const struct vhost_device_ops device_ops =
+{
+ .new_device = new_device,
+ .destroy_device = destroy_device,
+};
+
+/*
+ * Bring up the vhost-user backend on `sockname`: register the driver, attach
+ * device_ops and start it.  Any failure terminates the process through
+ * rte_exit().  Finally the USER1 log type is set to NOTICE level.
+ */
+static void
+vhost_init(void)
+{
+	int rc;
+
+	/* Remove a socket file left by an earlier run; result is not checked. */
+	unlink(sockname);
+
+	rc = rte_vhost_driver_register(sockname, 0);
+	if (rc != 0)
+		rte_exit(EXIT_FAILURE, "failed to register vhost driver \n");
+
+	rc = rte_vhost_driver_callback_register(sockname, &device_ops);
+	if (rc != 0)
+		rte_exit(EXIT_FAILURE, "failed to register vhost driver callbacks.\n");
+
+	rc = rte_vhost_driver_start(sockname);
+	if (rc < 0)
+		rte_exit(EXIT_FAILURE, "failed to start vhost driver.\n");
+
+	rte_log_set_level(RTE_LOGTYPE_USER1, RTE_LOG_NOTICE);
+}
+
+/* Current wall-clock time from gettimeofday(), expressed in microseconds. */
+static uint64_t
+now_usecs(void)
+{
+	struct timeval now;
+	uint64_t usecs;
+
+	gettimeofday(&now, NULL);
+
+	usecs = (uint64_t)now.tv_sec * 1000000;
+	usecs += now.tv_usec;
+	return usecs;
+}
+
+/* packetdrill "free" hook: there is no per-instance state to release. */
+static void
+pd_free(void *userdata)
+{
+	(void)userdata;
+}
+
+/* packetdrill hook: forwards to PRE(socket); @userdata is unused. */
+static int
+pd_socket(void *userdata, int domain, int type, int protocol)
+{
+	int fd;
+
+	RTE_SET_USED(userdata);
+	fd = PRE(socket)(domain, type, protocol);
+	return fd;
+}
+
+/* packetdrill hook: forwards to PRE(bind); @userdata is unused. */
+static int
+pd_bind(void *userdata, int sockfd, const struct sockaddr *addr,
+	socklen_t addrlen)
+{
+	int rc;
+
+	RTE_SET_USED(userdata);
+	rc = PRE(bind)(sockfd, addr, addrlen);
+	return rc;
+}
+
+/* packetdrill hook: forwards to PRE(listen); @userdata is unused. */
+static int
+pd_listen(void *userdata, int sockfd, int backlog)
+{
+	int rc;
+
+	RTE_SET_USED(userdata);
+	rc = PRE(listen)(sockfd, backlog);
+	return rc;
+}
+
+/* packetdrill hook: forwards to PRE(accept); @userdata is unused. */
+static int
+pd_accept(void *userdata, int sockfd, struct sockaddr *addr,
+	  socklen_t *addrlen)
+{
+	int fd;
+
+	RTE_SET_USED(userdata);
+	fd = PRE(accept)(sockfd, addr, addrlen);
+	return fd;
+}
+
+/* packetdrill hook: forwards to PRE(connect); @userdata is unused. */
+static int
+pd_connect(void *userdata, int sockfd, const struct sockaddr *addr,
+	   socklen_t addrlen)
+{
+	int rc;
+
+	RTE_SET_USED(userdata);
+	rc = PRE(connect)(sockfd, addr, addrlen);
+	return rc;
+}
+
+/* packetdrill hook: forwards to PRE(read); @userdata is unused. */
+static ssize_t
+pd_read(void *userdata, int fd, void *buf, size_t count)
+{
+	ssize_t n;
+
+	RTE_SET_USED(userdata);
+	n = PRE(read)(fd, buf, count);
+	return n;
+}
+
+/* packetdrill hook: forwards to PRE(readv); @userdata is unused. */
+static ssize_t
+pd_readv(void *userdata, int fd, const struct iovec *iov, int iovcnt)
+{
+	ssize_t n;
+
+	RTE_SET_USED(userdata);
+	n = PRE(readv)(fd, iov, iovcnt);
+	return n;
+}
+
+/* packetdrill hook: forwards to PRE(recv); @userdata is unused. */
+static ssize_t
+pd_recv(void *userdata, int sockfd, void *buf, size_t len, int flags)
+{
+	ssize_t n;
+
+	RTE_SET_USED(userdata);
+	n = PRE(recv)(sockfd, buf, len, flags);
+	return n;
+}
+
+/* packetdrill hook: forwards to PRE(recvfrom); @userdata is unused. */
+static ssize_t
+pd_recvfrom(void *userdata, int sockfd, void *buf, size_t len,
+	    int flags, struct sockaddr *src_addr, socklen_t *addrlen)
+{
+	ssize_t n;
+
+	RTE_SET_USED(userdata);
+	n = PRE(recvfrom)(sockfd, buf, len, flags, src_addr, addrlen);
+	return n;
+}
+
+/* packetdrill hook: forwards to PRE(recvmsg); @userdata is unused. */
+static ssize_t
+pd_recvmsg(void *userdata, int sockfd, struct msghdr *msg, int flags)
+{
+	ssize_t n;
+
+	RTE_SET_USED(userdata);
+	n = PRE(recvmsg)(sockfd, msg, flags);
+	return n;
+}
+
+/* packetdrill hook: forwards to PRE(write); @userdata is unused. */
+static ssize_t
+pd_write(void *userdata, int fd, const void *buf, size_t count)
+{
+	ssize_t n;
+
+	RTE_SET_USED(userdata);
+	n = PRE(write)(fd, buf, count);
+	return n;
+}
+
+/* packetdrill hook: forwards to PRE(writev); @userdata is unused. */
+static ssize_t
+pd_writev(void *userdata, int fd, const struct iovec *iov, int iovcnt)
+{
+	ssize_t n;
+
+	RTE_SET_USED(userdata);
+	n = PRE(writev)(fd, iov, iovcnt);
+	return n;
+}
+
+/* packetdrill hook: forwards to PRE(send); @userdata is unused. */
+static ssize_t
+pd_send(void *userdata, int sockfd, const void *buf, size_t len, int flags)
+{
+	ssize_t n;
+
+	RTE_SET_USED(userdata);
+	n = PRE(send)(sockfd, buf, len, flags);
+	return n;
+}
+
+/* packetdrill hook: forwards to PRE(sendto); @userdata is unused. */
+static ssize_t
+pd_sendto(void *userdata, int sockfd, const void *buf, size_t len, int flags,
+	  const struct sockaddr *dest_addr, socklen_t addrlen)
+{
+	ssize_t n;
+
+	RTE_SET_USED(userdata);
+	n = PRE(sendto)(sockfd, buf, len, flags, dest_addr, addrlen);
+	return n;
+}
+
+/* packetdrill hook: forwards to PRE(sendmsg); @userdata is unused. */
+static ssize_t
+pd_sendmsg(void *userdata, int sockfd, const struct msghdr *msg, int flags)
+{
+	ssize_t n;
+
+	RTE_SET_USED(userdata);
+	n = PRE(sendmsg)(sockfd, msg, flags);
+	return n;
+}
+
+/*
+ * packetdrill hook: forwards to PRE(fcntl).  The single variadic argument is
+ * fetched as a pointer-sized value and passed on as-is; @userdata is unused.
+ */
+static int
+pd_fcntl(void *userdata, int fd, int cmd, ...)
+{
+	va_list args;
+	void *third;
+
+	RTE_SET_USED(userdata);
+
+	va_start(args, cmd);
+	third = va_arg(args, void *);
+	va_end(args);
+
+	return PRE(fcntl)(fd, cmd, third);
+}
+
+/*
+ * packetdrill hook: forwards to PRE(ioctl).  The single variadic argument is
+ * fetched as a pointer-sized value and passed on as-is; @userdata is unused.
+ */
+static int
+pd_ioctl(void *userdata, int fd, unsigned long request, ...)
+{
+	va_list args;
+	void *third;
+
+	RTE_SET_USED(userdata);
+
+	va_start(args, request);
+	third = va_arg(args, void *);
+	va_end(args);
+
+	return PRE(ioctl)(fd, request, third);
+}
+
+/* packetdrill hook: forwards to PRE(close); @userdata is unused. */
+static int
+pd_close(void *userdata, int fd)
+{
+	int rc;
+
+	RTE_SET_USED(userdata);
+	rc = PRE(close)(fd);
+	return rc;
+}
+
+/* packetdrill hook: forwards to PRE(shutdown); @userdata is unused. */
+static int
+pd_shutdown(void *userdata, int sockfd, int how)
+{
+	int rc;
+
+	RTE_SET_USED(userdata);
+	rc = PRE(shutdown)(sockfd, how);
+	return rc;
+}
+
+/* packetdrill hook: forwards to PRE(getsockopt); @userdata is unused. */
+static int
+pd_getsockopt(void *userdata, int sockfd, int level, int optname,
+	      void *optval, socklen_t *optlen)
+{
+	int rc;
+
+	RTE_SET_USED(userdata);
+	rc = PRE(getsockopt)(sockfd, level, optname, optval, optlen);
+	return rc;
+}
+
+/* packetdrill hook: forwards to PRE(setsockopt); @userdata is unused. */
+static int
+pd_setsockopt(void *userdata, int sockfd, int level, int optname,
+	      const void *optval, socklen_t optlen)
+{
+	int rc;
+
+	RTE_SET_USED(userdata);
+	rc = PRE(setsockopt)(sockfd, level, optname, optval, optlen);
+	return rc;
+}
+
+/* packetdrill hook: forwards to PRE(poll); @userdata is unused. */
+static int
+pd_poll(void *userdata, struct pollfd *fds, nfds_t nfds, int timeout)
+{
+	int ready;
+
+	RTE_SET_USED(userdata);
+	ready = PRE(poll)(fds, nfds, timeout);
+	return ready;
+}
+
+/*
+ * Copy @count bytes starting at @buf into a newly allocated mbuf chain taken
+ * from the mempool returned by get_mempool_by_socket(0).  Each segment holds
+ * at most RTE_MBUF_DEFAULT_DATAROOM bytes.  The head mbuf carries the total
+ * pkt_len and the segment count.
+ *
+ * Returns the head of the chain (never NULL); the caller owns it.  The
+ * process exits through rte_exit() if allocation fails or the data would
+ * need more segments than a uint16_t can count.
+ */
+static struct rte_mbuf *
+from_buf_to_mbuf(const void *buf, size_t count)
+{
+	struct rte_mempool *mp = get_mempool_by_socket(0);
+	uint16_t nb_mbufs = (count + RTE_MBUF_DEFAULT_DATAROOM - 1) /
+			    RTE_MBUF_DEFAULT_DATAROOM;
+	struct rte_mbuf *mbufs[nb_mbufs + 1];
+	uint16_t i, copy_len;
+	size_t done = 0;
+	char *dst;
+
+	/* The segment counter is 16 bits wide; refuse silently truncated input. */
+	if (count > (size_t)UINT16_MAX * RTE_MBUF_DEFAULT_DATAROOM)
+		rte_exit(EXIT_FAILURE, "packet too large for one mbuf chain\n");
+
+	/*
+	 * An empty payload still needs a head mbuf: mbufs[0] is dereferenced
+	 * below and by the caller.  mbufs[] always has room for one entry.
+	 */
+	if (nb_mbufs == 0)
+		nb_mbufs = 1;
+
+	if (unlikely(rte_pktmbuf_alloc_bulk(mp, mbufs, nb_mbufs) < 0))
+		rte_exit(EXIT_FAILURE, "allocate mbuf fails\n");
+
+	for (i = 0; i < nb_mbufs; ++i) {
+		copy_len = RTE_MIN((size_t)RTE_MBUF_DEFAULT_DATAROOM,
+				   count - done);
+		dst = rte_pktmbuf_mtod(mbufs[i], char *);
+		rte_memcpy(dst, (const char *)buf + done, copy_len);
+		done += copy_len;
+		mbufs[i]->data_len = copy_len;
+		/* link the previous segment to this one */
+		if (i > 0)
+			mbufs[i-1]->next = mbufs[i];
+	}
+
+	mbufs[0]->pkt_len = count;
+	mbufs[0]->nb_segs = nb_mbufs;
+
+	return mbufs[0];
+}
+
+/* Send @count bytes of data starting from @buf to the TCP stack.
+ * Return 0 on success or -1 on error.
+ *
+ * The data is copied into an mbuf chain, an Ethernet header with only the
+ * ether_type field filled in (IPv4) is prepended, and the frame is enqueued
+ * on the vhost RX virtqueue.
+ * NOTE(review): the MAC address fields of the header are left as they were
+ * in the mbuf headroom - confirm the receiver does not depend on them.
+ */
+static int
+pd_netdev_send(void *userdata, const void *buf, size_t count)
+{
+	struct ether_hdr *hdr;
+	struct rte_mbuf *m;
+	uint16_t sent;
+
+	RTE_SET_USED(userdata);
+
+	m = from_buf_to_mbuf(buf, count);
+
+	// add l2 header
+	hdr = (struct ether_hdr *)rte_pktmbuf_prepend(m, sizeof(struct ether_hdr));
+	if (hdr == NULL) {
+		/* not enough headroom for the header */
+		rte_pktmbuf_free(m);
+		return -1;
+	}
+	hdr->ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4);
+
+	sent = rte_vhost_enqueue_burst(vhost_vid, VIRTIO_RXQ, &m, 1);
+
+	/*
+	 * The vhost enqueue copies the frame into the guest ring and does not
+	 * take ownership of the mbuf, so release it here on both outcomes.
+	 */
+	rte_pktmbuf_free(m);
+
+	return (sent == 1) ? 0 : -1;
+}
+
+/*
+ * Walk the mbuf chain starting at @m and process up to @len bytes of it.
+ *
+ * @m:       head of the chain to read from
+ * @buf:     destination buffer; only written when @needcpy is non-zero
+ * @len:     maximum number of bytes to process
+ * @ispeek:  non-zero leaves the chain untouched; zero frees every fully
+ *           processed segment and trims a partially processed one
+ * @needcpy: non-zero copies the processed bytes into @buf
+ *
+ * Returns @m unchanged when @len is 0, the original head when @ispeek is set,
+ * and otherwise the first segment that still holds data (NULL when the chain
+ * was used up), with its pkt_len set to the number of bytes that remain.
+ */
+static inline struct rte_mbuf *
+from_mbuf_to_buf(struct rte_mbuf *m, char *buf, size_t len, int ispeek, int needcpy)
+{
+ void *src;
+ uint32_t done = 0;
+ uint32_t left = len, orig_pkt_len;
+ uint16_t copy_len, seg_len;
+ struct rte_mbuf *m_next, *orig_pkt;
+
+ if (len == 0)
+ return m;
+
+ orig_pkt = m;
+ orig_pkt_len = m->pkt_len;
+
+ do {
+ seg_len = rte_pktmbuf_data_len(m);
+ copy_len = RTE_MIN(seg_len, left);
+ src = rte_pktmbuf_mtod(m, void *);
+ if (needcpy)
+ rte_memcpy(buf + done, src, copy_len);
+ done += copy_len;
+ left -= copy_len;
+ /* segment only partly used: trim its front (unless peeking) and stop */
+ if (copy_len < seg_len) {
+ if (!ispeek) {
+ rte_pktmbuf_adj(m, copy_len);
+ }
+ break;
+ }
+ /* segment fully used: remember its successor before releasing it */
+ m_next = m->next;
+ if (!ispeek) {
+ rte_pktmbuf_free_seg(m);
+ }
+ m = m_next;
+ } while (left && m);
+
+ /* the surviving segment becomes the head of what is left of the packet */
+ if (m && !ispeek)
+ m->pkt_len = orig_pkt_len - done;
+
+ if(ispeek)
+ return orig_pkt;
+ else
+ return m;
+}
+
+/* Sniff the next packet leaving the TCP stack.
+ * Put packet data in @buf. @count is passed in as the buffer size.
+ * The actual number of bytes received should be put in @count.
+ * Set @count to 0 if received nothing.
+ * Set @time_usecs to the receive timestamp.
+ * Return 0 on success or -1 on error.
+ *
+ * This implementation busy-polls the vhost TX virtqueue until a frame
+ * arrives, strips the Ethernet header, and copies at most the caller's
+ * buffer size; anything beyond that is dropped.
+ */
+static int
+pd_netdev_recv(void *userdata, void *buf, size_t *count, long long *time_usecs)
+{
+	struct rte_mbuf *m;
+	struct rte_mempool *mp = get_mempool_by_socket(0);
+	size_t len;
+
+	RTE_SET_USED(userdata);
+
+	for (;;) {
+		while (rte_vhost_dequeue_burst(vhost_vid, VIRTIO_TXQ, mp, &m, 1) == 0)
+			;
+
+		// remove l2 header
+		if (rte_pktmbuf_adj(m, sizeof(struct ether_hdr)) != NULL)
+			break;
+
+		/* frame shorter than an Ethernet header: drop it, keep waiting */
+		rte_pktmbuf_free(m);
+	}
+
+	/* never write past the buffer size the caller passed in through @count */
+	len = RTE_MIN((size_t)m->pkt_len, *count);
+
+	/* whatever from_mbuf_to_buf() did not consume is still ours to free */
+	m = from_mbuf_to_buf(m, buf, len, 0, 1);
+	if (m != NULL)
+		rte_pktmbuf_free(m);
+
+	*count = len;
+	*time_usecs = now_usecs();
+	return 0;
+}
+
+/* packetdrill hook: forwards to usleep(); @userdata is unused. */
+static int
+pd_usleep(void *userdata, useconds_t usec)
+{
+	int rc;
+
+	RTE_SET_USED(userdata);
+	rc = usleep(usec);
+	return rc;
+}
+
+/* packetdrill hook: forwards to gettimeofday(); @userdata is unused. */
+static int
+pd_gettimeofday(void *userdata, struct timeval *tv, struct timezone *tz)
+{
+	int rc;
+
+	RTE_SET_USED(userdata);
+	rc = gettimeofday(tv, tz);
+	return rc;
+}
+
+/* packetdrill hook: forwards to PRE(epoll_create); @userdata is unused. */
+static int
+pd_epoll_create(void *userdata, int size)
+{
+	int epfd;
+
+	RTE_SET_USED(userdata);
+	epfd = PRE(epoll_create)(size);
+	return epfd;
+}
+
+/* packetdrill hook: forwards to PRE(epoll_ctl); @userdata is unused. */
+static int
+pd_epoll_ctl(void *userdata, int epfd, int op, int fd,
+	     struct epoll_event *event)
+{
+	int rc;
+
+	RTE_SET_USED(userdata);
+	rc = PRE(epoll_ctl)(epfd, op, fd, event);
+	return rc;
+}
+
+/* packetdrill hook: forwards to PRE(epoll_wait); @userdata is unused. */
+static int
+pd_epoll_wait(void *userdata, int epfd, struct epoll_event *events,
+	      int maxevents, int timeout)
+{
+	int ready;
+
+	RTE_SET_USED(userdata);
+	ready = PRE(epoll_wait)(epfd, events, maxevents, timeout);
+	return ready;
+}
+
+/* packetdrill hook: forwards to pipe(); @userdata is unused. */
+static int
+pd_pipe(void *userdata, int pipefd[2])
+{
+	int rc;
+
+	RTE_SET_USED(userdata);
+	rc = pipe(pipefd);
+	return rc;
+}
+
+/* packetdrill hook: forwards to PRE(splice); @userdata is unused. */
+static int
+pd_splice(void *userdata, int fd_in, loff_t *off_in, int fd_out,
+	  loff_t *off_out, size_t len, unsigned int flags)
+{
+	int rc;
+
+	RTE_SET_USED(userdata);
+	rc = PRE(splice)(fd_in, off_in, fd_out, off_out, len, flags);
+	return rc;
+}
+
+/*
+ * Start-up handshake between packetdrill_interface_init() and the io thread.
+ * io() sets io_ready under `lock` and signals io_ready_cond once its setup is
+ * done; the initialising thread waits for that.  A mutex must not be unlocked
+ * by a thread that does not own it, hence the condition variable.
+ */
+static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t io_ready_cond = PTHREAD_COND_INITIALIZER;
+static int io_ready;
+
+/*
+ * Body of the background thread.  Publishes the address configuration through
+ * environment variables, creates an epoll instance via PRE(epoll_create),
+ * installs a fixed MAC for the gateway address in default_ctx, reports
+ * readiness and then calls PRE(epoll_wait) with a zero timeout forever.
+ * @arg is unused; the function never returns in practice.
+ */
+static void *
+io(void *arg)
+{
+	int epfd;
+	struct in_addr ipv4;
+	struct ether_addr mac = { .addr_bytes = { 0xee, 0xff, 0xff, 0xff, 0xff, 0xff}, };
+	struct epoll_event events[128];
+
+	RTE_SET_USED(arg);
+
+	setenv(DPDK_IP, "192.168.0.2", 1);
+	setenv(DPDK_IP_MASK, "16", 1);
+	setenv(DPDK_IP_GATEWAY, "192.168.0.1", 1);
+
+	setenv(DPDK_IPV6, "fd3d:fa7b:d17d::0", 1);
+	setenv(DPDK_IPV6_MASK, "48", 1);
+	setenv(DPDK_IPV6_GATEWAY, "fd3d:fa7b:d17d:8888::0", 1);
+
+	/* NOTE(review): the returned descriptor is not checked - confirm. */
+	epfd = PRE(epoll_create)(0);
+
+	inet_pton(AF_INET, "192.168.0.1", &ipv4);
+
+	ipv4_dst_add(default_ctx, &ipv4, &mac);
+
+	/* tell packetdrill_interface_init() that setup is complete */
+	pthread_mutex_lock(&lock);
+	io_ready = 1;
+	pthread_cond_signal(&io_ready_cond);
+	pthread_mutex_unlock(&lock);
+
+	while (1) {
+		PRE(epoll_wait)(epfd, events, 128, 0);
+	}
+
+	return NULL;
+}
+
+/*
+ * Entry point looked up by packetdrill.  Initialises the DPDK EAL with a
+ * fixed argument list, sets up the fd layer and the vhost backend, hot-plugs
+ * a virtio-user device on the same socket, starts the io thread and waits
+ * until it is ready, then fills every callback of @ifc.
+ * @flags is unused.  Any setup failure terminates the process via rte_exit().
+ */
+void
+packetdrill_interface_init(const char *flags,
+			   struct packetdrill_interface *ifc)
+{
+	int argc = 0;
+	int i;
+	char *argv[16];
+	pthread_t tid;
+
+	RTE_SET_USED(flags);
+
+	/* duplicated so the EAL receives writable strings */
+	argv[argc++] = strdup("test");
+	argv[argc++] = strdup("-l");
+	argv[argc++] = strdup("0");
+	argv[argc++] = strdup("--no-pci");
+	argv[argc++] = strdup("--in-memory");
+	argv[argc++] = strdup("--single-file-segments");
+	argv[argc++] = strdup("--");
+	argv[argc] = NULL;
+
+	for (i = 0; i < argc; i++) {
+		if (argv[i] == NULL)
+			rte_exit(EXIT_FAILURE, "Failed to allocate EAL arguments\n");
+	}
+
+	if (rte_eal_init(argc, argv) < 0)
+		rte_exit(EXIT_FAILURE, "Failed to init DPDK\n");
+
+	fd_init();
+
+	vhost_init();
+
+	if (rte_eal_hotplug_add("vdev", "virtio_user0", "path=/tmp/sock0") < 0)
+		rte_exit(EXIT_FAILURE, "hot plug virtio-user failed\n");
+
+	/* without this check a failed start would block forever below */
+	if (pthread_create(&tid, NULL, io, NULL) != 0)
+		rte_exit(EXIT_FAILURE, "Failed to create io thread\n");
+
+	/* wait until io() has finished its setup */
+	pthread_mutex_lock(&lock);
+	while (!io_ready)
+		pthread_cond_wait(&io_ready_cond, &lock);
+	pthread_mutex_unlock(&lock);
+
+	ifc->free = pd_free;
+	ifc->socket = pd_socket;
+	ifc->bind = pd_bind;
+	ifc->listen = pd_listen;
+	ifc->accept = pd_accept;
+	ifc->connect = pd_connect;
+	ifc->read = pd_read;
+	ifc->readv = pd_readv;
+	ifc->recv = pd_recv;
+	ifc->recvfrom = pd_recvfrom;
+	ifc->recvmsg = pd_recvmsg;
+	ifc->write = pd_write;
+	ifc->writev = pd_writev;
+	ifc->send = pd_send;
+	ifc->sendto = pd_sendto;
+	ifc->sendmsg = pd_sendmsg;
+	ifc->fcntl = pd_fcntl;
+	ifc->ioctl = pd_ioctl;
+	ifc->close = pd_close;
+	ifc->shutdown = pd_shutdown;
+	ifc->getsockopt = pd_getsockopt;
+	ifc->setsockopt = pd_setsockopt;
+	ifc->poll = pd_poll;
+	ifc->netdev_send = pd_netdev_send;
+	ifc->netdev_receive = pd_netdev_recv;
+	ifc->usleep = pd_usleep;
+	ifc->gettimeofday = pd_gettimeofday;
+	ifc->epoll_create = pd_epoll_create;
+	ifc->epoll_ctl = pd_epoll_ctl;
+	ifc->epoll_wait = pd_epoll_wait;
+	ifc->pipe = pd_pipe;
+	ifc->splice = pd_splice;
+}
diff --git a/lib/libtle_glue/packetdrill.h b/lib/libtle_glue/packetdrill.h
new file mode 100644
index 0000000..6f84a87
--- /dev/null
+++ b/lib/libtle_glue/packetdrill.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright 2015 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: xiaoj@google.com (Xiao Jia)
+ *
+ * Interface for packetdrill.
+ *
+ * To be tested against as a shared object (*.so) file, implement this
+ * interface, export a function "packetdrill_interface_init", and
+ * initialize the interface struct passed in with your own functions.
+ */
+
+#ifndef __PACKETDRILL_H__
+#define __PACKETDRILL_H__
+
+#include <poll.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/epoll.h>
+#include <unistd.h>
+
+/*
+ * Table of callbacks filled in by packetdrill_interface_init().
+ * Every callback receives @userdata as its first argument; the remaining
+ * parameters of most entries mirror the system call of the same name.
+ */
+struct packetdrill_interface {
+ /* opaque pointer passed as the first argument of each callback below */
+ void *userdata;
+ void (*free)(void *userdata);
+ int (*socket)(void *userdata, int domain, int type, int protocol);
+ int (*bind)(void *userdata, int sockfd, const struct sockaddr *addr,
+ socklen_t addrlen);
+ int (*listen)(void *userdata, int sockfd, int backlog);
+ int (*accept)(void *userdata, int sockfd, struct sockaddr *addr,
+ socklen_t *addrlen);
+ int (*connect)(void *userdata, int sockfd, const struct sockaddr *addr,
+ socklen_t addrlen);
+ ssize_t (*read)(void *userdata, int fd, void *buf, size_t count);
+ ssize_t (*readv)(void *userdata, int fd, const struct iovec *iov,
+ int iovcnt);
+ ssize_t (*recv)(void *userdata, int sockfd, void *buf, size_t len,
+ int flags);
+ ssize_t (*recvfrom)(void *userdata, int sockfd, void *buf, size_t len,
+ int flags, struct sockaddr *src_addr,
+ socklen_t *addrlen);
+ ssize_t (*recvmsg)(void *userdata, int sockfd, struct msghdr *msg,
+ int flags);
+ ssize_t (*write)(void *userdata, int fd, const void *buf, size_t count);
+ ssize_t (*writev)(void *userdata, int fd, const struct iovec *iov,
+ int iovcnt);
+ ssize_t (*send)(void *userdata, int sockfd, const void *buf, size_t len,
+ int flags);
+ ssize_t (*sendto)(void *userdata, int sockfd, const void *buf,
+ size_t len, int flags,
+ const struct sockaddr *dest_addr, socklen_t addrlen);
+ ssize_t (*sendmsg)(void *userdata, int sockfd, const struct msghdr *msg,
+ int flags);
+ int (*fcntl)(void *userdata, int fd, int cmd, ...);
+ int (*ioctl)(void *userdata, int fd, unsigned long request, ...);
+ int (*close)(void *userdata, int fd);
+ int (*shutdown)(void *userdata, int sockfd, int how);
+ int (*getsockopt)(void *userdata, int sockfd, int level, int optname,
+ void *optval, socklen_t *optlen);
+ int (*setsockopt)(void *userdata, int sockfd, int level, int optname,
+ const void *optval, socklen_t optlen);
+ int (*poll)(void *userdata, struct pollfd *fds, nfds_t nfds,
+ int timeout);
+ /* Send @count bytes of data starting from @buf to the TCP stack.
+ * Return 0 on success or -1 on error. */
+ int (*netdev_send)(void *userdata, const void *buf, size_t count);
+ /* Sniff the next packet leaving the TCP stack.
+ * Put packet data in @buf. @count is passed in as the buffer size.
+ * The actual number of bytes received should be put in @count.
+ * Set @count to 0 if received nothing.
+ * Set @time_usecs to the receive timestamp.
+ * Return 0 on success or -1 on error. */
+ int (*netdev_receive)(void *userdata, void *buf, size_t *count,
+ long long *time_usecs);
+ int (*usleep)(void *userdata, useconds_t usec);
+ int (*gettimeofday)(void *userdata, struct timeval *tv,
+ struct timezone *tz);
+ int (*epoll_create)(void *userdata, int size);
+ int (*epoll_ctl)(void *userdata, int epfd, int op, int fd,
+ struct epoll_event *event);
+ int (*epoll_wait)(void *userdata, int epfd, struct epoll_event *events,
+ int maxevents, int timeout);
+ int (*pipe)(void *userdata, int pipefd[2]);
+ int (*splice)(void *userdata, int fd_in, loff_t *off_in, int fd_out,
+ loff_t *off_out, size_t len, unsigned int flags);
+};
+
+/* Function-pointer type matching packetdrill_interface_init() below. */
+typedef void (*packetdrill_interface_init_t)(const char *flags,
+ struct packetdrill_interface *);
+
+/* Initialise @ifc with this library's callbacks; @flags is an option string. */
+void
+packetdrill_interface_init(const char *flags, struct packetdrill_interface *ifc);
+
+#endif /* __PACKETDRILL_H__ */
diff --git a/lib/libtle_glue/poll.c b/lib/libtle_glue/poll.c
new file mode 100644
index 0000000..ebc0110
--- /dev/null
+++ b/lib/libtle_glue/poll.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <signal.h>
+#include <poll.h>
+
+#include "fd.h"
+#include "ctx.h"
+#include "sym.h"
+#include "log.h"
+#include "util.h"
+#include "internal.h"
+#include "tle_glue.h"
+
+/*
+ * poll() over a mix of kernel descriptors and glue-layer sockets.
+ *
+ * - If every entry is a kernel fd, the call is handed to k_poll() unchanged.
+ * - If any glue socket is already ready (fd_ready()), the count of such
+ *   sockets is returned at once without looking at kernel fds.
+ * - Otherwise the calling thread is bound to a glue context if it has none,
+ *   the context is polled once without blocking, and only if that yields
+ *   nothing are the kernel fds put into a temporary epoll instance that is
+ *   polled together with the context for up to @timeout.
+ *
+ * revents is cleared for every entry before any result is reported, so values
+ * left over from an earlier call are never seen by the caller.
+ *
+ * Returns the value of k_poll(), the number of ready glue sockets, or the
+ * value of poll_common().
+ */
+int
+PRE(poll)(struct pollfd *fds, nfds_t nfds, int timeout)
+{
+	int efd;
+	int total = 0, j;
+	int tmp_ev;
+	uint32_t i;
+	uint32_t k_n = 0;
+	/* a variable-length array must have at least one element */
+	const nfds_t slots = nfds ? nfds : 1;
+	int k_fds[slots];
+	struct sock *so;
+	struct glue_ctx *ctx;
+	struct epoll_event k_ev;
+	struct epoll_event events[slots];
+
+	for (i = 0; i < nfds; ++i) {
+		/* start clean: entries that stay idle must report no events */
+		fds[i].revents = 0;
+
+		if (is_kernel_fd(fds[i].fd)) {
+			k_fds[k_n++] = i;
+			continue;
+		}
+
+		so = fd2sock(fds[i].fd);
+		if (!so->valid)
+			continue;
+
+		fds[i].revents = fd_ready(fds[i].fd, fds[i].events);
+		if (fds[i].revents) {
+			total++;
+			continue;
+		}
+
+		/* We fill sock->event here as we need this when
+		 * we filter events in poll_common(). But it was
+		 * originally set by epoll_ctl(). Now we have to
+		 * assume that there are no application which
+		 * uses epoll and poll at the same time.
+		 */
+		so->event.events = fds[i].events;
+		so->event.data.u32 = i; /* store idx */
+	}
+
+	if (k_n == nfds)
+		return k_poll(fds, nfds, timeout);
+
+	if (total > 0)
+		return total;
+
+	/* thread <> context binding happens here */
+	if (RTE_PER_LCORE(glue_ctx) == NULL) {
+		ctx = &ctx_array[glue_ctx_alloc()];
+		RTE_PER_LCORE(glue_ctx) = ctx;
+	} else
+		ctx = RTE_PER_LCORE(glue_ctx);
+
+	total = poll_common(ctx, events, nfds, 0, -1);
+
+	/* We assume kernel I/O events are not as important as user ones */
+	if (total > 0)
+		goto format;
+
+	efd = k_epoll_create(1);
+	if (efd < 0)
+		rte_panic("k_epoll_create failed %d", errno);
+
+	for (i = 0; i < k_n; ++i) {
+		k_ev.events = fds[k_fds[i]].events;
+		k_ev.data.u32 = k_fds[i]; /* store idx */
+		k_epoll_ctl(efd, EPOLL_CTL_ADD, fds[k_fds[i]].fd, &k_ev);
+	}
+
+	total = poll_common(ctx, events, nfds, timeout, efd);
+	k_close(efd);
+format:
+	/* data.u32 of each reported event is the index into fds[] */
+	for (j = 0; j < total; ++j) {
+		tmp_ev = events[j].events;
+		if (tmp_ev == POLLHUP) {
+			tmp_ev |= POLLERR | (fds[events[j].data.u32].events &
+				(POLLIN | POLLOUT));
+		}
+		fds[events[j].data.u32].revents = tmp_ev;
+	}
+
+	return total;
+}
+
+/*
+ * ppoll() expressed through PRE(poll).
+ *
+ * @tmo_p:   NULL means wait without limit; otherwise the interval is
+ *           converted to milliseconds, rounded up so that a sub-millisecond
+ *           wait does not turn into a non-blocking call, and capped at the
+ *           largest value an int can hold.
+ * @sigmask: must be NULL; a non-NULL mask aborts through rte_panic().
+ *
+ * Returns whatever PRE(poll) returns.
+ */
+int
+PRE(ppoll)(struct pollfd *fds, nfds_t nfds,
+	   const struct timespec *tmo_p, const sigset_t *sigmask)
+{
+	int timeout;
+	int64_t ms;
+
+	if (sigmask != NULL)
+		rte_panic("ppoll with signal is not supported");
+
+	if (tmo_p == NULL)
+		timeout = -1;
+	else {
+		/* 64-bit arithmetic: tv_sec * 1000 may not fit in an int */
+		ms = (int64_t)tmo_p->tv_sec * 1000 +
+		     ((int64_t)tmo_p->tv_nsec + 999999) / 1000000;
+		timeout = (ms > INT32_MAX) ? INT32_MAX : (int)ms;
+	}
+
+	/* go through the poll implementation defined in this file */
+	return PRE(poll)(fds, nfds, timeout);
+}
+
+/*
+ * Definition of __poll_chk() that ignores @fdslen and forwards to poll().
+ * NOTE(review): presumably the entry point used by fortified builds - confirm.
+ */
+extern int __poll_chk(struct pollfd *fds, nfds_t nfds, int timeout,
+		      __SIZE_TYPE__ fdslen);
+int
+__poll_chk(struct pollfd *fds, nfds_t nfds, int timeout,
+	   __SIZE_TYPE__ fdslen __rte_unused)
+{
+	const int ready = poll(fds, nfds, timeout);
+
+	return ready;
+}
diff --git a/lib/libtle_glue/port.c b/lib/libtle_glue/port.c
new file mode 100644
index 0000000..7a4cf2e
--- /dev/null
+++ b/lib/libtle_glue/port.c
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <sys/eventfd.h>
+#include <unistd.h>
+
+#include <rte_ethdev.h>
+#include <rte_eth_ring.h>
+
+#include "log.h"
+#include "ctx.h"
+#include "config.h"
+#include "internal.h"
+
+/*
+ * Set to 1 by port_reconfig() while the port is being reconfigured and back
+ * to 0 once it has been restarted.
+ * NOTE(review): plain int shared between threads - confirm readers cope.
+ */
+int stopped;
+
+/* mbuf pools created on demand by get_mempool_by_socket(), indexed by socket id. */
+static struct rte_mempool *mpool[RTE_MAX_NUMA_NODES];
+
+/*
+ * Return the mbuf pool for @socket_id, creating it on first use.
+ * SOCKET_ID_ANY is mapped to socket 0.  Any other id outside
+ * [0, RTE_MAX_NUMA_NODES) or a failed pool creation aborts through
+ * rte_panic(), so the result is never NULL.
+ */
+struct rte_mempool *
+get_mempool_by_socket(int32_t socket_id)
+{
+	struct rte_mempool *mp;
+	char name[RTE_MEMPOOL_NAMESIZE];
+
+	if (socket_id == SOCKET_ID_ANY)
+		socket_id = 0;
+
+	/* socket_id indexes mpool[]; reject anything that would fall outside */
+	if (socket_id < 0 || socket_id >= RTE_MAX_NUMA_NODES)
+		rte_panic("Invalid socket id %d", socket_id);
+
+	if (mpool[socket_id])
+		return mpool[socket_id];
+
+	snprintf(name, sizeof(name), "MP%d", socket_id);
+	mp = rte_pktmbuf_dynamic_pool_create(name, MAX_MBUFS - 1,
+					     MBUF_PERCORE_CACHE, 0,
+					     RTE_MBUF_DEFAULT_BUF_SIZE,
+					     socket_id, MBUF_DYNAMIC_SIZE);
+
+	if (mp == NULL)
+		rte_panic("Failed to create mbuf mempool");
+
+	mpool[socket_id] = mp;
+	return mp;
+}
+
+/*
+ * Ask the driver of @port_id to hash on IP, TCP and UDP while passing no
+ * RSS key of its own.  Aborts through rte_panic() on failure.
+ */
+static void
+update_rss_conf(uint16_t port_id)
+{
+	int rc;
+	struct rte_eth_rss_conf conf = {
+		.rss_hf = ETH_RSS_IP | ETH_RSS_TCP | ETH_RSS_UDP,
+		.rss_key_len = 0,
+		.rss_key = NULL,
+	};
+
+	rc = rte_eth_dev_rss_hash_update(port_id, &conf);
+	if (rc < 0)
+		rte_panic("Failed to update rss hash");
+}
+
+/*
+ * Set up @nb_queues RX and TX queues on @port_id.
+ *
+ * Queue settings start from the driver defaults in @dev_info, take their
+ * offload flags from @port_conf, and use the mbuf pool of the port's socket.
+ * Each RX queue also gets the callbacks installed by setup_rx_cb().
+ * Any failure aborts through rte_panic().
+ */
+static void
+queue_init(uint16_t port_id, uint16_t nb_queues,
+	   struct rte_eth_dev_info *dev_info,
+	   struct rte_eth_conf *port_conf)
+{
+	uint16_t q;
+	int32_t socket_id, rc;
+	uint16_t nb_rxd = 1024, nb_txd = 1024;
+	struct rte_mempool *mp;
+	struct rte_eth_txconf txq_conf = dev_info->default_txconf;
+	struct rte_eth_rxconf rxq_conf = dev_info->default_rxconf;
+
+	socket_id = rte_eth_dev_socket_id(port_id);
+	mp = get_mempool_by_socket(socket_id);
+
+	/*
+	 * Set the flag on the copy that is actually handed to
+	 * rte_eth_rx_queue_setup().  Writing it to dev_info after the copy
+	 * above had been taken had no effect on the queues.
+	 */
+	rxq_conf.rx_drop_en = 1;
+
+	rc = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd);
+	if (rc < 0)
+		rte_panic("Cannot adjust number of desc");
+
+	rxq_conf.offloads = port_conf->rxmode.offloads;
+	txq_conf.offloads = port_conf->txmode.offloads;
+
+	/* faster free of tx entries; keep the default if the ring is tiny */
+	if (nb_txd > 64)
+		txq_conf.tx_free_thresh = nb_txd - 64;
+
+	for (q = 0; q < nb_queues; q++) {
+		rc = rte_eth_rx_queue_setup(port_id, q, nb_rxd,
+					    socket_id, &rxq_conf, mp);
+		if (rc < 0)
+			rte_panic("rx queue=%u setup failed: %d", q, rc);
+
+		rc = setup_rx_cb(port_id, q);
+		if (rc < 0)
+			rte_panic("rx queue=%u rx setup failed: %d", q, rc);
+	}
+
+	for (q = 0; q < nb_queues; q++) {
+		rc = rte_eth_tx_queue_setup(port_id, q, nb_txd,
+					    socket_id, &txq_conf);
+		if (rc < 0)
+			rte_panic("tx queue=%u setup failed: %d", q, rc);
+	}
+}
+
+/*
+ * RX offloads requested by default; port_reconfig() masks this with the
+ * capabilities the device reports.
+ */
+uint64_t rx_offload =
+ DEV_RX_OFFLOAD_IPV4_CKSUM |
+ DEV_RX_OFFLOAD_UDP_CKSUM |
+ DEV_RX_OFFLOAD_TCP_CKSUM;
+/* nice to have:
+ DEV_RX_OFFLOAD_CRC_STRIP |
+ DEV_RX_OFFLOAD_TCP_LRO |
+ DEV_RX_OFFLOAD_HEADER_SPLIT |
+ DEV_RX_OFFLOAD_SCATTER |
+ DEV_RX_OFFLOAD_TIMESTAMP
+*/
+
+/*
+ * TX offloads requested by default; port_reconfig() masks this with the
+ * capabilities the device reports.
+ */
+uint64_t tx_offload =
+ DEV_TX_OFFLOAD_UDP_CKSUM |
+ DEV_TX_OFFLOAD_TCP_CKSUM |
+ DEV_TX_OFFLOAD_TCP_TSO |
+ DEV_TX_OFFLOAD_MULTI_SEGS;
+
+/*
+ * Write 1 to the eventfd of every RX queue interrupt of @port_id.
+ *
+ * Returns 0 on success, -ENODEV for an invalid port, -ENOTSUP when the
+ * device has no interrupt handle, -EPERM when no interrupt vector table is
+ * set, or -errno from the first eventfd_write() that fails.
+ */
+int
+dev_rxq_wakeup(uint16_t port_id)
+{
+	struct rte_intr_handle *ih;
+	struct rte_eth_dev *dev;
+	uint32_t vec, idx;
+	uint16_t q;
+
+	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV);
+
+	dev = &rte_eth_devices[port_id];
+	ih = dev->intr_handle;
+	if (ih == NULL)
+		return -ENOTSUP;
+	if (ih->intr_vec == NULL)
+		return -EPERM;
+
+	for (q = 0; q < dev->data->nb_rx_queues; q++) {
+		vec = ih->intr_vec[q];
+
+		/* vectors at or above the RX/TX offset are stored relative to it */
+		idx = vec;
+		if (vec >= RTE_INTR_VEC_RXTX_OFFSET)
+			idx -= RTE_INTR_VEC_RXTX_OFFSET;
+
+		if (eventfd_write(ih->efds[idx], (eventfd_t) 1) < 0)
+			return -errno;
+	}
+
+	return 0;
+}
+
+/*
+ * Stop port 0 and bring it back up with nb_ctx RX/TX queue pairs.
+ *
+ * While the port is down the global `stopped` flag is 1.  Offload flags are
+ * reduced to what the device supports (the globals rx_offload/tx_offload are
+ * updated in place).  Every failure aborts through rte_panic().
+ */
+void
+port_reconfig(void)
+{
+ int32_t rc;
+ struct rte_eth_dev_info dev_info;
+ uint16_t port_id = 0; /* We use and only use port 0 */
+ uint16_t nb_port;
+ uint16_t nb_queues = nb_ctx;
+
+ struct rte_eth_conf port_conf = {
+ .intr_conf = {
+ .rxq = 1,
+ },
+ };
+
+ /* 0. dev number check */
+ nb_port = rte_eth_dev_count_avail();
+ if (nb_port < 1 || nb_port >2)
+ rte_panic("One port is mandatory with an optional loopback device\n");
+
+ /* publish the flag before the queues are woken up below */
+ stopped = 1;
+ rte_wmb();
+ /* wake up all rxqs */
+ if (nb_ctx > 1)
+ dev_rxq_wakeup(port_id);
+
+ usleep(1); /* FIXME: this cannot guarantee correctness */
+
+ rte_eth_dev_stop(port_id);
+
+ /* 1. offloading check and set */
+ rte_eth_dev_info_get(port_id, &dev_info);
+ rx_offload &= dev_info.rx_offload_capa;
+ port_conf.rxmode.offloads = rx_offload;
+ tx_offload &= dev_info.tx_offload_capa;
+ port_conf.txmode.offloads = tx_offload;
+
+ GLUE_LOG(INFO, "configure queues = %d, offloads: rx = %"PRIx64", tx = %"PRIx64,
+ nb_queues, rx_offload, tx_offload);
+
+ /* 2. dev configure */
+ rc = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
+ if (rc != 0)
+ rte_panic("Failed to configure device, %d", rc);
+
+ /* 3. queue setup */
+ queue_init(port_id, nb_queues, &dev_info, &port_conf);
+
+ /* 4. rss conf */
+ if (nb_queues > 1)
+ update_rss_conf(port_id);
+
+ /* 5. dev start */
+ if (rte_eth_dev_start(port_id) < 0)
+ rte_panic("Failed to start device");
+
+ stopped = 0;
+}
+
+/*
+ * Create the ring-backed loopback ethdev on first call and return its port
+ * id; later calls return the cached id.  @socket_id selects where the ring
+ * is allocated.  Every failure aborts through rte_panic().
+ */
+uint16_t
+create_loopback(uint32_t socket_id)
+{
+	static uint16_t lb_port_id = 0xFFFF;
+	const char *ring_name = "loopback-ring";
+	struct rte_ring *ring;
+	int port;
+
+	/* 0xFFFF marks "not created yet" */
+	if (lb_port_id == 0xFFFF) {
+		ring = rte_ring_create(ring_name, MAX_PKTS_BURST * 8, socket_id,
+				       RING_F_SP_ENQ | RING_F_SC_DEQ);
+		if (ring == NULL)
+			rte_panic("Failed to create ring for loopback\n");
+
+		port = rte_eth_from_ring(ring);
+		if (port < 0)
+			rte_panic("Failed to create ethdev from ring\n");
+		lb_port_id = port;
+
+		if (setup_rx_cb(lb_port_id, 0) < 0)
+			rte_panic("Failed to set up rx cb for loopback\n");
+	}
+
+	return lb_port_id;
+}
diff --git a/lib/libtle_glue/rxcb.c b/lib/libtle_glue/rxcb.c
new file mode 100644
index 0000000..51f31c9
--- /dev/null
+++ b/lib/libtle_glue/rxcb.c
@@ -0,0 +1,834 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <rte_ethdev.h>
+#include <rte_arp.h>
+#include <rte_ip.h>
+#include <rte_tcp.h>
+#include <rte_udp.h>
+
+#include <netinet/in.h>
+#include <netinet/ip6.h>
+
+#include "log.h"
+#include "ctx.h"
+#include "internal.h"
+
+/* Pairs a set of HW-recognised packet types with the RX callback to install. */
+struct ptype2cb {
+ uint32_t mask; /* OR of *_PTYPE flags the port must report for this entry */
+ const char *name; /* human readable label, used in log messages */
+ rte_rx_callback_fn fn; /* RX callback registered when the mask is satisfied */
+};
+
+/*
+ * Bit flags summarising which RTE_PTYPE_* values a port reports; built by
+ * get_ptypes() and matched against ptype2cb.mask in setup_rx_cb().
+ */
+enum {
+ ETHER_ARP_PTYPE = 0x1,
+ IPV4_PTYPE = 0x2,
+ IPV4_EXT_PTYPE = 0x4,
+ IPV6_PTYPE = 0x8,
+ IPV6_EXT_PTYPE = 0x10,
+ TCP_PTYPE = 0x20,
+ UDP_PTYPE = 0x40,
+ ICMP_PTYPE = 0x80,
+};
+
+/*
+ * Pack the per-layer header lengths into one 64-bit word (the layout stored
+ * in rte_mbuf tx_offload by fill_pkt_hdr_len()).
+ * Each argument is shifted to bit offset 0/7/16/24/40/49 respectively and
+ * OR-ed in; arguments are not masked, so oversized values spill into the
+ * neighbouring field.
+ */
+static inline uint64_t
+_mbuf_tx_offload(uint64_t il2, uint64_t il3, uint64_t il4, uint64_t tso,
+	uint64_t ol3, uint64_t ol2)
+{
+	uint64_t packed = il2;
+
+	packed |= il3 << 7;
+	packed |= il4 << 16;
+	packed |= tso << 24;
+	packed |= ol3 << 40;
+	packed |= ol2 << 49;
+
+	return packed;
+}
+
+/*
+ * Record the L2/L3/L4 header lengths of a received packet in m->tx_offload.
+ * Returns 0 on success, -1 (mbuf untouched) when the headers together would
+ * be longer than the packet itself.
+ */
+static inline int32_t
+fill_pkt_hdr_len(struct rte_mbuf *m, uint32_t l2, uint32_t l3, uint32_t l4)
+{
+	const uint32_t hdr_total = l2 + l3 + l4;
+
+	if (hdr_total <= m->pkt_len) {
+		m->tx_offload = _mbuf_tx_offload(l2, l3, l4, 0, 0, 0);
+		return 0;
+	}
+
+	return -1;
+}
+
+/*
+ * Tell whether an IPv4 header belongs to a fragment: non-zero when any bit
+ * of the flags/fragment-offset field other than DF is set.
+ */
+static inline int
+is_ipv4_frag(const struct ipv4_hdr *iph)
+{
+	/* everything except the DF bit, in network byte order */
+	const uint16_t frag_bits = rte_cpu_to_be_16(~IPV4_HDR_DF_FLAG);
+
+	return (iph->fragment_offset & frag_bits) ? 1 : 0;
+}
+
+/*
+ * Return the TCP header length in bytes, read from the data-offset field of
+ * the TCP header located l2_len + l3_len bytes into the first segment.
+ * No bounds check is done here; callers validate the total afterwards.
+ */
+static inline uint32_t
+get_tcp_header_size(struct rte_mbuf *m, uint32_t l2_len, uint32_t l3_len)
+{
+	const struct tcp_hdr *th;
+	uint32_t words;
+
+	th = rte_pktmbuf_mtod_offset(m, struct tcp_hdr *, l2_len + l3_len);
+	/* upper nibble of data_off counts 32-bit words */
+	words = th->data_off >> 4;
+
+	return words * 4;
+}
+
+/*
+ * Make the mbuf length agree with the IPv4 total_length field.
+ * Trailing bytes beyond the IP datagram (e.g. L2 padding) are trimmed.
+ * Returns 0 on success, -1 when the header claims more data than was received.
+ */
+static inline int32_t
+adjust_ipv4_pktlen(struct rte_mbuf *m, uint32_t l2_len)
+{
+	const struct ipv4_hdr *ip4;
+	uint32_t expected;
+
+	ip4 = rte_pktmbuf_mtod_offset(m, const struct ipv4_hdr *, l2_len);
+	expected = rte_be_to_cpu_16(ip4->total_length) + l2_len;
+
+	if (expected > m->pkt_len)
+		return -1;
+
+	if (expected < m->pkt_len)
+		rte_pktmbuf_trim(m, m->pkt_len - expected);
+
+	return 0;
+}
+
+/*
+ * Make the mbuf length agree with the IPv6 payload_len field (payload plus
+ * the fixed IPv6 header plus L2). Trailing bytes are trimmed.
+ * Returns 0 on success, -1 when the header claims more data than was received.
+ */
+static inline int32_t
+adjust_ipv6_pktlen(struct rte_mbuf *m, uint32_t l2_len)
+{
+	const struct ipv6_hdr *ip6;
+	uint32_t expected;
+
+	ip6 = rte_pktmbuf_mtod_offset(m, const struct ipv6_hdr *, l2_len);
+	expected = rte_be_to_cpu_16(ip6->payload_len) + sizeof(*ip6) + l2_len;
+
+	if (expected > m->pkt_len)
+		return -1;
+
+	if (expected < m->pkt_len)
+		rte_pktmbuf_trim(m, m->pkt_len - expected);
+
+	return 0;
+}
+
+/*
+ * Return the IPv4 header length (IHL * multiplier) of the packet whose IP
+ * header starts l2 bytes into the first segment.
+ *
+ * Side effects on m->packet_type:
+ *  - when frag != 0 and the header is a fragment, the L4 part is replaced
+ *    by RTE_PTYPE_L4_FRAG;
+ *  - when the header does not fit in the first segment, or proto is a real
+ *    protocol number (<= IPPROTO_MAX) that differs from next_proto_id, the
+ *    whole type is reset to RTE_PTYPE_UNKNOWN.
+ * Passing proto > IPPROTO_MAX disables the protocol check.
+ */
+static inline uint32_t
+get_ipv4_hdr_len(struct rte_mbuf *m, uint32_t l2, uint32_t proto, uint32_t frag)
+{
+	const struct ipv4_hdr *ip4;
+	int32_t avail, hlen;
+	int bad_len, bad_proto;
+
+	ip4 = rte_pktmbuf_mtod_offset(m, const struct ipv4_hdr *, l2);
+	hlen = (ip4->version_ihl & IPV4_HDR_IHL_MASK) * IPV4_IHL_MULTIPLIER;
+
+	/* bytes of the first segment that follow the L2 header */
+	avail = rte_pktmbuf_data_len(m);
+	avail -= l2;
+
+	if (frag != 0 && is_ipv4_frag(ip4))
+		m->packet_type = (m->packet_type & ~RTE_PTYPE_L4_MASK) |
+			RTE_PTYPE_L4_FRAG;
+
+	bad_len = hlen > avail;
+	bad_proto = proto <= IPPROTO_MAX && ip4->next_proto_id != proto;
+	if (bad_len || bad_proto)
+		m->packet_type = RTE_PTYPE_UNKNOWN;
+
+	return hlen;
+}
+
+/*
+ * Walk the IPv6 extension-header chain and return the total L3 length
+ * (fixed IPv6 header plus every extension header that was skipped).
+ *
+ * *fproto is an in/out value: when it is 0 on entry it receives the first
+ * TCP/UDP/ICMPv6 protocol number found; a non-zero value is left untouched.
+ * On a fragment header the offset of that header is stashed in m->tso_segsz
+ * and the L4 part of m->packet_type becomes RTE_PTYPE_L4_FRAG.
+ * m->packet_type is reset to RTE_PTYPE_UNKNOWN when *fproto is still 0 at
+ * the end or the computed length exceeds the first segment.
+ */
+static inline uint32_t
+get_ipv6x_hdr_len(struct rte_mbuf *m, uint32_t l2, uint32_t *fproto)
+{
+ const struct ipv6_hdr *ip6h;
+ const struct ip6_ext *ipx;
+ uint32_t nproto;
+ int32_t dlen, len, ofs;
+
+ ip6h = rte_pktmbuf_mtod_offset(m, struct ipv6_hdr*, l2);
+ nproto = ip6h->proto;
+ len = sizeof(struct ipv6_hdr);
+
+ dlen = rte_pktmbuf_data_len(m);
+ dlen -= l2;
+
+ ofs = l2 + len;
+ ipx = rte_pktmbuf_mtod_offset(m, const struct ip6_ext *, ofs);
+
+ /* ofs doubles as "size of the current extension header"; 0 ends the walk */
+ while (ofs > 0 && len < dlen) {
+ switch (nproto) {
+ case IPPROTO_HOPOPTS:
+ case IPPROTO_ROUTING:
+ case IPPROTO_DSTOPTS:
+ /* length field counts 8-byte units, not including the first 8 bytes */
+ ofs = (ipx->ip6e_len + 1) << 3;
+ break;
+ case IPPROTO_AH:
+ /* AH length field counts 4-byte units minus 2 */
+ ofs = (ipx->ip6e_len + 2) << 2;
+ break;
+ case IPPROTO_FRAGMENT:
+ /*
+ * tso_segsz is not used by RX, so use it as temporary
+ * buffer to store the fragment offset.
+ */
+ m->tso_segsz = l2 + len;
+ ofs = sizeof(struct ip6_frag);
+ m->packet_type &= ~RTE_PTYPE_L4_MASK;
+ m->packet_type |= RTE_PTYPE_L4_FRAG;
+ break;
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_ICMPV6:
+ ofs = 0;
+ if (*fproto == 0)
+ *fproto = nproto;
+ break;
+ default:
+ ofs = 0;
+ }
+
+ if (ofs > 0) {
+ nproto = ipx->ip6e_nxt;
+ len += ofs;
+ /* pointer arithmetic is in units of sizeof(*ipx), hence the division */
+ ipx += ofs / sizeof(*ipx);
+ }
+ }
+
+ /* unrecognized or invalid packet. */
+ if (*fproto == 0 || len > dlen)
+ m->packet_type = RTE_PTYPE_UNKNOWN;
+
+ return len;
+}
+
+/*
+ * Return the IPv6 L3 length of the packet whose IPv6 header starts l2 bytes
+ * into the first segment.
+ *
+ * Fast path: when the fixed header's next-header field already equals
+ * fproto there are no extension headers and the length is
+ * sizeof(struct ipv6_hdr). Otherwise the extension chain is walked by
+ * get_ipv6x_hdr_len(), which may also update m->packet_type.
+ *
+ * The header is located with the caller supplied l2 offset rather than a
+ * hard-coded sizeof(struct ether_hdr), so both paths look at the same header.
+ */
+static inline uint32_t
+get_ipv6_hdr_len(struct rte_mbuf *m, uint32_t l2, uint32_t fproto)
+{
+	const struct ipv6_hdr *iph;
+
+	iph = rte_pktmbuf_mtod_offset(m, const struct ipv6_hdr *, l2);
+
+	if (iph->proto == fproto)
+		return sizeof(struct ipv6_hdr);
+
+	return get_ipv6x_hdr_len(m, l2, &fproto);
+}
+
+/*
+ * Feed one IPv4 fragment to the reassembly table of ctx.
+ * Returns NULL while the datagram is incomplete (or on reassembly failure);
+ * otherwise returns the mbuf handed back by the reassembly library, with the
+ * L4 part of packet_type set to TCP or UDP according to next_proto_id
+ * (other protocols leave packet_type as it is).
+ * Up to 3 mbufs from the death row are released on every call.
+ */
+static inline struct rte_mbuf*
+process_ipv4_frag(struct rte_mbuf *m, struct glue_ctx *ctx,
+	uint32_t l2_len, uint32_t l3_len)
+{
+	struct ipv4_hdr *ip4;
+	uint32_t l4_type;
+
+	m->l2_len = l2_len;
+	m->l3_len = l3_len;
+	ip4 = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr*, m->l2_len);
+
+	/* fixme: ip checksum should be checked here.
+	 * After reassemble, the ip checksum would be invalid.
+	 */
+	m = rte_ipv4_frag_reassemble_packet(ctx->frag_tbl,
+		&ctx->frag_dr, m, rte_rdtsc(), ip4);
+	rte_ip_frag_free_death_row(&ctx->frag_dr, 3);
+	if (m == NULL)
+		return NULL;
+
+	ip4 = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr*, m->l2_len);
+	if (ip4->next_proto_id == IPPROTO_TCP)
+		l4_type = RTE_PTYPE_L4_TCP;
+	else if (ip4->next_proto_id == IPPROTO_UDP)
+		l4_type = RTE_PTYPE_L4_UDP;
+	else
+		return m;
+
+	m->packet_type = (m->packet_type & ~RTE_PTYPE_L4_MASK) | l4_type;
+	return m;
+}
+
+/*
+ * Feed one IPv6 fragment to the reassembly table of ctx.
+ * The fragment extension header is located through m->tso_segsz, where
+ * get_ipv6x_hdr_len() stored its offset.
+ * Returns NULL while the datagram is incomplete (or on reassembly failure);
+ * otherwise returns the mbuf handed back by the reassembly library, with the
+ * L4 part of packet_type set to TCP or UDP according to the IPv6 proto field
+ * (other protocols leave packet_type as it is).
+ * Up to 3 mbufs from the death row are released on every call.
+ */
+static inline struct rte_mbuf*
+process_ipv6_frag(struct rte_mbuf *m, struct glue_ctx *ctx,
+	uint32_t l2_len, uint32_t l3_len)
+{
+	struct ipv6_hdr *ip6;
+	struct ipv6_extension_fragment *frag_hdr;
+	uint32_t l4_type;
+
+	m->l2_len = l2_len;
+	m->l3_len = l3_len;
+	ip6 = rte_pktmbuf_mtod_offset(m, struct ipv6_hdr*, l2_len);
+	frag_hdr = rte_pktmbuf_mtod_offset(m, struct ipv6_extension_fragment*,
+		m->tso_segsz);
+
+	m = rte_ipv6_frag_reassemble_packet(ctx->frag_tbl,
+		&ctx->frag_dr, m, rte_rdtsc(), ip6, frag_hdr);
+	rte_ip_frag_free_death_row(&ctx->frag_dr, 3);
+	if (m == NULL)
+		return NULL;
+
+	ip6 = rte_pktmbuf_mtod_offset(m, struct ipv6_hdr*, m->l2_len);
+	if (ip6->proto == IPPROTO_TCP)
+		l4_type = RTE_PTYPE_L4_TCP;
+	else if (ip6->proto == IPPROTO_UDP)
+		l4_type = RTE_PTYPE_L4_UDP;
+	else
+		return m;
+
+	m->packet_type = (m->packet_type & ~RTE_PTYPE_L4_MASK) | l4_type;
+	return m;
+}
+
+/*
+ * Software classifier used when the NIC reports no packet types.
+ * Parses Ethernet (skipping any VLAN tags), then IPv4 or IPv6, reassembles
+ * fragments, and for TCP/UDP stores the ptype and the L2/L3/L4 lengths in
+ * the mbuf.
+ *
+ * Returns the mbuf to pass up the stack (possibly a different mbuf after
+ * reassembly), or whatever arp_recv()/icmp_recv()/icmp6_recv() return for
+ * packets handed to them, or NULL when the packet was dropped (freed here)
+ * or is an incomplete fragment.
+ */
+static inline struct rte_mbuf *
+fill_ptypes_and_hdr_len(struct glue_ctx *ctx, struct rte_mbuf *m)
+{
+ uint32_t dlen, l2_len, l3_len, l4_len, proto;
+ const struct ether_hdr *eth;
+ uint32_t ptypes;
+ uint16_t etp;
+ int32_t error = 0;
+
+ dlen = rte_pktmbuf_data_len(m);
+
+ /* L2 */
+ l2_len = sizeof(*eth);
+
+ eth = rte_pktmbuf_mtod(m, const struct ether_hdr *);
+ etp = eth->ether_type;
+ /* etp stays in network byte order; the constants are swapped instead */
+ /* NOTE(review): the VLAN walk is not bounded by dlen - confirm that
+ * malformed frames cannot make it read past the received data. */
+ while (etp == rte_be_to_cpu_16(ETHER_TYPE_VLAN)) {
+ etp = rte_pktmbuf_mtod_offset(m, struct vlan_hdr*, l2_len)->eth_proto;
+ l2_len += sizeof(struct vlan_hdr);
+ }
+
+ if (etp == rte_be_to_cpu_16(ETHER_TYPE_ARP))
+ return arp_recv(ctx, m, l2_len);
+
+ if (etp == rte_be_to_cpu_16(ETHER_TYPE_IPv4)) {
+ const struct ipv4_hdr *hdr;
+
+ /* L3 */
+ /* NOTE(review): unlike the IPv6 branch there is no minimum-length
+ * check against dlen before the IPv4 header is read. */
+ hdr = rte_pktmbuf_mtod_offset(m, const struct ipv4_hdr *, l2_len);
+ error = adjust_ipv4_pktlen(m, l2_len);
+ if (error) {
+ rte_pktmbuf_free(m);
+ return NULL;
+ }
+ /* IPPROTO_MAX + 1 disables the protocol check in get_ipv4_hdr_len() */
+ l3_len = get_ipv4_hdr_len(m, l2_len, IPPROTO_MAX + 1, 1);
+
+ if ((m->packet_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_FRAG) {
+ m = process_ipv4_frag(m, ctx, l2_len, l3_len);
+ if (m == NULL)
+ return NULL;
+ /* m may now be a different mbuf, so re-read the header from it */
+ hdr = rte_pktmbuf_mtod_offset(m, const struct ipv4_hdr*,
+ m->l2_len);
+ l3_len = get_ipv4_hdr_len(m, m->l2_len,
+ IPPROTO_MAX + 1, 0);
+ }
+
+ /* L4 */
+ switch (hdr->next_proto_id) {
+ case IPPROTO_ICMP:
+ return icmp_recv(ctx, m, l2_len, l3_len);
+ case IPPROTO_TCP:
+ ptypes = RTE_PTYPE_L4_TCP |
+ RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L2_ETHER;
+ l4_len = get_tcp_header_size(m, l2_len, l3_len);
+ break;
+ case IPPROTO_UDP:
+ ptypes = RTE_PTYPE_L4_UDP |
+ RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L2_ETHER;
+ l4_len = sizeof(struct udp_hdr);
+ break;
+ default:
+ GLUE_LOG(ERR, "drop ipv4 pkt of unknow L4: (%d)",
+ hdr->next_proto_id);
+ rte_pktmbuf_free(m);
+ return NULL;
+ }
+
+ } else if (etp == rte_be_to_cpu_16(ETHER_TYPE_IPv6) &&
+ dlen >= l2_len + sizeof(struct ipv6_hdr) + sizeof(struct udp_hdr)) {
+ /* L3 */
+ error = adjust_ipv6_pktlen(m, l2_len);
+ if (error) {
+ rte_pktmbuf_free(m);
+ return NULL;
+ }
+ /* proto == 0 asks get_ipv6x_hdr_len() to report the L4 protocol */
+ proto = 0;
+ l3_len = get_ipv6x_hdr_len(m, l2_len, &proto);
+
+ if ((m->packet_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_FRAG) {
+ m = process_ipv6_frag(m, ctx, l2_len, l3_len);
+ if (m == NULL)
+ return NULL;
+ l3_len = get_ipv6x_hdr_len(m, m->l2_len, &proto);
+ }
+
+ /* L4 */
+ switch (proto) {
+ case IPPROTO_TCP:
+ ptypes = RTE_PTYPE_L4_TCP |
+ RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L2_ETHER;
+ l4_len = get_tcp_header_size(m, l2_len, l3_len);
+ break;
+ case IPPROTO_UDP:
+ ptypes = RTE_PTYPE_L4_UDP |
+ RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L2_ETHER;
+ l4_len = sizeof(struct udp_hdr);
+ break;
+ case IPPROTO_ICMPV6:
+ return icmp6_recv(ctx, m, l2_len, l3_len);
+ default:
+ GLUE_DEBUG("drop ipv6 pkt of unknown L4: (%x)", proto);
+ rte_pktmbuf_free(m);
+ return NULL;
+ }
+ } else {
+ GLUE_DEBUG("Drop unknown L3 packet: %x", etp);
+ rte_pktmbuf_free(m);
+ return NULL;
+ }
+
+ /* only TCP/UDP reach this point; the helper-set ptype is overwritten */
+ m->packet_type = ptypes;
+ error = fill_pkt_hdr_len(m, l2_len, l3_len, l4_len);
+ if (error) {
+ rte_pktmbuf_free(m);
+ return NULL;
+ }
+
+ return m;
+}
+
+/*
+ * Exclude NULLs from the final list of packets.
+ * Scans pkt[] from the end; each run of consecutive NULL entries is closed
+ * by shifting the entries after it down. The relative order of the
+ * remaining packets is preserved.
+ * nb_zero is the number of NULL entries the caller counted; the scan stops
+ * as soon as that many have been removed.
+ * Returns the new number of valid entries at the front of pkt[].
+ */
+static inline uint32_t
+compress_pkt_list(struct rte_mbuf *pkt[], uint32_t nb_pkt, uint32_t nb_zero)
+{
+ uint32_t i, j, k, l;
+
+ for (j = nb_pkt; nb_zero != 0 && j-- != 0; ) {
+
+ /* found a hole. */
+ if (pkt[j] == NULL) {
+
+ /* find how big is it. */
+ /* i ends on the last non-NULL index below the hole, or wraps
+ * to UINT32_MAX when the hole reaches index 0 (i + 1 is then 0) */
+ for (i = j; i-- != 0 && pkt[i] == NULL; )
+ ;
+ /* fill the hole. */
+ for (k = j + 1, l = i + 1; k != nb_pkt; k++, l++)
+ pkt[l] = pkt[k];
+
+ /* j - i is the number of NULL entries just removed */
+ nb_pkt -= j - i;
+ nb_zero -= j - i;
+ j = i + 1;
+ }
+ }
+
+ return nb_pkt;
+}
+
+/*
+ * Shared tail of the HW-assisted RX callbacks: given the masked packet type
+ * tp of m, reassemble fragments, then validate the packet length and store
+ * the L2/L3/L4 header lengths for TCP/UDP over IPv4/IPv6.
+ *
+ * L2 is assumed to be a plain Ethernet header (sizeof(struct ether_hdr)).
+ * Returns the mbuf to keep (after reassembly this can be a different mbuf
+ * than the one passed in), or NULL when the packet was dropped (freed here)
+ * or is an incomplete fragment.
+ */
+static inline struct rte_mbuf *
+common_fill_hdr_len(struct rte_mbuf *m, uint32_t tp, struct glue_ctx *ctx)
+{
+ uint32_t l4_len, l3_len, l2_len = sizeof(struct ether_hdr);
+ int32_t error = 0;
+
+ /* first pass: L3-only types may be fragments; try to reassemble them */
+ switch (tp) {
+ /* possibly fragmented packets. */
+ case (RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L2_ETHER):
+ case (RTE_PTYPE_L3_IPV4_EXT | RTE_PTYPE_L2_ETHER):
+ l3_len = get_ipv4_hdr_len(m, l2_len, IPPROTO_MAX + 1, 1);
+ if ((m->packet_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_FRAG) {
+ m = process_ipv4_frag(m, ctx, l2_len, l3_len);
+ if (m == NULL)
+ return NULL;
+ /* re-derive tp from the reassembled packet */
+ tp = m->packet_type & (RTE_PTYPE_L2_MASK |
+ RTE_PTYPE_L3_MASK |
+ RTE_PTYPE_L4_MASK);
+ }
+ break;
+ case (RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L2_ETHER):
+ case (RTE_PTYPE_L3_IPV6_EXT | RTE_PTYPE_L2_ETHER):
+ l3_len = get_ipv6_hdr_len(m, l2_len, IPPROTO_MAX + 1);
+ if ((m->packet_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_FRAG) {
+ m = process_ipv6_frag(m, ctx, l2_len, l3_len);
+ if (m == NULL)
+ return NULL;
+ tp = m->packet_type & (RTE_PTYPE_L2_MASK |
+ RTE_PTYPE_L3_MASK |
+ RTE_PTYPE_L4_MASK);
+ }
+ break;
+ }
+
+ /* second pass: compute header lengths; anything not TCP/UDP is dropped */
+ switch (tp) {
+ /* non fragmented tcp packets. */
+ case (RTE_PTYPE_L4_TCP | RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L2_ETHER):
+ l3_len = sizeof(struct ipv4_hdr);
+ l4_len = get_tcp_header_size(m, l2_len, l3_len);
+ error = adjust_ipv4_pktlen(m, l2_len);
+ break;
+ case (RTE_PTYPE_L4_TCP | RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L2_ETHER):
+ l3_len = sizeof(struct ipv6_hdr);
+ l4_len = get_tcp_header_size(m, l2_len, l3_len);
+ error = adjust_ipv6_pktlen(m, l2_len);
+ break;
+ case (RTE_PTYPE_L4_TCP | RTE_PTYPE_L3_IPV4_EXT | RTE_PTYPE_L2_ETHER):
+ l3_len = get_ipv4_hdr_len(m, l2_len,
+ IPPROTO_TCP, 0);
+ l4_len = get_tcp_header_size(m, l2_len, l3_len);
+ error = adjust_ipv4_pktlen(m, l2_len);
+ break;
+ case (RTE_PTYPE_L4_TCP | RTE_PTYPE_L3_IPV6_EXT | RTE_PTYPE_L2_ETHER):
+ l3_len = get_ipv6_hdr_len(m, l2_len, IPPROTO_TCP);
+ l4_len = get_tcp_header_size(m, l2_len, l3_len);
+ error = adjust_ipv6_pktlen(m, l2_len);
+ break;
+
+ /* non fragmented udp packets. */
+ case (RTE_PTYPE_L4_UDP | RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L2_ETHER):
+ l3_len = sizeof(struct ipv4_hdr);
+ l4_len = sizeof(struct udp_hdr);
+ error = adjust_ipv4_pktlen(m, l2_len);
+ break;
+ case (RTE_PTYPE_L4_UDP | RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L2_ETHER):
+ l3_len = sizeof(struct ipv6_hdr);
+ l4_len = sizeof(struct udp_hdr);
+ error = adjust_ipv6_pktlen(m, l2_len);
+ break;
+ case (RTE_PTYPE_L4_UDP | RTE_PTYPE_L3_IPV4_EXT | RTE_PTYPE_L2_ETHER):
+ l3_len = get_ipv4_hdr_len(m, l2_len,
+ IPPROTO_UDP, 0);
+ l4_len = sizeof(struct udp_hdr);
+ error = adjust_ipv4_pktlen(m, l2_len);
+ break;
+ case (RTE_PTYPE_L4_UDP | RTE_PTYPE_L3_IPV6_EXT | RTE_PTYPE_L2_ETHER):
+ l3_len = get_ipv6_hdr_len(m, l2_len, IPPROTO_UDP);
+ l4_len = sizeof(struct udp_hdr);
+ error = adjust_ipv6_pktlen(m, l2_len);
+ break;
+ default:
+ GLUE_LOG(ERR, "drop unknown pkt");
+ rte_pktmbuf_free(m);
+ return NULL;
+ }
+
+ if (error) {
+ rte_pktmbuf_free(m);
+ return NULL;
+ }
+ error = fill_pkt_hdr_len(m, l2_len, l3_len, l4_len);
+ if (error) {
+ rte_pktmbuf_free(m);
+ return NULL;
+ }
+ return m;
+}
+
+
+/*
+ * RX callback for NICs that can recognize L2-arp/L3 with/without
+ * extensions/L4 (i40e).
+ *
+ * ARP and ICMP/ICMPv6 packets are handed to arp_recv()/icmp_recv()/
+ * icmp6_recv() and removed from the burst; everything else goes through
+ * common_fill_hdr_len(). user_param is the struct glue_ctx of the queue.
+ * Returns the number of packets left at the front of pkt[].
+ *
+ * The mbuf returned by common_fill_hdr_len() is stored back into pkt[j]:
+ * after fragment reassembly it can be a different mbuf than the one that
+ * was passed in, and the burst must carry the returned one.
+ *
+ * NOTE(review): the return values of arp_recv()/icmp_recv()/icmp6_recv()
+ * are ignored here, while fill_ptypes_and_hdr_len() passes them up -
+ * confirm these helpers always consume the mbuf on this path.
+ */
+static uint16_t
+type0_rx_callback(uint16_t port,
+	uint16_t queue,
+	struct rte_mbuf *pkt[],
+	uint16_t nb_pkts,
+	uint16_t max_pkts,
+	void *user_param)
+{
+	uint32_t j, tp, l2_len, l3_len;
+	struct glue_ctx *ctx;
+	uint16_t nb_zero = 0;
+
+	RTE_SET_USED(port);
+	RTE_SET_USED(queue);
+	RTE_SET_USED(max_pkts);
+
+	ctx = user_param;
+
+	for (j = 0; j != nb_pkts; j++) {
+		tp = pkt[j]->packet_type & (RTE_PTYPE_L4_MASK |
+			RTE_PTYPE_L3_MASK | RTE_PTYPE_L2_MASK);
+
+		switch (tp) {
+		case (RTE_PTYPE_L2_ETHER_ARP):
+			arp_recv(ctx, pkt[j], sizeof(struct ether_hdr));
+			pkt[j] = NULL;
+			nb_zero++;
+			break;
+		case (RTE_PTYPE_L4_ICMP | RTE_PTYPE_L3_IPV4 |
+				RTE_PTYPE_L2_ETHER):
+		case (RTE_PTYPE_L4_ICMP | RTE_PTYPE_L3_IPV4_EXT |
+				RTE_PTYPE_L2_ETHER):
+			l2_len = sizeof(struct ether_hdr);
+			l3_len = get_ipv4_hdr_len(pkt[j], l2_len, IPPROTO_ICMP, 0);
+			icmp_recv(ctx, pkt[j], l2_len, l3_len);
+			pkt[j] = NULL;
+			nb_zero++;
+			break;
+		case (RTE_PTYPE_L4_ICMP | RTE_PTYPE_L3_IPV6 |
+				RTE_PTYPE_L2_ETHER):
+		case (RTE_PTYPE_L4_ICMP | RTE_PTYPE_L3_IPV6_EXT |
+				RTE_PTYPE_L2_ETHER):
+			l2_len = sizeof(struct ether_hdr);
+			l3_len = get_ipv6_hdr_len(pkt[j], l2_len, IPPROTO_ICMPV6);
+			icmp6_recv(ctx, pkt[j], l2_len, l3_len);
+			pkt[j] = NULL;
+			nb_zero++;
+			break;
+		default:
+			/* keep whatever mbuf the helper hands back (or NULL) */
+			pkt[j] = common_fill_hdr_len(pkt[j], tp, ctx);
+			if (pkt[j] == NULL)
+				nb_zero++;
+			break;
+		}
+	}
+
+	if (nb_zero == 0)
+		return nb_pkts;
+
+	return compress_pkt_list(pkt, nb_pkts, nb_zero);
+}
+
+/*
+ * RX callback for NICs that can recognize L2/L3/L4 and fragments, but
+ * cannot recognize ARP nor ICMP (ixgbe).
+ *
+ * Packets reported as L2-only are checked for ARP in software; packets
+ * reported as L3-only are checked for ICMP/ICMPv6. Matching packets are
+ * handed to arp_recv()/icmp_recv()/icmp6_recv(); non-matching ones are
+ * freed. Everything else goes through common_fill_hdr_len().
+ * user_param is the struct glue_ctx of the queue.
+ * Returns the number of packets left at the front of pkt[].
+ *
+ * Two ownership rules are enforced here:
+ *  - an L2-only packet that is not ARP is freed before its slot is cleared,
+ *    otherwise the mbuf would leak;
+ *  - the mbuf returned by common_fill_hdr_len() is stored back into pkt[j],
+ *    since after reassembly it can differ from the one passed in.
+ *
+ * NOTE(review): L3-only packets are consumed here before they can reach the
+ * fragment handling in common_fill_hdr_len() - confirm this is intended.
+ */
+static uint16_t
+type1_rx_callback(uint16_t port,
+	uint16_t queue,
+	struct rte_mbuf *pkt[],
+	uint16_t nb_pkts,
+	uint16_t max_pkts,
+	void *user_param)
+{
+	uint32_t j, tp, l2_len, l3_len;
+	struct glue_ctx *ctx;
+	uint16_t nb_zero = 0;
+	const struct ether_hdr *eth;
+	const struct ipv4_hdr *ip4;
+	const struct ipv6_hdr *ip6;
+
+	RTE_SET_USED(port);
+	RTE_SET_USED(queue);
+	RTE_SET_USED(max_pkts);
+
+	ctx = user_param;
+
+	for (j = 0; j != nb_pkts; j++) {
+		tp = pkt[j]->packet_type & (RTE_PTYPE_L4_MASK | RTE_PTYPE_L3_MASK |
+			RTE_PTYPE_L2_MASK);
+
+		switch (tp) {
+		case RTE_PTYPE_L2_ETHER:
+			eth = rte_pktmbuf_mtod(pkt[j], const struct ether_hdr *);
+			/* ether_type is big endian on the wire */
+			if (eth->ether_type == rte_cpu_to_be_16(ETHER_TYPE_ARP))
+				arp_recv(ctx, pkt[j], sizeof(*eth));
+			else
+				rte_pktmbuf_free(pkt[j]);
+
+			pkt[j] = NULL;
+			nb_zero++;
+			break;
+		case (RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L2_ETHER):
+		case (RTE_PTYPE_L3_IPV4_EXT | RTE_PTYPE_L2_ETHER):
+			ip4 = rte_pktmbuf_mtod_offset(pkt[j],
+				const struct ipv4_hdr *,
+				sizeof(*eth));
+			if (ip4->next_proto_id == IPPROTO_ICMP) {
+				l2_len = sizeof(struct ether_hdr);
+				l3_len = get_ipv4_hdr_len(pkt[j], l2_len,
+					IPPROTO_ICMP, 0);
+				icmp_recv(ctx, pkt[j], l2_len, l3_len);
+			} else
+				rte_pktmbuf_free(pkt[j]);
+
+			pkt[j] = NULL;
+			nb_zero++;
+			break;
+		case (RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L2_ETHER):
+		case (RTE_PTYPE_L3_IPV6_EXT | RTE_PTYPE_L2_ETHER):
+			ip6 = rte_pktmbuf_mtod_offset(pkt[j],
+				const struct ipv6_hdr *,
+				sizeof(*eth));
+			if (ip6->proto == IPPROTO_ICMPV6) {
+				l2_len = sizeof(struct ether_hdr);
+				l3_len = get_ipv6_hdr_len(pkt[j], l2_len,
+					IPPROTO_ICMPV6);
+				icmp6_recv(ctx, pkt[j], l2_len, l3_len);
+			} else
+				rte_pktmbuf_free(pkt[j]);
+
+			pkt[j] = NULL;
+			nb_zero++;
+			break;
+		default:
+			/* keep whatever mbuf the helper hands back (or NULL) */
+			pkt[j] = common_fill_hdr_len(pkt[j], tp, ctx);
+			if (pkt[j] == NULL)
+				nb_zero++;
+			break;
+		}
+	}
+
+	if (nb_zero == 0)
+		return nb_pkts;
+
+	return compress_pkt_list(pkt, nb_pkts, nb_zero);
+}
+
+/*
+ * Generic RX callback: assumes HW doesn't recognize any packet type, so
+ * every packet is classified in software by fill_ptypes_and_hdr_len().
+ * user_param is the struct glue_ctx of the queue.
+ * Returns the number of packets left at the front of pkt[].
+ */
+uint16_t
+typen_rx_callback(uint16_t port,
+	uint16_t queue,
+	struct rte_mbuf *pkt[],
+	uint16_t nb_pkts,
+	uint16_t max_pkts,
+	void *user_param)
+{
+	struct glue_ctx *ctx = user_param;
+	uint16_t dropped = 0;
+	uint32_t i;
+
+	RTE_SET_USED(port);
+	RTE_SET_USED(queue);
+	RTE_SET_USED(max_pkts);
+
+	for (i = 0; i != nb_pkts; i++) {
+		struct rte_mbuf *mb = pkt[i];
+
+		/* fix me: now we avoid checking ip checksum */
+		mb->ol_flags &= (~PKT_RX_IP_CKSUM_BAD);
+		/* discard whatever type the driver filled in */
+		mb->packet_type = 0;
+
+		pkt[i] = fill_ptypes_and_hdr_len(ctx, mb);
+		if (pkt[i] == NULL)
+			dropped++;
+	}
+
+	if (dropped != 0)
+		return compress_pkt_list(pkt, nb_pkts, dropped);
+
+	return nb_pkts;
+}
+
+/*
+ * Query which packet types the driver of port_id can classify and fold the
+ * answer into a bitmask of *_PTYPE flags.
+ * Returns 0 when the query fails or the driver reports no packet types.
+ */
+static uint32_t
+get_ptypes(uint16_t port_id)
+{
+	uint32_t smask;
+	int32_t i, n, rc;
+	const uint32_t pmask =
+		RTE_PTYPE_L2_MASK | RTE_PTYPE_L3_MASK | RTE_PTYPE_L4_MASK;
+
+	smask = 0;
+	/* a NULL array only asks how many entries there are */
+	n = rte_eth_dev_get_supported_ptypes(port_id, pmask, NULL, 0);
+	if (n < 0) {
+		RTE_LOG(ERR, USER1,
+			"%s(port=%u) failed to get supported ptypes;\n",
+			__func__, port_id);
+		return smask;
+	}
+
+	/* nothing reported; also avoids a zero-length VLA, which is undefined */
+	if (n == 0)
+		return smask;
+
+	uint32_t ptype[n];
+	rc = rte_eth_dev_get_supported_ptypes(port_id, pmask, ptype, n);
+	/* never read more entries than the array holds */
+	if (rc > n)
+		rc = n;
+
+	/* "<" rather than "!=" so a negative rc from the second call is harmless */
+	for (i = 0; i < rc; i++) {
+		switch (ptype[i]) {
+		case RTE_PTYPE_L2_ETHER_ARP:
+			smask |= ETHER_ARP_PTYPE;
+			break;
+		case RTE_PTYPE_L3_IPV4:
+		case RTE_PTYPE_L3_IPV4_EXT_UNKNOWN:
+			smask |= IPV4_PTYPE;
+			break;
+		case RTE_PTYPE_L3_IPV4_EXT:
+			smask |= IPV4_EXT_PTYPE;
+			break;
+		case RTE_PTYPE_L3_IPV6:
+		case RTE_PTYPE_L3_IPV6_EXT_UNKNOWN:
+			smask |= IPV6_PTYPE;
+			break;
+		case RTE_PTYPE_L3_IPV6_EXT:
+			smask |= IPV6_EXT_PTYPE;
+			break;
+		case RTE_PTYPE_L4_TCP:
+			smask |= TCP_PTYPE;
+			break;
+		case RTE_PTYPE_L4_UDP:
+			smask |= UDP_PTYPE;
+			break;
+		case RTE_PTYPE_L4_ICMP:
+			smask |= ICMP_PTYPE;
+			break;
+		}
+	}
+
+	return smask;
+}
+
+/* In rx callbacks, we need to check and make sure below things are done,
+ * either by hw or by sw:
+ * 1. filter out arp packets, and handle arp packets properly
+ * - for arp request packet, reply arp if it's requesting myself.
+ * 2. fill l2, l3, l4 header length
+ *
+ * 3. GSO/GRO setup (TODO)
+ *
+ * setup_rx_cb() picks the first table entry whose required packet types
+ * are all reported by the port and registers its callback on (port_id, qid)
+ * with the queue's glue_ctx as user parameter. The last entry has an empty
+ * mask, so it matches any port.
+ *
+ * Returns 0 on success, -EINVAL when no context exists for the queue, a
+ * negative errno when the callback could not be registered, or -ENOENT when
+ * no table entry matched.
+ */
+int
+setup_rx_cb(uint16_t port_id, uint16_t qid)
+{
+	int32_t rc;
+	uint32_t i, n, smask;
+	const void *cb;
+	struct glue_ctx *ctx;
+	const struct ptype2cb *ptype2cb;
+
+	static const struct ptype2cb tcp_arp_ptype2cb[] = {
+		{ /* i40e */
+			.mask = ETHER_ARP_PTYPE |
+				ICMP_PTYPE |
+				IPV4_PTYPE | IPV4_EXT_PTYPE |
+				IPV6_PTYPE | IPV6_EXT_PTYPE |
+				TCP_PTYPE | UDP_PTYPE,
+			.name = "HW l2-arp/l3x/l4-tcp ptype",
+			.fn = type0_rx_callback,
+		},
+		{ /* ixgbe does not support ARP ptype */
+			.mask = IPV4_PTYPE | IPV4_EXT_PTYPE |
+				IPV6_PTYPE | IPV6_EXT_PTYPE |
+				TCP_PTYPE | UDP_PTYPE,
+			.name = "HW l3x/l4-tcp ptype",
+			.fn = type1_rx_callback,
+		},
+		{ /* virtio */
+			.mask = 0,
+			.name = "HW does not support any ptype",
+			.fn = typen_rx_callback,
+		},
+	};
+
+	ctx = glue_ctx_lookup(port_id, qid);
+	if (ctx == NULL) {
+		GLUE_LOG(ERR, "no ctx found by port(%d) and queue (%d)",
+			port_id, qid);
+		return -EINVAL;
+	}
+
+	smask = get_ptypes(port_id);
+
+	ptype2cb = tcp_arp_ptype2cb;
+	n = RTE_DIM(tcp_arp_ptype2cb);
+
+	for (i = 0; i != n; i++) {
+		if ((smask & ptype2cb[i].mask) != ptype2cb[i].mask)
+			continue;
+
+		cb = rte_eth_add_rx_callback(port_id, qid,
+			ptype2cb[i].fn, ctx);
+		if (cb == NULL) {
+			/* rte_errno is only meaningful after a failure; never
+			 * let a failure be reported as 0 */
+			rc = (rte_errno != 0) ? -rte_errno : -EINVAL;
+			GLUE_LOG(ERR,
+				"%s(port=%u), failed to setup RX callback \"%s\": %d",
+				__func__, port_id, ptype2cb[i].name, rc);
+			return rc;
+		}
+
+		/* success is informational, not an error */
+		GLUE_LOG(INFO, "%s(port=%u), setup RX callback \"%s\";",
+			__func__, port_id, ptype2cb[i].name);
+		return 0;
+	}
+
+	GLUE_LOG(ERR, "%s(port=%u) failed to find an appropriate callback",
+		__func__, port_id);
+	return -ENOENT;
+}
diff --git a/lib/libtle_glue/rxtx.c b/lib/libtle_glue/rxtx.c
new file mode 100644
index 0000000..b80a3ac
--- /dev/null
+++ b/lib/libtle_glue/rxtx.c
@@ -0,0 +1,573 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "sym.h"
+
+#include <rte_common.h>
+#include <rte_mbuf.h>
+#include <rte_ip.h>
+#include <rte_udp.h>
+#include <rte_atomic.h>
+
+#include <tle_tcp.h>
+
+#include <stddef.h>
+#include <fcntl.h>
+
+#include "tle_glue.h"
+#include "fd.h"
+#include "util.h"
+#include "internal.h"
+
+rte_atomic32_t thr_cnt;
+
+/*
+ * Largest UDP payload accepted by _send(): the 16-bit IPv4 total length
+ * (65535) minus the IPv4 and UDP headers. Note the limit is 1 << 16;
+ * 2 << 16 would be 131072 and let oversized datagrams through.
+ */
+#define MAX_UDP_PKT_LEN ((1 << 16) - 1 - sizeof(struct ipv4_hdr) - sizeof(struct udp_hdr))
+
+/*
+ * Consume up to len bytes from the mbuf chain m.
+ *
+ * needcpy != 0: the bytes are copied into buf; needcpy == 0: they are only
+ * skipped. ispeek != 0: the chain is left untouched and the original head
+ * is returned. Otherwise fully consumed segments are freed, a partially
+ * consumed segment is advanced with rte_pktmbuf_adj(), and the remaining
+ * head gets nb_segs/pkt_len updated.
+ *
+ * Returns the mbuf holding the unread remainder (NULL when the whole chain
+ * was consumed), or m itself when len == 0.
+ */
+static inline struct rte_mbuf *
+from_mbuf_to_buf(struct rte_mbuf *m, char *buf,
+ size_t len, int ispeek, int needcpy)
+{
+ void *src;
+ uint32_t done = 0;
+ uint32_t left = len, orig_pkt_len;
+ uint16_t copy_len, seg_len, segs;
+ struct rte_mbuf *m_next, *orig_pkt;
+
+ if (len == 0)
+ return m;
+
+ orig_pkt = m;
+ orig_pkt_len = m->pkt_len;
+ segs = m->nb_segs;
+
+ do {
+ seg_len = rte_pktmbuf_data_len(m);
+ copy_len = RTE_MIN(seg_len, left);
+ src = rte_pktmbuf_mtod(m, void *);
+ if (needcpy)
+ rte_memcpy(buf + done, src, copy_len);
+ done += copy_len;
+ left -= copy_len;
+ /* segment only partly consumed: keep it, drop the read prefix */
+ if (copy_len < seg_len) {
+ if (!ispeek)
+ rte_pktmbuf_adj(m, copy_len);
+ break;
+ }
+ /* fetch next before the segment is possibly freed */
+ m_next = m->next;
+ if (!ispeek) {
+ rte_pktmbuf_free_seg(m);
+ segs--;
+ }
+ m = m_next;
+ } while (left && m);
+
+ /* m is the new head: carry over the chain-wide counters */
+ if (m && !ispeek) {
+ m->nb_segs = segs;
+ m->pkt_len = orig_pkt_len - done;
+ }
+
+ if(ispeek)
+ return orig_pkt;
+ else
+ return m;
+}
+
+/*
+ * True when the last receive attempt reported EAGAIN and the socket's error
+ * event is raised, which the callers treat as "peer closed".
+ * Relies on errno still holding the value set by that receive attempt.
+ */
+static inline bool
+is_peer_closed(struct sock *so)
+{
+	return errno == EAGAIN && tle_event_state(&so->erev) == TLE_SEV_UP;
+}
+
+/*
+ * Common receive path for user-space sockets.
+ *
+ * Data left over from a previous call (so->rx_left) is served first;
+ * otherwise one mbuf is pulled with OPS(so)->recv(). When nothing is
+ * available a non-blocking socket returns at once, while a blocking one
+ * loops on epoll_kernel_wait()/be_process() until data arrives, the socket
+ * is shut down for receive, the peer closes, or an error occurs.
+ * UDP (and MSG_PEEK) return after one packet; TCP keeps pulling packets
+ * until buf is full or no more data is queued.
+ *
+ * Returns the number of bytes received, 0 on peer close / receive shutdown,
+ * or -1 with errno set.
+ *
+ * NOTE(review): the result of fd2sock() is used without validation -
+ * confirm callers only pass open descriptors.
+ */
+static ssize_t
+_recv(int sockfd, void *buf, size_t len, struct sockaddr *src_addr, int flags)
+{
+ int rx;
+ ssize_t rc;
+ ssize_t recvlen;
+ size_t tmplen;
+ struct sock *so;
+ struct rte_mbuf *m;
+ struct epoll_event event;
+ int needcpy;
+
+ /* first call from a thread without an lcore id: assign one from thr_cnt */
+ if (RTE_PER_LCORE(_lcore_id) == LCORE_ID_ANY) {
+ RTE_PER_LCORE(_lcore_id) = rte_atomic32_add_return(&thr_cnt, 1);
+ }
+
+ so = fd2sock(sockfd);
+
+ if (so->s == NULL) {
+ if (IS_UDP(so) && is_nonblock(so, flags))
+ errno = EAGAIN;
+ else
+ errno = ENOTCONN;
+ return -1;
+ }
+
+ if (so->rx_left) {
+ m = so->rx_left;
+ so->rx_left = NULL;
+ if (src_addr) {
+ OPS(so)->getname(so, src_addr, 1);
+ /* fixme: cannot get addr for UDP in this way */
+ }
+ } else {
+ rc = OPS(so)->recv(so->s, &m, 1, src_addr);
+ if (rc == 0) {
+ if (is_nonblock(so, flags)) {
+ /* socket closed, return 0 */
+ if (is_peer_closed(so)) {
+ GLUE_DEBUG("peer closed: %d", sockfd);
+ return 0;
+ }
+
+ /* According to linux stack,
+ * receive from shutdown tcp socket returns 0.
+ * And receive from shutdown udp socket generate
+ * EAGAIN. In special case, we return ESHUTDOWN
+ * to notify upper application.
+ */
+ if (so->shutdown & RECV_SHUTDOWN) {
+ if (so->proto == PROTO_TCP)
+ return 0;
+ else {
+#ifdef LOOK_ASIDE_BACKEND
+ errno = ESHUTDOWN;
+#else
+ errno = EAGAIN;
+#endif
+ return -1;
+ }
+ }
+ /* errno is left as set by the recv op above */
+ return -1;
+ }
+
+ do {
+ /* in blocking mode, recv from shutdown socket
+ * return 0 immediately */
+ if (so->shutdown & RECV_SHUTDOWN)
+ return 0;
+
+ /* some error occured, return -1 */
+ if (errno != EAGAIN)
+ return -1;
+
+ /* socket closed, return 0 */
+ if (is_peer_closed(so)) {
+ GLUE_DEBUG("peer closed: %d", sockfd);
+ return 0;
+ }
+
+ epoll_kernel_wait(CTX(so), -1, &event, 1, 1, &rx);
+
+ be_process(CTX(so));
+ } while((rc = OPS(so)->recv(so->s, &m, 1, src_addr)) == 0);
+ }
+ }
+
+ /* get one pkt */
+ if (!so->option.timestamp)
+ so->s->timestamp = m->timestamp;
+
+ needcpy = 1;
+ recvlen = RTE_MIN(m->pkt_len, len);
+ if (flags & MSG_TRUNC) {
+ if (IS_UDP(so))
+ /* report the real datagram size even if buf is shorter */
+ recvlen = m->pkt_len;
+ else
+ /* According to linux manual, data will be discarded
+ * if recv TCP stream with MSG_TRUNC flag */
+ needcpy = 0;
+ }
+
+ so->rx_left = from_mbuf_to_buf(m, buf, len, flags & MSG_PEEK, needcpy);
+
+ /* a datagram is consumed whole: drop whatever did not fit in buf */
+ if (((flags & MSG_PEEK) == 0) && IS_UDP(so) && so->rx_left) {
+ rte_pktmbuf_free(so->rx_left);
+ so->rx_left = NULL;
+ }
+
+ /* UDP socket only receive one pkt at one time */
+ if (IS_UDP(so) || (flags & MSG_PEEK)) {
+ return recvlen;
+ }
+ /* TCP socket: try best to fill buf */
+ len -= recvlen;
+ buf = (char*)buf + recvlen;
+ while (len) {
+ if (OPS(so)->recv(so->s, &m, 1, src_addr) == 0)
+ break;
+
+ tmplen = (m->pkt_len < len) ? m->pkt_len : len;
+ so->rx_left = from_mbuf_to_buf(m, buf, tmplen, 0, needcpy);
+ len -= tmplen;
+ recvlen += tmplen;
+ buf = (char*)buf + tmplen;
+ }
+
+ /* unread data remains: keep the socket readable */
+ if (so->rx_left)
+ tle_event_raise(&so->rxev);
+
+ /* may send window increase ACK after receive*/
+ if (recvlen > 0)
+ be_tx_with_lock(CTX(so));
+
+ return recvlen;
+}
+
+/*
+ * recv(2) replacement. Kernel descriptors are forwarded to the kernel's
+ * recv with the caller's flags intact (a plain read would silently drop
+ * MSG_PEEK, MSG_DONTWAIT, ...); user-space sockets go through _recv().
+ */
+ssize_t PRE(recv)(int sockfd, void *buf, size_t len, int flags)
+{
+	if (is_kernel_fd(sockfd))
+		return k_recv(sockfd, buf, len, flags);
+
+	return _recv(sockfd, buf, len, NULL, flags);
+}
+
+/*
+ * recvfrom(2) replacement for user-space sockets.
+ * On success with a non-NULL src_addr, *addrlen is set to the size of
+ * sockaddr_in or sockaddr_in6 depending on the family written by _recv().
+ * A non-NULL src_addr with a NULL addrlen fails with EINVAL.
+ *
+ * NOTE(review): for kernel descriptors the call is forwarded to k_recv(),
+ * so src_addr/addrlen are never filled in on that path - confirm.
+ */
+ssize_t PRE(recvfrom)(int sockfd, void *buf, size_t len, int flags,
+	struct sockaddr *src_addr, socklen_t *addrlen)
+{
+	ssize_t n;
+
+	if (is_kernel_fd(sockfd))
+		return k_recv(sockfd, buf, len, flags);
+
+	if (src_addr != NULL && addrlen == NULL) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	n = _recv(sockfd, buf, len, src_addr, flags);
+	if (n < 0 || src_addr == NULL)
+		return n;
+
+	if (src_addr->sa_family == AF_INET)
+		*addrlen = sizeof(struct sockaddr_in);
+	else
+		*addrlen = sizeof(struct sockaddr_in6);
+
+	return n;
+}
+
+/* try_recvmsg() result meaning "nothing read, fall back to the blocking path" */
+#define RECV_CONTINUE (-2)
+
+/*
+ * Single, non-waiting attempt to read into msg through the protocol's
+ * readv op.
+ * Returns the byte count (>= 0) on success, 0 when the peer has closed,
+ * -1 with errno set on error or when a non-blocking socket has no data,
+ * and RECV_CONTINUE when a blocking socket has no data yet.
+ */
+static inline ssize_t
+try_recvmsg(struct sock *so, struct msghdr *msg, int flags)
+{
+	ssize_t n;
+
+	if (so->s == NULL) {
+		errno = (IS_UDP(so) && is_nonblock(so, flags)) ?
+			EAGAIN : ENOTCONN;
+		return -1;
+	}
+
+	n = OPS(so)->readv(so->s, msg, flags);
+	if (n >= 0) {
+		/* may send window increase ACK after receive*/
+		if (n > 0)
+			be_tx_with_lock(CTX(so));
+		return n;
+	}
+
+	/* error occurred */
+	if (errno != EAGAIN)
+		return -1;
+
+	if (is_peer_closed(so)) {
+		GLUE_DEBUG("peer closed: %d", so->fd);
+		return 0;
+	}
+
+	if (is_nonblock(so, flags))
+		return -1;
+
+	return RECV_CONTINUE;
+}
+
+/*
+ * recvmsg(2) replacement.
+ * Kernel descriptors are forwarded to k_recvmsg(). For user-space sockets a
+ * scatter read through the protocol's readv op is tried first when no
+ * leftover data is pending; otherwise, or when that attempt asks to
+ * continue, the call falls back to recvfrom() on the first iovec only.
+ *
+ * NOTE(review): the fallback ignores msg_iov[1..] and dereferences msg and
+ * msg_iov[0] without checking msg != NULL or msg_iovlen > 0, although msg
+ * is NULL-checked a few lines earlier - confirm the intended contract.
+ */
+ssize_t PRE(recvmsg)(int sockfd, struct msghdr *msg, int flags)
+{
+ ssize_t sz;
+ struct sock *so;
+
+ if (is_kernel_fd(sockfd))
+ return k_recvmsg(sockfd, msg, flags);
+
+ so = fd2sock(sockfd);
+
+ if (so->rx_left == NULL && OPS(so)->readv &&
+ (flags & MSG_PEEK) == 0 &&
+ ((flags & MSG_TRUNC) == 0 || so->proto == PROTO_UDP)) {
+ /* udp_readv supports MSG_TRUNC, tcp_readv not yet.
+ * so only udp socket implement with readv interface.
+ */
+ sz = try_recvmsg(so, msg, flags);
+ if (sz != RECV_CONTINUE)
+ return sz;
+ }
+
+ /* 1. rx_left != NULL; 2. get no data, fall back to blocking read */
+
+ /* ancillary data: a timestamp when enabled, otherwise report none */
+ if (so->rx_left != NULL && msg != NULL && msg->msg_control != NULL) {
+ if (so->option.timestamp)
+ tle_set_timestamp(msg, so->rx_left);
+ else
+ msg->msg_controllen = 0;
+ }
+
+ sz = PRE(recvfrom)(sockfd, msg->msg_iov[0].iov_base,
+ msg->msg_iov[0].iov_len, flags,
+ (struct sockaddr *)msg->msg_name,
+ &msg->msg_namelen);
+
+ return sz;
+}
+
+/*
+ * read(2) replacement: kernel descriptors go to k_read(), user-space
+ * sockets are read through _recv() with no flags and no source address.
+ */
+ssize_t PRE(read)(int fd, void *buf, size_t count)
+{
+	if (!is_kernel_fd(fd))
+		return _recv(fd, buf, count, NULL, 0);
+
+	return k_read(fd, buf, count);
+}
+
+/* Cast away const from var via uintptr_t, yielding a value of the given type. */
+#define DECONST(type, var) ((type)(uintptr_t)(const void *)(var))
+
+/*
+ * readv(2) replacement.
+ * Kernel descriptors are forwarded to k_readv(). For user-space sockets a
+ * scatter read through the protocol's readv op is tried first when no
+ * leftover data is pending; otherwise, or when that attempt asks to
+ * continue, only the first iovec is filled through _recv().
+ *
+ * NOTE(review): iov[0] is accessed in the fallback without checking
+ * iovcnt > 0.
+ */
+ssize_t PRE(readv)(int fd, const struct iovec *iov, int iovcnt)
+{
+ ssize_t sz;
+ struct sock *so;
+ struct msghdr msg;
+
+ if (is_kernel_fd(fd))
+ return k_readv(fd, iov, iovcnt);
+
+ /* first call from a thread without an lcore id: assign one from thr_cnt */
+ if (RTE_PER_LCORE(_lcore_id) == LCORE_ID_ANY) {
+ RTE_PER_LCORE(_lcore_id) = rte_atomic32_add_return(&thr_cnt, 1);
+ }
+
+ so = fd2sock(fd);
+
+ if (so->rx_left == NULL && OPS(so)->readv) {
+ /* wrap the iovec array in a msghdr so try_recvmsg() can be reused */
+ memset(&msg, 0, sizeof(msg));
+ msg.msg_iov = DECONST(struct iovec *, iov);
+ msg.msg_iovlen = iovcnt;
+ sz = try_recvmsg(so, &msg, 0);
+ if (sz != RECV_CONTINUE)
+ return sz;
+ }
+
+ /* 1. rx_left != NULL; 2. get no data, fall back to blocking read */
+
+ /* fixme: when so->rx_left != NULL, also needs readv.
+ * maybe need to modify readv interface args of ops */
+ return _recv(fd, iov[0].iov_base, iov[0].iov_len, NULL, 0);
+}
+
+/*
+ * Common send path for user-space sockets.
+ *
+ * The payload is copied into freshly allocated mbufs of at most
+ * RTE_MBUF_DEFAULT_DATAROOM bytes each, which are then handed to
+ * OPS(so)->send(). A blocking socket keeps retrying (driving the backend
+ * with be_process()) until everything is queued, an error other than EAGAIN
+ * occurs, or the socket's error event is raised. Mbufs that could not be
+ * queued are freed.
+ *
+ * Non-blocking TCP sends are capped at def_sndbuf bytes; UDP payloads larger
+ * than MAX_UDP_PKT_LEN fail with EMSGSIZE; a zero-length TCP send returns 0
+ * while a zero-length UDP send still emits one empty mbuf.
+ *
+ * Returns the number of payload bytes accepted, or -1 with errno set when
+ * nothing was accepted.
+ *
+ * NOTE(review): nb_mbufs is uint16_t, so a blocking TCP send large enough
+ * to need more than 65535 mbufs truncates the count while the return value
+ * is still derived from len - confirm callers cannot pass such lengths.
+ * NOTE(review): so->s is not checked for NULL here, unlike in _recv().
+ */
+static ssize_t
+_send(int sockfd, const void *buf, size_t len,
+ const struct sockaddr *peer, int flags)
+{
+ struct sock *so = fd2sock(sockfd);
+ struct rte_mempool *mp = get_mempool_by_socket(0); /* fix me */
+ uint16_t nb_mbufs = (len + RTE_MBUF_DEFAULT_DATAROOM - 1)
+ / RTE_MBUF_DEFAULT_DATAROOM;
+ uint16_t i, cnt, copy_len;
+ int rc;
+ /* + 1 leaves room for the single mbuf of a zero-length UDP send */
+ struct rte_mbuf *mbufs[nb_mbufs + 1];
+ size_t done = 0;
+ uint32_t left = 0;
+ char *dst;
+ int blocking = !is_nonblock(so, flags);
+
+ /* first call from a thread without an lcore id: assign one from thr_cnt */
+ if (RTE_PER_LCORE(_lcore_id) == LCORE_ID_ANY) {
+ RTE_PER_LCORE(_lcore_id) = rte_atomic32_add_return(&thr_cnt, 1);
+ }
+
+ if (!blocking && len > def_sndbuf && so->proto == PROTO_TCP) {
+ len = def_sndbuf;
+ nb_mbufs = (len + RTE_MBUF_DEFAULT_DATAROOM - 1)
+ / RTE_MBUF_DEFAULT_DATAROOM;
+ }
+
+ if (unlikely(len == 0)) {
+ if (so->proto == PROTO_TCP)
+ return 0;
+ else
+ nb_mbufs = 1;
+ }
+
+ if (unlikely(len > MAX_UDP_PKT_LEN && IS_UDP(so))) {
+ errno = EMSGSIZE;
+ return -1;
+ }
+
+ if (blocking)
+ be_process(get_ctx());
+
+ if (unlikely(rte_pktmbuf_alloc_bulk(mp, mbufs, nb_mbufs) < 0)) {
+ errno = ENOMEM;
+ return -1;
+ }
+
+ /* copy the payload, one data-room sized chunk per mbuf */
+ for (i = 0; i < nb_mbufs; ++i) {
+ copy_len = RTE_MIN((size_t)RTE_MBUF_DEFAULT_DATAROOM,
+ len - done);
+ dst = rte_pktmbuf_mtod(mbufs[i], char *);
+ rte_memcpy(dst, (const char *)buf + done, copy_len);
+ done += copy_len;
+ mbufs[i]->data_len = copy_len;
+ mbufs[i]->pkt_len = copy_len;
+ }
+
+ cnt = 0;
+do_send:
+ rc = OPS(so)->send(so, mbufs + cnt, nb_mbufs - cnt, peer);
+
+ cnt += rc;
+
+ if (cnt > 0)
+ be_tx_with_lock(CTX(so));
+
+ if (cnt > 0 && blocking)
+ be_process(get_ctx());
+
+ /* blocking: retry while progress is possible and no error is pending */
+ if (blocking &&
+ cnt < nb_mbufs &&
+ (rc > 0 || errno == EAGAIN) &&
+ tle_event_state(&so->erev) != TLE_SEV_UP) {
+ be_process(get_ctx());
+ goto do_send;
+ }
+
+ /* release what was not accepted and account for its bytes */
+ for (i = cnt; i < nb_mbufs; ++i) {
+ left += mbufs[i]->pkt_len;
+ rte_pktmbuf_free_seg(mbufs[i]);
+ }
+
+ if (cnt == 0)
+ return -1;
+ else
+ return len - left;
+}
+
+/*
+ * send(2) replacement. Kernel descriptors are forwarded to the kernel with
+ * the caller's flags intact: sendto() with a NULL destination is equivalent
+ * to send(), whereas a plain write would silently drop MSG_NOSIGNAL,
+ * MSG_DONTWAIT, ... User-space sockets go through _send().
+ */
+ssize_t PRE(send)(int sockfd, const void *buf, size_t len, int flags)
+{
+	if (is_kernel_fd(sockfd))
+		return k_sendto(sockfd, buf, len, flags, NULL, 0);
+
+	/* MSG_NOSIGNAL means "Do not generate SIGPIPE". Ignore this flag */
+	flags &= ~MSG_NOSIGNAL;
+
+	return _send(sockfd, buf, len, NULL, flags);
+}
+
+/*
+ * sendto(2) replacement: kernel descriptors go to k_sendto() unchanged,
+ * user-space sockets go through _send() with dest_addr as the peer.
+ * addrlen is only used on the kernel path.
+ */
+ssize_t PRE(sendto)(int sockfd, const void *buf, size_t len, int flags,
+	const struct sockaddr *dest_addr, socklen_t addrlen)
+{
+	if (!is_kernel_fd(sockfd)) {
+		/* MSG_NOSIGNAL means "Do not generate SIGPIPE". Ignore this flag */
+		return _send(sockfd, buf, len, dest_addr,
+			flags & ~MSG_NOSIGNAL);
+	}
+
+	return k_sendto(sockfd, buf, len, flags, dest_addr, addrlen);
+}
+
+/*
+ * sendmsg(2) replacement.
+ * Kernel descriptors are forwarded to k_sendmsg(). For user-space sockets
+ * the protocol's writev op is tried first; when it is absent, or fails with
+ * EAGAIN on a blocking socket, the first iovec is sent through _send().
+ *
+ * NOTE(review): the fallback sends msg_iov[0] only, and ancillary data in
+ * msg is not looked at on the user-space path.
+ * NOTE(review): unlike writev() below, the lcore id is not initialised
+ * before OPS(so)->writev is called - confirm whether that matters.
+ */
+ssize_t PRE(sendmsg)(int sockfd, const struct msghdr *msg, int flags)
+{
+ ssize_t ret;
+ struct sock *so;
+
+ if (is_kernel_fd(sockfd))
+ return k_sendmsg(sockfd, msg, flags);
+
+ /* MSG_NOSIGNAL means "Do not generate SIGPIPE". Ignore this flag */
+ flags &= ~MSG_NOSIGNAL;
+
+ so = fd2sock(sockfd);
+ if (OPS(so)->writev) {
+ ret = OPS(so)->writev(so, msg->msg_iov, msg->msg_iovlen,
+ msg->msg_name);
+ if (ret < 0) {
+ if (errno != EAGAIN || is_nonblock(so, flags))
+ return -1;
+ } else {
+ /* TODO: blocking && ret < total length */
+ be_tx_with_lock(CTX(so));
+ return ret;
+ }
+
+ /* fall through to blocking send */
+ }
+
+ return _send(sockfd, msg->msg_iov[0].iov_base, msg->msg_iov[0].iov_len,
+ (struct sockaddr *)msg->msg_name, flags);
+}
+
+/*
+ * write(2) replacement: kernel descriptors go to k_write(), user-space
+ * sockets are written through _send() with no flags and no peer address.
+ */
+ssize_t PRE(write)(int fd, const void *buf, size_t count)
+{
+	if (!is_kernel_fd(fd))
+		return _send(fd, buf, count, NULL, 0);
+
+	return k_write(fd, buf, count);
+}
+
+/*
+ * writev(2) replacement.
+ * Kernel descriptors are forwarded to k_writev(). For user-space sockets
+ * the protocol's writev op is tried first; when it is absent, or fails with
+ * EAGAIN on a blocking socket, the first iovec is sent through _send().
+ *
+ * NOTE(review): the fallback sends iov[0] only and does not check
+ * iovcnt > 0.
+ */
+ssize_t PRE(writev)(int fd, const struct iovec *iov, int iovcnt)
+{
+ ssize_t ret;
+ struct sock *so;
+
+ if (is_kernel_fd(fd))
+ return k_writev(fd, iov, iovcnt);
+
+ /* first call from a thread without an lcore id: assign one from thr_cnt */
+ if (RTE_PER_LCORE(_lcore_id) == LCORE_ID_ANY) {
+ RTE_PER_LCORE(_lcore_id) = rte_atomic32_add_return(&thr_cnt, 1);
+ }
+
+ so = fd2sock(fd);
+ if (OPS(so)->writev) {
+ ret = OPS(so)->writev(so, iov, iovcnt, NULL);
+ if (ret < 0) {
+ if (errno != EAGAIN || is_nonblock(so, 0))
+ return -1;
+ } else {
+ /* TODO: blocking && ret < total length */
+ be_tx_with_lock(CTX(so));
+ return ret;
+ }
+
+ /* fall through to blocking send */
+ }
+
+ return _send(fd, iov[0].iov_base, iov[0].iov_len, NULL, 0);
+}
+
+/* advanced functions */
+
+/*
+ * splice(2) replacement: only supported when both descriptors are kernel
+ * descriptors, in which case the call is forwarded to k_splice().
+ * Any other combination calls rte_panic(); the EOPNOTSUPP return that
+ * follows is only reached if rte_panic() returns.
+ */
+ssize_t PRE(splice)(int fd_in, loff_t *off_in, int fd_out,
+	loff_t *off_out, size_t len, unsigned int flags)
+{
+	if (!is_kernel_fd(fd_in) || !is_kernel_fd(fd_out)) {
+		rte_panic("splice is not supported yet");
+		errno = EOPNOTSUPP;
+		return -1;
+	}
+
+	return k_splice(fd_in, off_in, fd_out, off_out, len, flags);
+}
+
+/*
+ * sendfile(2) replacement: only supported when both descriptors are kernel
+ * descriptors, in which case the call is forwarded to k_sendfile().
+ * Any other combination calls rte_panic(); the EOPNOTSUPP return that
+ * follows is only reached if rte_panic() returns.
+ */
+ssize_t PRE(sendfile)(int out_fd, int in_fd, off_t *offset, size_t count)
+{
+	if (!is_kernel_fd(out_fd) || !is_kernel_fd(in_fd)) {
+		rte_panic("sendfile is not supported yet");
+		errno = EOPNOTSUPP;
+		return -1;
+	}
+
+	return k_sendfile(out_fd, in_fd, offset, count);
+}
diff --git a/lib/libtle_glue/select.c b/lib/libtle_glue/select.c
new file mode 100644
index 0000000..b3b8539
--- /dev/null
+++ b/lib/libtle_glue/select.c
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <signal.h>
+#include <sys/select.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "fd.h"
+#include "ctx.h"
+#include "sym.h"
+#include "log.h"
+#include "util.h"
+#include "internal.h"
+#include "tle_glue.h"
+
+ /*
+  * Clear the first n descriptor bits of fd_set *s. An fd_set holds one bit
+  * per descriptor, so n descriptors occupy n / 8 bytes (the previous
+  * n / sizeof(long) was only correct where sizeof(long) == 8). When n is
+  * not a multiple of 8 the trailing n % 8 bits are left untouched.
+  */
+ #define FD_ZERO_N(s, n) do { memset((s)->fds_bits, 0, (size_t)(n) / 8); } while (0)
+
+ /*
+  * Scan the glue-owned descriptors (fd_table.fd_base .. nfds-1) that are
+  * set in fdset and test the tle event that matches `event` (EPOLLIN,
+  * EPOLLOUT or EPOLLERR).
+  *
+  * Ready descriptors stay set and bump *total; descriptors that are not
+  * ready are cleared from fdset. Returns the number of descriptors that
+  * were cleared, i.e. those still to be waited for.
+  */
+ static int
+ fdset_to_events_user(int nfds, fd_set *fdset, int *total, int event)
+ {
+ 	int fd, pending = 0;
+ 	struct sock *so;
+ 	const struct tle_event *ev;
+
+ 	for (fd = fd_table.fd_base; fd < nfds; fd++) {
+ 		if (FD_ISSET(fd, fdset)) {
+ 			so = fd2sock(fd); /* fix me: check if fd is opened */
+
+ 			if (event == EPOLLIN)
+ 				ev = &so->rxev;
+ 			else if (event == EPOLLOUT)
+ 				ev = &so->txev;
+ 			else if (event == EPOLLERR)
+ 				ev = &so->erev;
+ 			else
+ 				rte_panic("non-sense value\n");
+
+ 			/* keep the bit only when the event is already up */
+ 			if (tle_event_state(ev) != TLE_SEV_UP) {
+ 				FD_CLR(fd, fdset);
+ 				pending++;
+ 			} else {
+ 				*total += 1;
+ 			}
+
+ 			/* sock->event is normally filled by epoll_ctl() and is
+ 			 * needed when poll_common() filters events, so it is
+ 			 * filled here as well. This assumes an application
+ 			 * never mixes epoll/poll/select on the same socket.
+ 			 */
+ 			so->event.events |= event;
+ 			so->event.data.u32 = fd;
+ 		}
+ 	}
+
+ 	return pending;
+ }
+
+ /*
+  * Register every descriptor in [0, nfds) that is set in fdset with the
+  * epoll instance efd for `event`, tagging it with its own fd number.
+  * The result of k_epoll_ctl() is not inspected. Returns the number of
+  * descriptors that were set in fdset.
+  */
+ static int
+ fdset_to_events_kernel(int nfds, fd_set *fdset, int efd, int event)
+ {
+ 	int fd, added = 0;
+ 	struct epoll_event k_ev;
+
+ 	for (fd = 0; fd < nfds; fd++) {
+ 		if (FD_ISSET(fd, fdset)) {
+ 			k_ev.events = event;
+ 			k_ev.data.u32 = fd;
+ 			k_epoll_ctl(efd, EPOLL_CTL_ADD, fd, &k_ev);
+ 			added++;
+ 		}
+ 	}
+
+ 	return added;
+ }
+
+ /*
+  * select() entry point covering both glue sockets and kernel descriptors.
+  *
+  * 1. Glue sockets whose events are already up are reported immediately;
+  *    the kernel part of each set is cleared in that case.
+  * 2. If no glue socket is being waited for, the call is handed to
+  *    k_select() unchanged.
+  * 3. Otherwise the kernel descriptors are registered with a temporary
+  *    epoll instance and poll_common() waits for either side.
+  *
+  * Returns the value produced by the path taken (count of ready events,
+  * or whatever k_select()/poll_common() return).
+  */
+ int
+ PRE(select)(int nfds, fd_set *readfds, fd_set *writefds,
+ 	    fd_set *exceptfds, struct timeval *timeout)
+ {
+ 	int to;
+ 	struct glue_ctx *ctx;
+ 	int j, efd, total = 0, max = 0;
+
+ 	/* thread <> context binding happens here */
+ 	if (RTE_PER_LCORE(glue_ctx) == NULL) {
+ 		ctx = &ctx_array[glue_ctx_alloc()];
+ 		RTE_PER_LCORE(glue_ctx) = ctx;
+ 	} else
+ 		ctx = RTE_PER_LCORE(glue_ctx);
+
+ 	/* step 0, process some packets */
+ 	be_process(ctx);
+
+ 	/* step 1, check if any userspace events are ready */
+
+ 	if (readfds)
+ 		max += fdset_to_events_user(nfds, readfds,
+ 					    &total, EPOLLIN);
+ 	if (writefds)
+ 		max += fdset_to_events_user(nfds, writefds,
+ 					    &total, EPOLLOUT);
+ 	/* scan exceptfds itself (writefds was passed here by mistake) */
+ 	if (exceptfds)
+ 		max += fdset_to_events_user(nfds, exceptfds,
+ 					    &total, EPOLLERR);
+ 	if (total > 0) {
+ 		/* userspace events go firstly */
+ 		if (readfds)
+ 			FD_ZERO_N(readfds, fd_table.fd_base);
+ 		if (writefds)
+ 			FD_ZERO_N(writefds, fd_table.fd_base);
+ 		if (exceptfds)
+ 			FD_ZERO_N(exceptfds, fd_table.fd_base);
+
+ 		return total;
+ 	}
+
+ 	/* step 2, only wait for kernel events? */
+ 	if (max == 0)
+ 		return k_select(nfds, readfds, writefds, exceptfds, timeout);
+
+ 	/* step 3, slow path: wait for I/O and kernel events */
+ 	efd = k_epoll_create(1);
+ 	if (efd < 0)
+ 		rte_panic("k_epoll_create failed %d", errno);
+
+ 	nfds = RTE_MIN(nfds, fd_table.fd_base);
+ 	if (readfds)
+ 		max += fdset_to_events_kernel(nfds, readfds,
+ 					      efd, EPOLLIN);
+ 	if (writefds)
+ 		max += fdset_to_events_kernel(nfds, writefds,
+ 					      efd, EPOLLOUT);
+ 	if (exceptfds)
+ 		max += fdset_to_events_kernel(nfds, exceptfds,
+ 					      efd, EPOLLERR);
+
+ 	struct epoll_event events[max];
+
+ 	if (timeout)
+ 		to = timeout->tv_sec * 1000 + timeout->tv_usec / 1000;
+ 	else
+ 		to = -1;
+ 	total = poll_common(ctx, events, max, to, efd);
+
+ 	k_close(efd);
+
+ 	/* The glue-socket bits were all cleared in step 1 (nothing was
+ 	 * ready), but the kernel descriptors still carry the caller's
+ 	 * request bits. Drop them so that only descriptors reported below
+ 	 * remain set on return.
+ 	 */
+ 	if (readfds)
+ 		FD_ZERO_N(readfds, fd_table.fd_base);
+ 	if (writefds)
+ 		FD_ZERO_N(writefds, fd_table.fd_base);
+ 	if (exceptfds)
+ 		FD_ZERO_N(exceptfds, fd_table.fd_base);
+
+ 	for (j = 0; j < total; ++j) {
+ 		/* any of the three sets may be NULL */
+ 		if ((events[j].events & EPOLLIN) && readfds)
+ 			FD_SET(events[j].data.fd, readfds);
+
+ 		if ((events[j].events & EPOLLOUT) && writefds)
+ 			FD_SET(events[j].data.fd, writefds);
+
+ 		if ((events[j].events & (EPOLLHUP | EPOLLERR)) && exceptfds)
+ 			FD_SET(events[j].data.fd, exceptfds);
+ 	}
+ 	return total;
+ }
+
+ /*
+  * pselect() entry point, implemented on top of this library's select().
+  * The timespec timeout is converted to a timeval (nanoseconds truncated
+  * to microseconds). A non-NULL sigmask is not supported and aborts via
+  * rte_panic().
+  */
+ int
+ PRE(pselect)(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds,
+ 	     const struct timespec *timeout, const sigset_t *sigmask)
+ {
+ 	struct timeval tv, *tv_to;
+
+ 	if (sigmask != NULL)
+ 		rte_panic("pselect with signal is not supported");
+
+ 	if (timeout) {
+ 		tv.tv_usec = timeout->tv_nsec / 1000;
+ 		tv.tv_sec = timeout->tv_sec;
+ 		tv_to = &tv;
+ 	} else
+ 		tv_to = NULL;
+
+ 	/* go through the glue implementation defined above, whatever
+ 	 * name PRE() expands to, rather than a bare select()
+ 	 */
+ 	return PRE(select)(nfds, readfds, writefds, exceptfds, tv_to);
+ }
diff --git a/lib/libtle_glue/sock.h b/lib/libtle_glue/sock.h
new file mode 100644
index 0000000..fcd6362
--- /dev/null
+++ b/lib/libtle_glue/sock.h
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef _SOCK_H_
+#define _SOCK_H_
+
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <tle_event.h>
+#include <tle_ctx.h>
+
+#include "ctx.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern unsigned int def_sndbuf;
+extern unsigned int def_rcvbuf;
+
+#ifndef TCP_FASTOPEN
+#define TCP_FASTOPEN 23
+#endif
+
+#ifndef TCP_USER_TIMEOUT
+#define TCP_USER_TIMEOUT 18
+#endif
+
+#ifndef TCP_FASTOPEN_CONNECT
+#define TCP_FASTOPEN_CONNECT 30
+#endif
+
+struct sock;
+
+ struct proto { /* per-protocol operation table; looked up through OPS(so) */
+ int (*setsockopt)(struct sock *sk, int optname, const void *optval,
+ socklen_t optlen); /* reached from setsockopt() for the IPPROTO_TCP/IPPROTO_UDP levels */
+ int (*getsockopt)(struct sock *sk, int optname, void *optval,
+ socklen_t *option); /* reached from getsockopt() for the SOL_TCP/SOL_UDP levels */
+ int (*getname)(struct sock *sk, struct sockaddr *addr, int peer); /* peer != 0: remote address, else local */
+
+ int (*bind)(struct sock *sk, const struct sockaddr *addr);
+ int (*listen)(struct sock *sk, int backlog);
+ int (*connect)(struct sock *sk, const struct sockaddr *addr);
+ int (*accept)(struct sock *sk, struct sockaddr *addr,
+ socklen_t *addrlen, int flags);
+
+ ssize_t (*recv)(struct tle_stream *s, struct rte_mbuf *pkt[],
+ uint16_t num, struct sockaddr *addr);
+ ssize_t (*send)(struct sock *sk, struct rte_mbuf *pkt[],
+ uint16_t num, const struct sockaddr *dst_addr);
+
+ ssize_t (*readv)(struct tle_stream *s, struct msghdr *msg, int flags);
+ ssize_t (*writev)(struct sock *sk, const struct iovec *iov,
+ int iovcnt, const struct sockaddr *dst_addr); /* may be NULL; callers then fall back to _send() */
+
+ int (*shutdown)(struct sock *sk, int how);
+ int (*close)(struct tle_stream *s); /* takes the stream, not the sock */
+
+ void (*update_cfg)(struct sock *sk);
+
+ char name[32];
+ };
+
+ enum { /* values stored in sock.proto; also the index into supported_proto_ops[] */
+ PROTO_TCP,
+ PROTO_UDP
+ };
+
+#define RECV_SHUTDOWN 1
+#define SEND_SHUTDOWN 2
+
+extern struct proto udp_prot;
+extern struct proto tcp_prot;
+extern struct proto *supported_proto_ops[];
+
+ struct sock { /* per-descriptor state of a glue (userspace) socket */
+ int fd; /* first member on purpose: close() wipes everything after it */
+ uint32_t cid:8, /* ctx id for indexing ctx_array */
+ domain:8, /* for AF_INET, AF_INET6 */
+ proto:8, /* PROTO_TCP, PROTO_UDP */
+ valid:1, /* close() fails with EBADF when this is 0 */
+ epoll:1, /* when set, close() closes shadow_efd and frees the ctx */
+ ubind:1,
+ ubindany:1,
+ nonblock:1, /* set by fcntl(F_SETFL), ioctl(FIONBIO) and accept4(SOCK_NONBLOCK) */
+ tcp_connected:1,
+ shutdown:2; /* RECV_SHUTDOWN | SEND_SHUTDOWN bits */
+ struct tle_stream *s; /* NULL until a stream exists; bind() rejects a non-NULL value */
+ struct rte_mbuf *rx_left; /* mbuf kept from an earlier receive; counted by FIONREAD, freed by close() */
+ tle_stream_options_t option; /* socket-level copy; setsockopt() mirrors changes into s->option when s is set */
+ union {
+ struct epoll_event event; /* filled by the select path with the events of interest and the fd */
+ int shadow_efd; /* used instead of event when the epoll bit is set */
+ };
+ struct tle_event txev; /* tested for EPOLLOUT */
+ struct tle_event rxev; /* tested for EPOLLIN */
+ struct tle_event erev; /* tested for EPOLLERR and by SO_ERROR */
+ } __rte_cache_aligned;
+
+ /*
+  * Accessors for a struct sock pointer. The argument is parenthesized so
+  * that any pointer-valued expression can be passed safely.
+  */
+ #define CTX(so) (&ctx_array[(so)->cid])
+ #define OPS(so) (supported_proto_ops[(so)->proto])
+ #define IS_TCP(so) ((so)->proto == PROTO_TCP)
+ #define IS_UDP(so) ((so)->proto == PROTO_UDP)
+
+ /*
+  * Returns 1 when the operation must not block: either the per-call flags
+  * carry MSG_DONTWAIT or the socket itself is marked non-blocking;
+  * returns 0 otherwise.
+  */
+ static inline int
+ is_nonblock(struct sock *so, int flags)
+ {
+ 	if (flags & MSG_DONTWAIT)
+ 		return 1;
+
+ 	return so->nonblock ? 1 : 0;
+ }
+
+ /*
+  * Returns the tle context of the socket's glue context that matches its
+  * protocol: tcp_ctx for TCP sockets, udp_ctx for everything else.
+  */
+ static inline struct tle_ctx *
+ get_sock_ctx(struct sock *so)
+ {
+ 	return IS_TCP(so) ? CTX(so)->tcp_ctx : CTX(so)->udp_ctx;
+ }
+
+ /*
+  * Size in bytes of the sockaddr structure used by an address family:
+  * sockaddr_in for AF_INET, sockaddr_in6 for AF_INET6, just the family
+  * field for AF_UNSPEC, and 0 for any other family.
+  */
+ static inline size_t
+ get_sockaddr_len(sa_family_t family)
+ {
+ 	if (family == AF_INET)
+ 		return sizeof(struct sockaddr_in);
+ 	if (family == AF_INET6)
+ 		return sizeof(struct sockaddr_in6);
+ 	if (family == AF_UNSPEC)
+ 		return sizeof(sa_family_t);
+
+ 	return 0;
+ }
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*_SOCK_H_ */
diff --git a/lib/libtle_glue/socket.c b/lib/libtle_glue/socket.c
new file mode 100644
index 0000000..31b28be
--- /dev/null
+++ b/lib/libtle_glue/socket.c
@@ -0,0 +1,720 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "sym.h"
+
+#include <stdarg.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+
+#include "tle_glue.h"
+#include "fd.h"
+#include "log.h"
+#include "util.h"
+#include "internal.h"
+#include "sock.h"
+
+ struct proto *supported_proto_ops[] = { /* indexed by sock.proto; read through the OPS() macro */
+ [PROTO_TCP] = &tcp_prot,
+ [PROTO_UDP] = &udp_prot,
+ };
+
+/* for setup, settings, and destroy */
+ /*
+  * socket() entry point.
+  *
+  * Only AF_INET/AF_INET6 sockets whose type is exactly SOCK_STREAM or
+  * SOCK_DGRAM are handled by the glue layer; every other request
+  * (including a type carrying extra flag bits) is passed to k_socket().
+  *
+  * Returns the new descriptor, or -1 with errno set to EAFNOSUPPORT when
+  * the default context has no address/loopback for the family, or to
+  * ENFILE when no descriptor is free.
+  */
+ int PRE(socket)(int domain, int type, int protocol)
+ {
+ 	int fd;
+ 	struct sock *so;
+
+ 	if ((domain != AF_INET && domain != AF_INET6) ||
+ 	    (type != SOCK_STREAM && type != SOCK_DGRAM))
+ 		return k_socket(domain, type, protocol);
+
+ 	if (domain == AF_INET) {
+ 		if (default_ctx->ipv4 == 0 && !default_ctx->lo4_enabled) {
+ 			errno = EAFNOSUPPORT;
+ 			return -1;
+ 		}
+ 	} else {
+ 		if (IN6_IS_ADDR_UNSPECIFIED(&default_ctx->ipv6) &&
+ 		    !default_ctx->lo6_enabled) {
+ 			errno = EAFNOSUPPORT;
+ 			return -1;
+ 		}
+ 	}
+
+ 	fd = get_unused_fd();
+ 	if (fd < 0) {
+ 		errno = ENFILE;
+ 		return -1;
+ 	}
+ 	so = fd2sock(fd);
+ 	so->cid = get_cid();
+ 	if (type == SOCK_STREAM)
+ 		so->proto = PROTO_TCP;
+ 	else /* type == SOCK_DGRAM */
+ 		so->proto = PROTO_UDP;
+
+ 	so->domain = domain;
+ 	so->option.raw = 0;
+ 	so->option.mulloop = 1;
+ 	so->option.multtl = 1;
+ 	if (type == SOCK_STREAM) {
+ 		so->option.tcpquickack = 1;
+ 		/* linux default value: 2 hours */
+ 		so->option.keepidle = 2 * 60 * 60;
+ 		/* linux default value: 75seconds */
+ 		so->option.keepintvl = 75;
+ 		/* linux default value: 9 */
+ 		so->option.keepcnt = 9;
+ 	}
+
+ 	sock_alloc_events(so);
+
+ 	/* Log only through the glue logger: an unconditional printf()
+ 	 * here would write into the application's own stdout.
+ 	 */
+ 	GLUE_DEBUG("socket fd = %d", fd);
+ 	return fd;
+ }
+
+ /*
+  * bind() entry point. Kernel descriptors go to k_bind(). A glue socket
+  * that already owns a stream, or an address shorter than its family
+  * requires, fails with EINVAL; a protocol without a bind handler fails
+  * with EOPNOTSUPP.
+  */
+ int PRE(bind)(int sockfd, const struct sockaddr *addr, socklen_t addrlen)
+ {
+ 	struct sock *so;
+
+ 	if (is_kernel_fd(sockfd))
+ 		return k_bind(sockfd, addr, addrlen);
+
+ 	so = fd2sock(sockfd);
+
+ 	/* a stream already exists: the socket is bound to an address */
+ 	if (so->s != NULL) {
+ 		errno = EINVAL;
+ 		return -1;
+ 	}
+
+ 	if (get_sockaddr_len(addr->sa_family) > addrlen) {
+ 		errno = EINVAL;
+ 		return -1;
+ 	}
+
+ 	/* no stream yet, so the context may still be re-selected */
+ 	so->cid = get_cid();
+ 	if (OPS(so)->bind == NULL) {
+ 		errno = EOPNOTSUPP;
+ 		return -1;
+ 	}
+
+ 	return OPS(so)->bind(so, addr);
+ }
+
+ /*
+  * listen() entry point: kernel descriptors go to k_listen(), glue
+  * sockets to the protocol's listen handler (EOPNOTSUPP when absent).
+  */
+ int PRE(listen)(int sockfd, int backlog)
+ {
+ 	struct sock *so;
+
+ 	if (is_kernel_fd(sockfd))
+ 		return k_listen(sockfd, backlog);
+
+ 	so = fd2sock(sockfd);
+ 	if (OPS(so)->listen == NULL) {
+ 		errno = EOPNOTSUPP;
+ 		return -1;
+ 	}
+
+ 	return OPS(so)->listen(so, backlog);
+ }
+
+ /*
+  * accept() entry point: kernel descriptors go to k_accept(), glue
+  * sockets to the protocol's accept handler with flags 0 (EOPNOTSUPP
+  * when the protocol has no such handler).
+  */
+ int PRE(accept)(int sockfd, struct sockaddr *addr, socklen_t *addrlen)
+ {
+ 	struct sock *so;
+
+ 	if (is_kernel_fd(sockfd))
+ 		return k_accept(sockfd, addr, addrlen);
+
+ 	so = fd2sock(sockfd);
+ 	if (OPS(so)->accept == NULL) {
+ 		errno = EOPNOTSUPP;
+ 		return -1;
+ 	}
+
+ 	return OPS(so)->accept(so, addr, addrlen, 0);
+ }
+
+ /*
+  * accept4() entry point. Glue sockets are accepted through this
+  * library's accept(); of the flags only SOCK_NONBLOCK is honoured, by
+  * marking the new socket non-blocking.
+  */
+ int PRE(accept4)(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags)
+ {
+ 	int newfd;
+
+ 	if (is_kernel_fd(sockfd))
+ 		return k_accept4(sockfd, addr, addrlen, flags);
+
+ 	newfd = PRE(accept)(sockfd, addr, addrlen);
+ 	if (newfd < 0)
+ 		return newfd;
+
+ 	/* inherit NONBLOCK flag */
+ 	if (flags & SOCK_NONBLOCK)
+ 		fd2sock(newfd)->nonblock = 1;
+
+ 	return newfd;
+ }
+
+ /*
+  * connect() entry point. Kernel descriptors go to k_connect(). For glue
+  * sockets the address length is validated (EINVAL), the context is
+  * re-selected, mac_check() is run for blocking sockets, and the
+  * protocol's connect handler is invoked (EOPNOTSUPP when absent).
+  */
+ int PRE(connect)(int sockfd, const struct sockaddr *addr, socklen_t addrlen)
+ {
+ 	struct sock *so;
+
+ 	if (is_kernel_fd(sockfd))
+ 		return k_connect(sockfd, addr, addrlen);
+
+ 	if (get_sockaddr_len(addr->sa_family) > addrlen) {
+ 		errno = EINVAL;
+ 		return -1;
+ 	}
+
+ 	so = fd2sock(sockfd);
+ 	so->cid = get_cid();
+
+ 	/* only blocking sockets run mac_check() up front */
+ 	if (is_nonblock(so, 0) == 0)
+ 		mac_check(CTX(so), addr);
+
+ 	if (OPS(so)->connect == NULL) {
+ 		errno = EOPNOTSUPP;
+ 		return -1;
+ 	}
+
+ 	return OPS(so)->connect(so, addr);
+ }
+
+unsigned int def_sndbuf = 212992;
+unsigned int def_rcvbuf = 212992;
+static struct linger ling;
+
+ /*
+  * getsockopt() entry point.
+  *
+  * Kernel descriptors go to k_getsockopt(). For glue sockets the
+  * IPPROTO_IP, IPPROTO_IPV6 and SOL_SOCKET options listed below are
+  * answered from the socket's cached option block; SOL_TCP/SOL_UDP are
+  * delegated to the protocol handler.
+  *
+  * Returns 0 on success, -1 with errno = EFAULT when optval or optlen is
+  * NULL, and -1 with errno = EOPNOTSUPP for an unknown level/option.
+  */
+ int PRE(getsockopt)(int sockfd, int level, int optname,
+ 		    void *optval, socklen_t *optlen)
+ {
+ 	struct sock *so;
+ 	union {
+ 		int val;
+ 		uint64_t val64;
+ 		struct linger ling;
+ 		struct timeval tm;
+ 	} *p = optval;
+
+ 	if (is_kernel_fd(sockfd))
+ 		return k_getsockopt(sockfd, level, optname, optval, optlen);
+
+ 	/* both pointers are dereferenced below, so either being NULL is
+ 	 * an error (the old test only rejected the case of both NULL)
+ 	 */
+ 	if (optval == NULL || optlen == NULL) {
+ 		errno = EFAULT;
+ 		return -1;
+ 	}
+
+ 	so = fd2sock(sockfd);
+
+ 	switch (level) {
+ 	case IPPROTO_IP:
+ 		switch (optname) {
+ 		case IP_OPTIONS:
+ 			*optlen = 0;
+ 			return 0;
+ 		case IP_MULTICAST_LOOP:
+ 			p->val = so->option.mulloop;
+ 			return 0;
+ 		case IP_MULTICAST_TTL:
+ 			p->val = so->option.multtl;
+ 			return 0;
+ 		}
+ 		break;
+ 	case IPPROTO_IPV6:
+ 		switch (optname) {
+ 		case IPV6_V6ONLY:
+ 			p->val = so->option.ipv6only;
+ 			return 0;
+ 		}
+ 		break;
+ 	case SOL_SOCKET:
+ 		/* man socket(7), see /usr/include/asm-generic/socket.h */
+ 		switch (optname) {
+ 		case SO_REUSEADDR:
+ 			p->val = so->option.reuseaddr;
+ 			return 0;
+ 		case SO_REUSEPORT:
+ 			p->val = so->option.reuseport;
+ 			return 0;
+ 		case SO_ERROR:
+ 			if (TLE_SEV_DOWN == tle_event_state(&so->erev))
+ 				p->val = 0;
+ 			else
+ 				p->val = ECONNREFUSED;
+ 			/* FIXME: ETIMEDOUT is never reported */
+ 			return 0;
+ 		case SO_LINGER:
+ 			p->ling.l_onoff = 0;
+ 			return 0;
+ 		case SO_SNDBUF:
+ 			p->val = def_sndbuf;
+ 			return 0;
+ 		case SO_RCVBUF:
+ 			p->val = def_rcvbuf;
+ 			return 0;
+ 		case SO_ACCEPTCONN:
+ 			/* a socket without a stream cannot be listening */
+ 			if (IS_TCP(so) && so->s != NULL
+ 			    && TCP_STREAM(so->s)->tcb.state == TCP_ST_LISTEN)
+ 				p->val = 1;
+ 			else
+ 				p->val = 0;
+ 			return 0;
+ 		case SO_KEEPALIVE:
+ 			p->val = so->option.keepalive;
+ 			return 0;
+ 		case SO_TYPE:
+ 			if (IS_TCP(so))
+ 				p->val = SOCK_STREAM;
+ 			else
+ 				p->val = SOCK_DGRAM;
+ 			return 0;
+ 		case SO_OOBINLINE:
+ 			p->val = so->option.oobinline;
+ 			return 0;
+ 		case SO_TIMESTAMP:
+ 			p->val = so->option.timestamp;
+ 			return 0;
+ 		case SO_PROTOCOL:
+ 			if (so->proto == PROTO_TCP)
+ 				p->val = IPPROTO_TCP;
+ 			else
+ 				p->val = IPPROTO_UDP;
+ 			return 0;
+ 		default:
+ 			break;
+ 		}
+
+ 		break;
+ 	case SOL_TCP:
+ 	case SOL_UDP:
+ 		return OPS(so)->getsockopt(so, optname, optval, optlen);
+ 	}
+
+ 	GLUE_LOG(WARNING, "getsockopt(%d) with level = %d, optname = %d",
+ 		 sockfd, level, optname);
+ 	errno = EOPNOTSUPP;
+ 	return -1;
+ }
+
+ /*
+  * setsockopt() entry point.
+  *
+  * Kernel descriptors go to k_setsockopt(). For glue sockets the option
+  * is recorded in the socket's cached option block and, when a stream
+  * already exists, mirrored into the stream. Several options are accepted
+  * and ignored because common tools set them. IPPROTO_TCP/IPPROTO_UDP are
+  * delegated to the protocol handler.
+  *
+  * Returns 0 on success; -1 with errno = EFAULT for a NULL optval,
+  * EINVAL for a bad value/length, EOPNOTSUPP for an unknown option.
+  */
+ int PRE(setsockopt)(int sockfd, int level, int optname,
+ 		    const void *optval, socklen_t optlen)
+ {
+ 	int val;
+ 	struct sock *so;
+
+ 	if (is_kernel_fd(sockfd))
+ 		return k_setsockopt(sockfd, level, optname, optval, optlen);
+
+ 	/* optval is dereferenced below whenever optlen matches a known
+ 	 * size, so a NULL pointer is rejected regardless of optlen
+ 	 */
+ 	if (optval == NULL) {
+ 		errno = EFAULT;
+ 		return -1;
+ 	}
+
+ 	/* integer view of the value; stays 0 for any other length */
+ 	val = 0;
+ 	switch (optlen) {
+ 	case sizeof(char):
+ 		val = *(const char *)optval;
+ 		break;
+ 	case sizeof(int):
+ 		val = *(const int *)optval;
+ 		break;
+ 	}
+
+ 	so = fd2sock(sockfd);
+
+ 	switch (level) {
+ 	case IPPROTO_IP:
+ 		switch (optname) {
+ 		case IP_RECVERR:
+ 			/* needed by netperf */
+ 			return 0;
+ 		case IP_MULTICAST_LOOP:
+ 			so->option.mulloop = !!val;
+ 			if (so->s != NULL)
+ 				so->s->option.mulloop = so->option.mulloop;
+ 			return 0;
+ 		case IP_MULTICAST_TTL:
+ 			if (val > 255 || val < -1) {
+ 				errno = EINVAL;
+ 				return -1;
+ 			}
+ 			/* -1 selects the default TTL of 1 */
+ 			if (val == -1)
+ 				val = 1;
+ 			so->option.multtl = val;
+ 			if (so->s != NULL)
+ 				so->s->option.multtl = so->option.multtl;
+ 			return 0;
+ 		case IP_ADD_MEMBERSHIP: {
+ 			const struct ip_mreq *mreq;
+
+ 			if (optlen < sizeof(struct ip_mreq)) {
+ 				errno = EINVAL;
+ 				return -1;
+ 			}
+ 			mreq = (const struct ip_mreq *)optval;
+ 			if (mreq->imr_multiaddr.s_addr == INADDR_ANY) {
+ 				errno = EINVAL;
+ 				return -1;
+ 			}
+ 			/* arguments are valid, but joining is not implemented */
+ 			errno = EOPNOTSUPP;
+ 			return -1;
+ 		}
+ 		case IP_MTU_DISCOVER:
+ 		case IP_TOS:
+ 		case IP_RECVTOS:
+ 			/* accepted and ignored */
+ 			return 0;
+ 		}
+ 		break;
+ 	case IPPROTO_IPV6:
+ 		switch (optname) {
+ 		case IPV6_V6ONLY:
+ 			so->option.ipv6only = !!val;
+ 			if (so->s != NULL)
+ 				so->s->option.ipv6only = so->option.ipv6only;
+ 			return 0;
+ 		case IPV6_TCLASS:
+ 		case IPV6_RECVTCLASS:
+ 			/* accepted and ignored */
+ 			return 0;
+ 		}
+ 		break;
+ 	case SOL_SOCKET:
+ 		switch (optname) {
+ 		case SO_REUSEADDR:
+ 			so->option.reuseaddr = !!val;
+ 			if (so->s != NULL)
+ 				so->s->option.reuseaddr = so->option.reuseaddr;
+ 			return 0;
+ 		case SO_LINGER:
+ 			/* do not read a struct linger from a shorter buffer */
+ 			if (optlen < sizeof(struct linger)) {
+ 				errno = EINVAL;
+ 				return -1;
+ 			}
+ 			ling = *(const struct linger *)optval;
+ 			if (ling.l_onoff != 0)
+ 				GLUE_LOG(ERR, "app is enabling SO_LINGER which is not really supported");
+ 			return 0;
+ 		case SO_KEEPALIVE:
+ 			so->option.keepalive = !!val;
+ 			if (so->s != NULL) {
+ 				so->s->option.keepalive = so->option.keepalive;
+ 				if (so->proto == PROTO_TCP)
+ 					tle_tcp_stream_set_keepalive(so->s);
+ 			}
+ 			return 0;
+ 		case SO_REUSEPORT:
+ 			so->option.reuseport = !!val;
+ 			if (so->s != NULL)
+ 				so->s->option.reuseport = so->option.reuseport;
+ 			return 0;
+ 		case SO_SNDBUF:
+ 			/* NOTE: this updates the process-wide default */
+ 			def_sndbuf = val;
+ 			return 0;
+ 		case SO_RCVBUF:
+ 			/* NOTE: this updates the process-wide default */
+ 			def_rcvbuf = val;
+ 			return 0;
+ 		case SO_DONTROUTE:
+ 			/* needed by netperf */
+ 			return 0;
+ 		case SO_BROADCAST:
+ 			/* needed by nc */
+ 			/* todo: only supported for DGRAM */
+ 			return 0;
+ 		case SO_TIMESTAMP:
+ 			so->option.timestamp = !!val;
+ 			if (so->s != NULL)
+ 				so->s->option.timestamp = so->option.timestamp;
+ 			return 0;
+ 		case SO_OOBINLINE:
+ 			so->option.oobinline = !!val;
+ 			if (so->s != NULL)
+ 				so->s->option.oobinline = so->option.oobinline;
+ 			return 0;
+ 		default:
+ 			break;
+ 		}
+ 		break;
+ 	case IPPROTO_TCP:
+ 	case IPPROTO_UDP:
+ 		return OPS(so)->setsockopt(so, optname, optval, optlen);
+ 	}
+
+ 	GLUE_LOG(WARNING, "setsockopt(%d) with level = %d, optname = %d\n",
+ 		 sockfd, level, optname);
+ 	errno = EOPNOTSUPP;
+ 	return -1;
+ }
+
+/*
+ * Refer to glibc/sysdeps/unix/sysv/linux/fcntl.c
+ */
+ /*
+  * fcntl() entry point. Kernel descriptors go to k_fcntl() with the
+  * single optional argument. For glue sockets F_SETFL only tracks
+  * O_NONBLOCK, F_GETFL reports O_RDWR plus O_NONBLOCK when set, F_SETFD
+  * is accepted and ignored, and anything else fails with EOPNOTSUPP.
+  */
+ int PRE(fcntl)(int fd, int cmd, ...)
+ {
+ 	void *arg;
+ 	va_list ap;
+ 	struct sock *so;
+
+ 	va_start(ap, cmd);
+ 	arg = va_arg(ap, void *);
+ 	va_end(ap);
+
+ 	if (is_kernel_fd(fd))
+ 		return k_fcntl(fd, cmd, arg);
+
+ 	so = fd2sock(fd);
+
+ 	if (cmd == F_SETFL) {
+ 		so->nonblock = ((unsigned long)arg & O_NONBLOCK) ? 1 : 0;
+ 		return 0;
+ 	}
+
+ 	if (cmd == F_GETFL)
+ 		return so->nonblock ? (O_NONBLOCK | O_RDWR) : O_RDWR;
+
+ 	if (cmd == F_SETFD)
+ 		return 0;
+
+ 	errno = EOPNOTSUPP;
+ 	GLUE_LOG(WARNING, "fcntl(%d) with cmd = %d", fd, cmd);
+ 	return -1;
+ }
+
+/*
+ * Refer to musl/src/misc/ioctl.c
+ */
+ /*
+  * ioctl() entry point. Kernel descriptors go to k_ioctl() with the
+  * single optional argument. For glue sockets:
+  *   FIONREAD   - bytes available (TCP: queued bytes plus any leftover
+  *                mbuf; UDP: length of the next datagram, which is
+  *                pulled from the stream and parked in rx_left)
+  *   FIONBIO    - set/clear non-blocking mode
+  *   SIOCGSTAMP - decode the stream timestamp into a timeval; ENOENT
+  *                when there is no stream or no timestamp
+  * Any other request fails with EOPNOTSUPP.
+  */
+ int PRE(ioctl)(int fd, unsigned long int request, ...)
+ {
+ 	int rc;
+ 	void *arg;
+ 	va_list ap;
+ 	uint32_t left; /* 32 bits: a 16-bit sum truncated above 64 KiB */
+ 	struct sock *so;
+ 	struct rte_mbuf *m;
+
+ 	va_start(ap, request);
+ 	arg = va_arg(ap, void *);
+ 	va_end(ap);
+
+ 	if (is_kernel_fd(fd))
+ 		return k_ioctl(fd, request, arg);
+
+ 	so = fd2sock(fd);
+
+ 	switch (request) {
+ 	case FIONREAD: /* SIOCINQ */
+ 		if (so->s == NULL)
+ 			*(int *)arg = 0;
+ 		else if (IS_TCP(so)) {
+ 			left = tle_tcp_stream_inq(so->s);
+ 			if (so->rx_left)
+ 				left += rte_pktmbuf_pkt_len(so->rx_left);
+ 			*(int *)arg = left;
+ 		} else {
+ 			if (so->rx_left)
+ 				*(int *)arg = rte_pktmbuf_pkt_len(so->rx_left);
+ 			else {
+ 				if (tle_udp_stream_recv(so->s, &m , 1) == 0)
+ 					*(int *)arg = 0;
+ 				else {
+ 					*(int *)arg = rte_pktmbuf_pkt_len(m);
+ 					so->rx_left = m;
+ 				}
+ 			}
+ 		}
+ 		rc = 0;
+ 		break;
+ 	case FIONBIO:
+ 		if (*(int *)arg)
+ 			so->nonblock = 1;
+ 		else
+ 			so->nonblock = 0;
+ 		rc = 0;
+ 		break;
+ 	case SIOCGSTAMP:
+ 		/* no stream means no packet has been timestamped either */
+ 		if (so->s == NULL || so->s->timestamp == 0) {
+ 			errno = ENOENT;
+ 			rc = -1;
+ 		} else {
+ 			((struct timeval*)arg)->tv_sec = so->s->timestamp >> 20;
+ 			((struct timeval*)arg)->tv_usec = so->s->timestamp & 0xFFFFFUL;
+ 			rc = 0;
+ 		}
+ 		break;
+ 	default:
+ 		errno = EOPNOTSUPP;
+ 		rc = -1;
+ 		GLUE_LOG(WARNING, "ioctl(%d) with request = %ld", fd, request);
+ 	}
+
+ 	return rc;
+ }
+
+ /*
+  * shutdown() entry point. Kernel descriptors go to k_shutdown(). For
+  * glue sockets the requested direction(s) are recorded in so->shutdown
+  * and the protocol's shutdown handler is invoked (EOPNOTSUPP when the
+  * protocol has none). Other values of `how` leave the recorded state
+  * untouched but are still passed to the handler.
+  */
+ int PRE(shutdown)(int sockfd, int how)
+ {
+ 	struct sock *so;
+
+ 	if (is_kernel_fd(sockfd))
+ 		return k_shutdown(sockfd, how);
+
+ 	so = fd2sock(sockfd);
+
+ 	if (how == SHUT_RD)
+ 		so->shutdown |= RECV_SHUTDOWN;
+ 	else if (how == SHUT_WR)
+ 		so->shutdown |= SEND_SHUTDOWN;
+ 	else if (how == SHUT_RDWR)
+ 		so->shutdown = RECV_SHUTDOWN | SEND_SHUTDOWN;
+
+ 	if (OPS(so)->shutdown == NULL) {
+ 		errno = EOPNOTSUPP;
+ 		return -1;
+ 	}
+
+ 	return OPS(so)->shutdown(so, how);
+ }
+
+ static inline int /* shared body of getsockname() (peer == 0) and getpeername() (peer != 0) */
+ getname(int sockfd, struct sockaddr *uaddr, socklen_t *addrlen, int peer)
+ {
+ struct sock *so;
+ size_t socklen;
+ int rc;
+
+ so = fd2sock(sockfd);
+
+ /* This is ugly, but netperf asks for the local address (before any
+ * connect or bind) just to check the address family.
+ *
+ * A proper fix would be to bind a local address in advance.
+ */
+ socklen = get_sockaddr_len(so->domain);
+ /* FIXME: rejecting a too-small buffer with EINVAL does not conform to Linux behaviour; fix it later. */
+ if (*addrlen < socklen) {
+ errno = EINVAL;
+ return -1;
+ }
+ *addrlen = socklen;
+
+ if (so->s == NULL) { /* no stream yet */
+ if (peer) {
+ errno = ENOTCONN;
+ return -1;
+ } else {
+ memset(uaddr, 0, socklen); /* all-zero address carrying only the family */
+ uaddr->sa_family = so->domain;
+ return 0;
+ }
+ }
+
+ if (OPS(so)->getname) {
+ rc = OPS(so)->getname(so, uaddr, peer);
+ if (rc < 0)
+ return rc;
+ if (peer) { /* an unspecified peer address is reported as "not connected" */
+ if ((uaddr->sa_family == AF_INET &&
+ ((struct sockaddr_in*)uaddr)->sin_addr.s_addr == 0) ||
+ (uaddr->sa_family == AF_INET6 &&
+ IN6_IS_ADDR_UNSPECIFIED(&((struct sockaddr_in6*)
+ uaddr)->sin6_addr))) {
+ errno = ENOTCONN;
+ return -1;
+ }
+ }
+ if (uaddr->sa_family == AF_INET && so->domain == AF_INET6) /* IPv4 result on an IPv6 socket */
+ trans_4mapped6_addr(uaddr);
+ return rc;
+ }
+
+ errno = EOPNOTSUPP;
+ return -1;
+ }
+
+ /*
+  * getsockname() entry point: kernel descriptors go to k_getsockname(),
+  * glue sockets to getname() asking for the local address.
+  */
+ int PRE(getsockname)(int sockfd, struct sockaddr *addr, socklen_t *addrlen)
+ {
+ 	if (!is_kernel_fd(sockfd))
+ 		return getname(sockfd, addr, addrlen, 0);
+
+ 	return k_getsockname(sockfd, addr, addrlen);
+ }
+
+ /*
+  * getpeername() entry point: kernel descriptors go to k_getpeername(),
+  * glue sockets to getname() asking for the peer address.
+  */
+ int PRE(getpeername)(int sockfd, struct sockaddr *addr, socklen_t *addrlen)
+ {
+ 	if (!is_kernel_fd(sockfd))
+ 		return getname(sockfd, addr, addrlen, 1);
+
+ 	return k_getpeername(sockfd, addr, addrlen);
+ }
+
+ int PRE(close)(int fd) /* close() entry point: kernel fds go to k_close(), glue fds are torn down here */
+ {
+ struct sock *so;
+
+ if (is_kernel_fd(fd))
+ return k_close(fd);
+
+ GLUE_DEBUG("close fd = %d", fd);
+
+ so = fd2sock(fd);
+ if (unlikely(so->valid == 0)) { /* slot is not in use */
+ errno = EBADF;
+ return -1;
+ } else if (unlikely(so->epoll)) { /* epoll descriptor: the union holds shadow_efd */
+ k_close(so->shadow_efd);
+ glue_ctx_free(CTX(so));
+ } else if (so->s) { /* socket with a stream */
+ if (OPS(so)->close)
+ OPS(so)->close(so->s);
+
+ if (IS_TCP(so))
+ be_tx_with_lock(CTX(so)); /* run the TX path once after a TCP close */
+
+ if (so->rx_left)
+ rte_pktmbuf_free(so->rx_left); /* drop data that was received but never read */
+ }
+
+ tle_event_idle_err(&so->erev);
+ tle_event_idle(&so->rxev);
+ tle_event_idle(&so->txev);
+
+ memset(((int*)so) + 1, 0, sizeof(*so) - sizeof(int)); /* wipe everything except the leading fd member */
+ put_free_fd(fd);
+ return 0;
+ }
diff --git a/lib/libtle_glue/sym.c b/lib/libtle_glue/sym.c
new file mode 100644
index 0000000..39b1707
--- /dev/null
+++ b/lib/libtle_glue/sym.c
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifndef __USE_GNU
+#define __USE_GNU
+#endif
+#include <dlfcn.h>
+
+#include <rte_debug.h>
+
+#include "sym.h"
+#include "log.h"
+
+#ifdef PRELOAD
+ /*
+  * Storage for the pointers to the original libc entry points; they are
+  * filled in by symbol_init(). (k_epoll_create1 used to be listed twice.)
+  */
+ int (*k_epoll_create)(int size);
+ int (*k_epoll_create1)(int flags);
+ int (*k_epoll_ctl)(int epfd, int op, int fd, struct epoll_event *event);
+ int (*k_epoll_wait)(int epfd, struct epoll_event *events, int maxevents, int timeout);
+ int (*k_epoll_pwait)(int epfd, struct epoll_event *events, int maxevents, int timeout, const sigset_t *sigmask);
+ int (*k_poll)(struct pollfd *fds, nfds_t nfds, int timeout);
+ int (*k_select)(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout);
+ int (*k_pselect)(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, const struct timespec *timeout, const sigset_t *sigmask);
+ int (*k_socket)(int domain, int type, int protocol);
+ int (*k_listen)(int sockfd, int backlog);
+ int (*k_bind)(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
+ int (*k_accept)(int sockfd, struct sockaddr *addr, socklen_t *addrlen);
+ int (*k_accept4)(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags);
+ int (*k_connect)(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
+ int (*k_getsockopt)(int sockfd, int level, int optname, void *optval, socklen_t *optlen);
+ int (*k_setsockopt)(int sockfd, int level, int optname, const void *optval, socklen_t optlen);
+ int (*k_fcntl)(int fd, int cmd, ... /* arg */ );
+ int (*k_ioctl)(int d, int request, ...);
+ int (*k_shutdown)(int sockfd, int how);
+ int (*k_close)(int fd);
+ ssize_t (*k_recv)(int sockfd, void *buf, size_t len, int flags);
+ ssize_t (*k_recvfrom)(int sockfd, void *buf, size_t len, int flags, struct sockaddr *src_addr, socklen_t *addrlen);
+ ssize_t (*k_recvmsg)(int sockfd, struct msghdr *msg, int flags);
+ ssize_t (*k_read)(int fd, void *buf, size_t count);
+ ssize_t (*k_readv)(int fd, const struct iovec *iov, int iovcnt);
+ ssize_t (*k_send)(int sockfd, const void *buf, size_t len, int flags);
+ ssize_t (*k_sendto)(int sockfd, const void *buf, size_t len, int flags, const struct sockaddr *dest_addr, socklen_t addrlen);
+ ssize_t (*k_sendmsg)(int sockfd, const struct msghdr *msg, int flags);
+ ssize_t (*k_write)(int fd, const void *buf, size_t count);
+ ssize_t (*k_writev)(int fd, const struct iovec *iov, int iovcnt);
+ ssize_t (*k_splice)(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags);
+ ssize_t (*k_sendfile)(int out_fd, int in_fd, off_t *offset, size_t count);
+ int (*k_getsockname)(int sockfd, struct sockaddr *addr, socklen_t *addrlen);
+ int (*k_getpeername)(int sockfd, struct sockaddr *addr, socklen_t *addrlen);
+
+#define INIT_FUNC(func, handle) do { \
+ k_##func = dlsym(handle, #func); \
+ if ((error = dlerror()) != NULL) { \
+ rte_panic(#func "is not init"); \
+ } \
+ RTE_ASSERT(k_##func); \
+} while (0)
+
+#endif
+
+ /*
+  * Resolve the original libc implementations of every intercepted call
+  * into the k_* function pointers. Does nothing unless built with
+  * PRELOAD. Exits the process when libc.so.6 cannot be opened; a symbol
+  * that cannot be resolved panics inside INIT_FUNC().
+  */
+ void
+ symbol_init(void)
+ {
+ #ifdef PRELOAD
+ 	void *handle;
+ 	char *error;
+
+ 	TRACE("in %s", __func__);
+
+ 	handle = dlopen("libc.so.6", RTLD_NOW);
+ 	error = dlerror();
+ 	if (!handle) {
+ 		fprintf(stderr, "%s\n", error);
+ 		exit(EXIT_FAILURE);
+ 	}
+
+ 	/* each symbol is resolved exactly once */
+ 	INIT_FUNC(epoll_create, handle);
+ 	INIT_FUNC(epoll_create1, handle);
+ 	INIT_FUNC(epoll_ctl, handle);
+ 	INIT_FUNC(epoll_wait, handle);
+ 	INIT_FUNC(epoll_pwait, handle);
+ 	INIT_FUNC(socket, handle);
+ 	INIT_FUNC(listen, handle);
+ 	INIT_FUNC(bind, handle);
+ 	INIT_FUNC(accept, handle);
+ 	INIT_FUNC(accept4, handle);
+ 	INIT_FUNC(connect, handle);
+ 	INIT_FUNC(getsockopt, handle);
+ 	INIT_FUNC(setsockopt, handle);
+ 	INIT_FUNC(fcntl, handle);
+ 	INIT_FUNC(ioctl, handle);
+ 	INIT_FUNC(shutdown, handle);
+ 	INIT_FUNC(close, handle);
+ 	INIT_FUNC(recv, handle);
+ 	INIT_FUNC(recvfrom, handle);
+ 	INIT_FUNC(recvmsg, handle);
+ 	INIT_FUNC(read, handle);
+ 	INIT_FUNC(readv, handle);
+ 	INIT_FUNC(send, handle);
+ 	INIT_FUNC(sendto, handle);
+ 	INIT_FUNC(sendmsg, handle);
+ 	INIT_FUNC(write, handle);
+ 	INIT_FUNC(writev, handle);
+ 	INIT_FUNC(splice, handle);
+ 	INIT_FUNC(sendfile, handle);
+ 	INIT_FUNC(poll, handle);
+ 	INIT_FUNC(getsockname, handle);
+ 	INIT_FUNC(getpeername, handle);
+ 	INIT_FUNC(select, handle);
+ 	INIT_FUNC(pselect, handle);
+
+ 	dlclose(handle);
+ #endif
+ }
diff --git a/lib/libtle_glue/sym.h b/lib/libtle_glue/sym.h
new file mode 100644
index 0000000..b5a333d
--- /dev/null
+++ b/lib/libtle_glue/sym.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _TLE_KSYM_H_
+#define _TLE_KSYM_H_
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <sys/socket.h>
+
+#include <sys/epoll.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <poll.h>
+#include <sys/uio.h>
+#include <sys/sendfile.h>
+#include <sys/select.h>
+#include <sys/time.h>
+
+#include "tle_glue.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void symbol_init(void);
+
+#ifdef PRELOAD
+ /*
+  * Pointers to the original libc entry points. They are only declared
+  * here; the single definition of each lives in sym.c. Without `extern`
+  * every file including this header would carry its own definition,
+  * which fails to link when common symbols are disabled (-fno-common).
+  */
+ extern int (*k_epoll_create)(int size);
+ extern int (*k_epoll_create1)(int flags);
+ extern int (*k_epoll_ctl)(int epfd, int op, int fd, struct epoll_event *event);
+ extern int (*k_epoll_wait)(int epfd, struct epoll_event *events, int maxevents, int timeout);
+ extern int (*k_epoll_pwait)(int epfd, struct epoll_event *events, int maxevents, int timeout, const sigset_t *sigmask);
+ extern int (*k_poll)(struct pollfd *fds, nfds_t nfds, int timeout);
+ extern int (*k_select)(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout);
+ extern int (*k_pselect)(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, const struct timespec *timeout, const sigset_t *sigmask);
+
+ extern int (*k_socket)(int domain, int type, int protocol);
+ extern int (*k_listen)(int sockfd, int backlog);
+ extern int (*k_bind)(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
+ extern int (*k_accept)(int sockfd, struct sockaddr *addr, socklen_t *addrlen);
+ extern int (*k_accept4)(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags);
+ extern int (*k_connect)(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
+ extern int (*k_getsockopt)(int sockfd, int level, int optname, void *optval, socklen_t *optlen);
+ extern int (*k_setsockopt)(int sockfd, int level, int optname, const void *optval, socklen_t optlen);
+ extern int (*k_fcntl)(int fd, int cmd, ... /* arg */ );
+ extern int (*k_ioctl)(int d, int request, ...);
+ extern int (*k_shutdown)(int sockfd, int how);
+ extern int (*k_close)(int fd);
+ extern ssize_t (*k_recv)(int sockfd, void *buf, size_t len, int flags);
+ extern ssize_t (*k_recvfrom)(int sockfd, void *buf, size_t len, int flags, struct sockaddr *src_addr, socklen_t *addrlen);
+ extern ssize_t (*k_recvmsg)(int sockfd, struct msghdr *msg, int flags);
+ extern ssize_t (*k_read)(int fd, void *buf, size_t count);
+ extern ssize_t (*k_readv)(int fd, const struct iovec *iov, int iovcnt);
+ extern ssize_t (*k_send)(int sockfd, const void *buf, size_t len, int flags);
+ extern ssize_t (*k_sendto)(int sockfd, const void *buf, size_t len, int flags, const struct sockaddr *dest_addr, socklen_t addrlen);
+ extern ssize_t (*k_sendmsg)(int sockfd, const struct msghdr *msg, int flags);
+ extern ssize_t (*k_write)(int fd, const void *buf, size_t count);
+ extern ssize_t (*k_writev)(int fd, const struct iovec *iov, int iovcnt);
+ extern ssize_t (*k_splice)(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags);
+ extern ssize_t (*k_sendfile)(int out_fd, int in_fd, off_t *offset, size_t count);
+ extern int (*k_getsockname)(int sockfd, struct sockaddr *addr, socklen_t *addrlen);
+ extern int (*k_getpeername)(int sockfd, struct sockaddr *addr, socklen_t *addrlen);
+#else
+#define k_epoll_create epoll_create
+#define k_epoll_create1 epoll_create1
+#define k_epoll_ctl epoll_ctl
+#define k_epoll_wait epoll_wait
+#define k_epoll_pwait epoll_pwait
+#define k_poll poll
+#define k_select select
+#define k_pselect pselect
+#define k_socket socket
+#define k_listen listen
+#define k_bind bind
+#define k_accept accept
+#define k_accept4 accept4
+#define k_connect connect
+#define k_getsockopt getsockopt
+#define k_setsockopt setsockopt
+#define k_fcntl fcntl
+#define k_ioctl ioctl
+#define k_shutdown shutdown
+#define k_close close
+#define k_recv recv
+#define k_recvfrom recvfrom
+#define k_recvmsg recvmsg
+#define k_read read
+#define k_readv readv
+#define k_send send
+#define k_sendto sendto
+#define k_sendmsg sendmsg
+#define k_write write
+#define k_writev writev
+#define k_splice splice
+#define k_sendfile sendfile
+#define k_getsockname getsockname
+#define k_getpeername getpeername
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _TLE_KSYM_H_ */
diff --git a/lib/libtle_glue/tcp.c b/lib/libtle_glue/tcp.c
new file mode 100644
index 0000000..e5186c0
--- /dev/null
+++ b/lib/libtle_glue/tcp.c
@@ -0,0 +1,558 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdarg.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+
+#include <tle_tcp.h>
+
+#include "sym.h"
+#include "fd.h"
+#include "log.h"
+#include "util.h"
+#include "internal.h"
+#include "sock.h"
+
+#define MAX_TCP_KEEPIDLE 32767
+#define MAX_TCP_KEEPINTVL 32767
+#define MAX_TCP_KEEPCNT 127
+
+/*
+ * Emit a warning that the operation described by msg is accepted but not
+ * actually implemented. The callers in this file return success right after
+ * calling it.
+ */
+static inline void
+foo_support(const char *msg)
+{
+ GLUE_LOG(WARNING, "%s, return ok without really supporting it", msg);
+}
+
+/*
+ * Apply a SOL_TCP level option to a socket.
+ *
+ * The value is cached in sk->option and, when the underlying stream already
+ * exists (sk->s != NULL), mirrored into the stream's own option block.
+ *
+ * All options handled here except TCP_USER_TIMEOUT and TCP_CONGESTION carry
+ * an int. For those, at least sizeof(int) bytes must be supplied; a larger
+ * buffer is accepted and only its leading int is read, as the kernel does.
+ *
+ * Returns 0 on success, or -1 with errno set:
+ *   EFAULT     - an int option was given a NULL optval;
+ *   EINVAL     - optlen is shorter than an int, or a keepalive value is out
+ *                of range;
+ *   EOPNOTSUPP - unknown option, or a non-zero value for an option that is
+ *                only accepted when disabled.
+ */
+static int
+tcp_setsockopt(struct sock *sk, int optname,
+	       const void *optval, socklen_t optlen)
+{
+	int val = 0;
+
+	/* Fetch and validate the argument of the int-valued options up front,
+	 * so that a malformed call is rejected instead of being read as 0.
+	 */
+	switch (optname) {
+	case TCP_NODELAY:
+	case TCP_CORK:
+	case TCP_KEEPIDLE:
+	case TCP_KEEPINTVL:
+	case TCP_KEEPCNT:
+	case TCP_DEFER_ACCEPT:
+	case TCP_FASTOPEN:
+	case TCP_FASTOPEN_CONNECT:
+	case TCP_QUICKACK:
+		if (optval == NULL) {
+			errno = EFAULT;
+			return -1;
+		}
+		if (optlen < sizeof(val)) {
+			errno = EINVAL;
+			return -1;
+		}
+		/* memcpy avoids any alignment assumption on optval */
+		memcpy(&val, optval, sizeof(val));
+		break;
+	default:
+		break;
+	}
+
+	/* man tcp(7) or see /usr/include/netinet/tcp.h */
+	switch (optname) {
+	case TCP_NODELAY: /* antonym: TCP_CORK */
+		sk->option.tcpnodelay = (val != 0);
+		if (sk->s != NULL)
+			sk->s->option.tcpnodelay = sk->option.tcpnodelay;
+		return 0;
+	case TCP_CORK:
+		sk->option.tcpcork = (val != 0);
+		if (sk->s != NULL)
+			sk->s->option.tcpcork = sk->option.tcpcork;
+		return 0;
+	case TCP_KEEPIDLE:
+		if (val <= 0 || val > MAX_TCP_KEEPIDLE) {
+			errno = EINVAL;
+			return -1;
+		}
+		sk->option.keepidle = val;
+		if (sk->s != NULL) {
+			sk->s->option.keepidle = sk->option.keepidle;
+			tle_tcp_stream_set_keepalive(sk->s);
+		}
+		return 0;
+	case TCP_KEEPINTVL:
+		if (val <= 0 || val > MAX_TCP_KEEPINTVL) {
+			errno = EINVAL;
+			return -1;
+		}
+		sk->option.keepintvl = val;
+		if (sk->s != NULL) {
+			sk->s->option.keepintvl = sk->option.keepintvl;
+			tle_tcp_stream_set_keepalive(sk->s);
+		}
+		return 0;
+	case TCP_KEEPCNT:
+		if (val <= 0 || val > MAX_TCP_KEEPCNT) {
+			errno = EINVAL;
+			return -1;
+		}
+		sk->option.keepcnt = val;
+		if (sk->s != NULL)
+			sk->s->option.keepcnt = sk->option.keepcnt;
+		return 0;
+	case TCP_USER_TIMEOUT:
+		foo_support("set TCP_USER_TIMEOUT");
+		return 0;
+	case TCP_DEFER_ACCEPT:
+		/* only "disabled" is supported */
+		if (val == 0)
+			return 0;
+		break;
+	case TCP_FASTOPEN:
+	case TCP_FASTOPEN_CONNECT:
+		/* only "disabled" is supported */
+		if (val == 0)
+			return 0;
+		break;
+	case TCP_QUICKACK:
+		/* Based on below info, it's safe to just return 0:
+		 * "This flag is not permanent, it only enables a
+		 * switch to or from quickack mode. Subsequent
+		 * operation of the TCP protocol will once again ..."
+		 */
+		if (val == 0)
+			sk->option.tcpquickack = 0;
+		else
+			sk->option.tcpquickack = 8;
+		if (sk->s != NULL)
+			sk->s->option.tcpquickack = sk->option.tcpquickack;
+		return 0;
+	case TCP_CONGESTION:
+		/* only support NewReno; but we return success for
+		 * any kind of setting.
+		 */
+		foo_support("set TCP_CONGESTION");
+		return 0;
+	default:
+		break;
+	}
+
+	GLUE_LOG(WARNING, "setsockopt(%d) with level = SOL_TCP, optname = %d\n",
+		 sock2fd(sk), optname);
+	errno = EOPNOTSUPP;
+	return -1;
+}
+
+/*
+ * Read a SOL_TCP level option.
+ *
+ * Integer options follow the getsockopt(2) convention: at most *optlen
+ * bytes are written to optval and *optlen is updated to the number of bytes
+ * actually stored, so a short caller buffer is never overrun.
+ * TCP_INFO is delegated to tle_tcp_stream_get_info(). TCP_CONGESTION copies
+ * the fixed name "NewReno", always NUL-terminated within *optlen bytes.
+ *
+ * Returns 0 on success, or -1 with errno set (EFAULT for a NULL optval or
+ * optlen, EOPNOTSUPP for an unknown option, or the negated return code of
+ * tle_tcp_stream_get_info()).
+ */
+static int
+tcp_getsockopt(struct sock *sk, int optname,
+	       void *optval, socklen_t *optlen)
+{
+	socklen_t len;
+	int val;
+	int rc;
+
+	if (optval == NULL || optlen == NULL) {
+		errno = EFAULT;
+		return -1;
+	}
+
+	/* man tcp(7) or see /usr/include/netinet/tcp.h */
+	switch (optname) {
+	case TCP_MAXSEG:
+		val = 64 * 1024;
+		break;
+	case TCP_FASTOPEN:
+	case TCP_FASTOPEN_CONNECT:
+		val = 0;
+		break;
+	case TCP_INFO:
+		/* needed by netperf */
+		/* NOTE(review): sk->s is passed unchecked; confirm the
+		 * library tolerates a NULL stream for unconnected sockets.
+		 */
+		rc = tle_tcp_stream_get_info(sk->s, optval, optlen);
+		if (rc < 0) {
+			errno = -rc;
+			return -1;
+		}
+		return 0;
+	case TCP_CONGESTION:
+		/* A zero-length buffer must not be touched at all:
+		 * index (*optlen - 1) would wrap around.
+		 */
+		if (*optlen > 0) {
+			strncpy(optval, "NewReno", *optlen);
+			((char *)optval)[*optlen - 1] = '\0';
+		}
+		return 0;
+	case TCP_CORK:
+		val = sk->option.tcpcork;
+		break;
+	case TCP_QUICKACK:
+		val = sk->option.tcpquickack != 0 ? 1 : 0;
+		break;
+	case TCP_NODELAY:
+		val = sk->option.tcpnodelay;
+		break;
+	case TCP_KEEPIDLE:
+		val = sk->option.keepidle;
+		break;
+	case TCP_KEEPINTVL:
+		val = sk->option.keepintvl;
+		break;
+	case TCP_KEEPCNT:
+		val = sk->option.keepcnt;
+		break;
+	default:
+		GLUE_LOG(WARNING,
+			 "getsockopt(%d) with level = SOL_TCP, optname = %d",
+			 sock2fd(sk), optname);
+		errno = EOPNOTSUPP;
+		return -1;
+	}
+
+	/* Copy the int out, truncated to what the caller's buffer can hold. */
+	len = *optlen;
+	if (len > sizeof(val))
+		len = sizeof(val);
+	memcpy(optval, &val, len);
+	*optlen = len;
+	return 0;
+}
+
+/*
+ * Fill addr with the local (peer == 0) or remote (peer != 0) address of
+ * the socket's stream.
+ *
+ * The number of bytes copied and the reported sa_family are both derived
+ * from the family of the stream's local address.
+ * Returns 0 on success; on failure returns -1 with errno set to the negated
+ * return code of tle_tcp_stream_get_addr().
+ */
+static int
+tcp_getname(struct sock *sk, struct sockaddr *addr, int peer)
+{
+	struct tle_tcp_stream_addr sa;
+	const void *from;
+	size_t len;
+	int err;
+
+	err = tle_tcp_stream_get_addr(sk->s, &sa);
+	if (err) {
+		errno = -err;
+		return -1;
+	}
+
+	len = (sa.local.ss_family == AF_INET) ?
+		sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
+	from = peer ? (const void *)&sa.remote : (const void *)&sa.local;
+
+	memcpy(addr, from, len);
+	addr->sa_family = sa.local.ss_family;
+
+	return 0;
+}
+
+/*
+ * Bind a TCP socket to addr by opening its underlying stream.
+ *
+ * A socket that already owns a stream is rejected with EINVAL (as
+ * udp_bind() does for an already bound socket); otherwise the existing
+ * stream pointer would be overwritten and lost.
+ *
+ * Returns 0 on success, -1 on failure. On open_bind() failure errno is
+ * whatever open_bind() left behind.
+ */
+static int
+tcp_bind(struct sock *sk, const struct sockaddr *addr)
+{
+	if (sk->s != NULL) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	sk->s = open_bind(sk, addr, NULL);
+	if (sk->s == NULL)
+		return -1;
+	return 0;
+}
+
+/*
+ * Put a TCP socket into listening state.
+ *
+ * A negative backlog is rejected with EINVAL; apart from that check the
+ * backlog value is not used. Returns 0 on success, -1 with errno set on
+ * failure.
+ */
+static int
+tcp_listen(struct sock *sk, int backlog)
+{
+	int32_t err;
+
+	if (backlog < 0) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	/*
+	 * An unbound socket has no stream yet: call open_bind() without an
+	 * explicit address so that one gets assigned before listening.
+	 */
+	if (sk->s == NULL) {
+		sk->s = open_bind(sk, NULL, NULL);
+		if (sk->s == NULL)
+			return -1;
+	}
+
+	err = tle_tcp_stream_listen(sk->s);
+	if (err == 0)
+		return 0;
+
+	errno = -err;
+	return -1;
+}
+
+/*
+ * Connect a TCP socket to addr.
+ *
+ * If the socket already owns a stream, the stream's pending error and TCB
+ * state decide the outcome: report the stored error, report a completed
+ * connect, re-open a closed stream on the same local address, or fail with
+ * EALREADY / EISCONN / EINVAL.
+ * Otherwise a stream is opened via open_bind() and connected.
+ *
+ * Non-blocking sockets return -1 with errno = EINPROGRESS after triggering
+ * transmission. Blocking sockets loop until the TX event (success) or the
+ * error event (ECONNREFUSED) is raised.
+ *
+ * Returns 0 on success, -1 with errno set on failure.
+ */
+static int
+tcp_connect(struct sock *sk, const struct sockaddr *addr)
+{
+ int rc;
+ int rx;
+ int ret;
+ struct epoll_event event;
+ struct sockaddr_storage laddr;
+ struct sockaddr_storage raddr;
+ struct sockaddr_in *addr4;
+ struct sockaddr_in6 *addr6;
+ struct sockaddr *local = NULL;
+
+ /* TODO: For multi-thread case, we shall properly manage local
+ * L4 port so that packets coming back can be put into the same
+ * queue pair.
+ */
+ if (sk->s) {
+ struct tle_tcp_stream *ts = TCP_STREAM(sk->s);
+ /* case 1: bind happens before connect;
+ * case 2: connect after a previous connect, failed
+ * or succeeded.
+ */
+ if (ts->tcb.err != 0) {
+ errno = ts->tcb.err;
+ return -1;
+ }
+
+ int state = ts->tcb.state;
+
+ /* first connect() call after the handshake completed */
+ if (state >= TCP_ST_ESTABLISHED && sk->tcp_connected == 0) {
+ sk->tcp_connected = 1;
+ return 0; /* connect succeeds */
+ }
+
+ if (state == TCP_ST_CLOSED) {
+ /* remember the local address (if available) so the new
+ * stream is opened on it, then drop the old stream
+ */
+ if (tcp_getname(sk, (struct sockaddr *)&laddr, 0) == 0)
+ local = (struct sockaddr *)&laddr;
+ tle_tcp_stream_close(sk->s);
+ sk->s = NULL;
+ goto do_connect; /* case 1 */
+ } else if (state >= TCP_ST_SYN_SENT &&
+ state < TCP_ST_ESTABLISHED)
+ errno = EALREADY;
+ else if (state >= TCP_ST_ESTABLISHED)
+ errno = EISCONN;
+ else
+ errno = EINVAL;
+ return -1;
+ }
+
+do_connect:
+ sk->s = open_bind(sk, local, addr);
+ if (sk->s == NULL) /* errno is set */
+ return -1;
+
+ /* Build the address handed to tle_tcp_stream_connect() from the
+ * stream's own "src" port/address fields.
+ * NOTE(review): presumably the stream keeps the remote endpoint in its
+ * "src" fields - confirm against the stream layout.
+ * NOTE(review): raddr is not zeroed, so members not assigned below
+ * (e.g. sin6_flowinfo, sin6_scope_id) are indeterminate - confirm the
+ * library never reads them.
+ */
+ if (sk->domain == AF_INET) {
+ addr4 = (struct sockaddr_in*)&raddr;
+ addr4->sin_family = AF_INET;
+ addr4->sin_port = sk->s->port.src;
+ addr4->sin_addr.s_addr = sk->s->ipv4.addr.src;
+ } else {
+ addr6 = (struct sockaddr_in6*)&raddr;
+ addr6->sin6_family = AF_INET6;
+ addr6->sin6_port = sk->s->port.src;
+ rte_memcpy(&addr6->sin6_addr, &sk->s->ipv6.addr.src,
+ sizeof(struct in6_addr));
+ }
+ rc = tle_tcp_stream_connect(sk->s, (const struct sockaddr*)&raddr);
+ if (rc < 0) {
+ errno = -rc;
+ return -1;
+ }
+
+ if (is_nonblock(sk, 0)) {
+ be_tx_with_lock(CTX(sk));
+ errno = EINPROGRESS; /* It could not be ready so fast */
+ return -1;
+ }
+
+ /* blocking connect: process the backend until the TX event (connected)
+ * or the error event (refused) is raised
+ */
+ do {
+ be_process(CTX(sk));
+
+ if (tle_event_state(&sk->txev) == TLE_SEV_UP) {
+ sk->tcp_connected = 1;
+ tle_event_down(&sk->txev);
+ ret = 0;
+ break;
+ }
+
+ if (tle_event_state(&sk->erev) == TLE_SEV_UP) {
+ tle_event_down(&sk->erev);
+ errno = ECONNREFUSED;
+ ret = -1;
+ break;
+ }
+
+ /* fix me: timeout? */
+ epoll_kernel_wait(CTX(sk), -1, &event, 1, 1, &rx);
+ } while (1);
+
+ return ret;
+}
+
+static void tcp_update_cfg(struct sock *sk);
+
+/*
+ * Accept one pending connection on a listening TCP socket.
+ *
+ * A descriptor is reserved first and released again on every failure path.
+ * When nothing is pending, non-blocking callers get -1 with errno = EAGAIN;
+ * blocking callers wait and retry.
+ *
+ * If both addr and addrlen are given, the peer address is returned using
+ * accept(2) semantics: at most *addrlen bytes are copied, and *addrlen is
+ * set to the real size of the address (sockaddr_in or sockaddr_in6,
+ * selected the same way tcp_getname() does).
+ *
+ * Returns the new descriptor, or -1 with errno set.
+ */
+static int
+tcp_accept(struct sock *sk, struct sockaddr *addr,
+	   socklen_t *addrlen, int flags)
+{
+	int fd;
+	int rx;
+	socklen_t len;
+	struct sock *newsk;
+	struct tle_stream *rs;
+	struct epoll_event event;
+	struct tle_tcp_stream_addr a;
+
+	if (sk->s == NULL) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	fd = get_unused_fd();
+	if (fd < 0) {
+		errno = ENFILE;
+		return -1;
+	}
+
+	newsk = fd2sock(fd);
+
+	while (tle_tcp_stream_accept(sk->s, &rs, 1) == 0) {
+		int err = rte_errno;
+
+		if (err == EAGAIN && !is_nonblock(sk, flags)) {
+			/* blocking socket with nothing pending: wait, retry */
+			epoll_kernel_wait(CTX(sk), -1, &event, 1, 1, &rx);
+			be_process(CTX(sk));
+			continue;
+		}
+
+		/* Hard error, or would-block on a non-blocking socket:
+		 * hand the reserved descriptor back before failing.
+		 */
+		newsk->valid = 0;
+		put_free_fd(fd);
+		errno = err;
+		return -1;
+	}
+
+	newsk->s = rs;
+	newsk->cid = sk->cid;
+	newsk->domain = sk->domain;
+	newsk->proto = sk->proto;
+	newsk->option.raw = 0;
+	newsk->option.tcpquickack = 1;
+	newsk->option.mulloop = 1;
+	newsk->option.multtl = 1;
+	newsk->option.keepidle = 2 * 60 * 60;
+	newsk->option.keepintvl = 75;
+	newsk->option.keepcnt = 9;
+	newsk->s->option.raw = newsk->option.raw;
+	sock_alloc_events(newsk);
+	tcp_update_cfg(newsk);
+
+	if (addr != NULL && addrlen != NULL) {
+		if (tle_tcp_stream_get_addr(rs, &a) == 0) {
+			len = (a.local.ss_family == AF_INET) ?
+				sizeof(struct sockaddr_in) :
+				sizeof(struct sockaddr_in6);
+			/* never write past the caller's buffer */
+			memcpy(addr, &a.remote,
+			       (*addrlen < len) ? *addrlen : len);
+			*addrlen = len;
+		} else {
+			/* not expected to fail; report "no address" */
+			*addrlen = 0;
+		}
+	}
+
+	GLUE_DEBUG("accept fd = %d", fd);
+	return fd;
+}
+
+/*
+ * Queue up to num mbufs from pkt[] on the socket's TCP stream.
+ *
+ * dst_addr is ignored. Returns the count reported by
+ * tle_tcp_stream_send(); a return of 0 comes with errno set (EPIPE when
+ * the socket has no stream, rte_errno otherwise).
+ */
+static ssize_t
+tcp_send(struct sock *sk, struct rte_mbuf *pkt[],
+	 uint16_t num, const struct sockaddr *dst_addr)
+{
+	uint16_t sent;
+
+	RTE_SET_USED(dst_addr);
+
+	if (sk->s == NULL) {
+		errno = EPIPE;
+		return 0;
+	}
+
+	sent = tle_tcp_stream_send(sk->s, pkt, num);
+	if (sent != 0)
+		return sent;
+
+	errno = rte_errno;
+	return sent;
+}
+
+/*
+ * Receive up to num mbufs from a TCP stream into pkt[].
+ *
+ * addr is ignored. Returns the count reported by tle_tcp_stream_recv();
+ * when that count is 0, errno is set from rte_errno.
+ */
+static ssize_t
+tcp_recv(struct tle_stream *s, struct rte_mbuf *pkt[],
+	 uint16_t num, struct sockaddr *addr)
+{
+	uint16_t got;
+
+	RTE_SET_USED(addr);
+
+	/* optimize me: merge multiple mbufs into one */
+	got = tle_tcp_stream_recv(s, pkt, num);
+	if (got != 0)
+		return got;
+
+	errno = rte_errno;
+	return got;
+}
+
+/*
+ * Receive from a TCP stream into the buffers described by msg.
+ *
+ * flags is unused. Returns the result of tle_tcp_stream_recvmsg(); on a
+ * negative result errno is set from rte_errno.
+ */
+static ssize_t
+tcp_readv(struct tle_stream *ts, struct msghdr *msg, int flags __rte_unused)
+{
+	ssize_t nread = tle_tcp_stream_recvmsg(ts, msg);
+
+	if (nread >= 0)
+		return nread;
+
+	errno = rte_errno;
+	return nread;
+}
+
+/*
+ * Send the data described by iov[0..iovcnt) on the socket's TCP stream.
+ *
+ * dst_addr is ignored. Returns the result of tle_tcp_stream_writev(), or
+ * -1 with errno = EPIPE when the socket has no stream. On a negative
+ * library result errno is set from rte_errno.
+ */
+static ssize_t
+tcp_writev(struct sock *sk, const struct iovec *iov,
+	   int iovcnt, const struct sockaddr *dst_addr)
+{
+	struct rte_mempool *pool = get_mempool_by_socket(0); /* fix me */
+	ssize_t nwritten;
+
+	RTE_SET_USED(dst_addr);
+
+	if (sk->s == NULL) {
+		errno = EPIPE;
+		return -1;
+	}
+
+	nwritten = tle_tcp_stream_writev(sk->s, pool, iov, iovcnt);
+	if (nwritten >= 0)
+		return nwritten;
+
+	errno = rte_errno;
+	return nwritten;
+}
+
+/*
+ * Shut down part of a TCP connection.
+ *
+ * SHUT_RD is a no-op that returns 0. For the other modes the stream is
+ * shut down and, on success, transmission is triggered so that the FIN
+ * leaves right away.
+ *
+ * Returns the result of tle_tcp_stream_shutdown(); on a negative result
+ * errno is set from rte_errno. A socket without a stream fails with -1 and
+ * errno = ENOTCONN (as udp_shutdown() does) instead of passing a NULL
+ * stream to the library.
+ */
+static int
+tcp_shutdown(struct sock *sk, int how)
+{
+	int ret;
+
+	/* Refer to linux/net/ipv4/tcp.c:tcp_shutdown() */
+	if (how == SHUT_RD)
+		return 0;
+
+	if (sk->s == NULL) {
+		errno = ENOTCONN;
+		return -1;
+	}
+
+	ret = tle_tcp_stream_shutdown(sk->s, how);
+	if (ret < 0)
+		errno = rte_errno;
+	else
+		be_tx_with_lock(CTX(sk)); /* Make sure fin is sent */
+	return ret;
+}
+
+/*
+ * Point the stream's receive, send and error events at the event objects
+ * embedded in sk. All other configuration fields are left zeroed.
+ */
+static void
+tcp_update_cfg(struct sock *sk)
+{
+	struct tle_tcp_stream_cfg cfg = {
+		.recv_ev = &sk->rxev,
+		.send_ev = &sk->txev,
+		.err_ev = &sk->erev,
+	};
+
+	/* one stream, one config entry */
+	tle_tcp_stream_update_cfg(&sk->s, &cfg, 1);
+}
+
+/*
+ * Operations table for TCP sockets. Every entry points at the tcp_*
+ * handler defined in this file, except .close, which is served directly by
+ * tle_tcp_stream_close().
+ */
+struct proto tcp_prot = {
+ .name = "TCP",
+ .setsockopt = tcp_setsockopt,
+ .getsockopt = tcp_getsockopt,
+ .getname = tcp_getname,
+ .bind = tcp_bind,
+ .listen = tcp_listen,
+ .connect = tcp_connect,
+ .accept = tcp_accept,
+ .recv = tcp_recv,
+ .send = tcp_send,
+ .readv = tcp_readv,
+ .writev = tcp_writev,
+ .shutdown = tcp_shutdown,
+ .close = tle_tcp_stream_close,
+ .update_cfg = tcp_update_cfg,
+};
diff --git a/lib/libtle_glue/tle_glue.h b/lib/libtle_glue/tle_glue.h
new file mode 100644
index 0000000..38357e4
--- /dev/null
+++ b/lib/libtle_glue/tle_glue.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _TLE_GLUE_H_
+#define _TLE_GLUE_H_
+
+/*
+ * _GNU_SOURCE only has an effect when it is defined before the first system
+ * header is included, so it must come ahead of every #include below.
+ * GNU-specific declarations used by this header (e.g. accept4, ppoll,
+ * splice) depend on it.
+ */
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <sys/types.h>
+#include <sys/epoll.h>
+#include <sys/select.h>	/* fd_set */
+#include <sys/socket.h>
+#include <sys/uio.h>	/* struct iovec */
+#include <poll.h>
+#include <signal.h>
+#include <time.h>	/* struct timespec */
+#include <unistd.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * PRE(name) produces the symbol name of each wrapper declared below.
+ * With PRELOAD defined the wrappers keep the plain names (socket, bind,
+ * ...); otherwise every name gets a "tle_" prefix (tle_socket, ...).
+ */
+#ifdef PRELOAD
+
+#define PRE(name) name
+
+#else
+
+#define PRE(name) tle_ ## name
+
+#endif
+
+void glue_init1(int argc, char **argv);
+
+/* epoll */
+int PRE(epoll_create)(int size);
+int PRE(epoll_create1)(int flags);
+int PRE(epoll_ctl)(int epfd, int op, int fd, struct epoll_event *event);
+int PRE(epoll_wait)(int epfd, struct epoll_event *events, int maxevents, int timeout);
+int PRE(epoll_pwait)(int epfd, struct epoll_event *events,
+ int maxevents, int timeout, const sigset_t *sigmask);
+
+/* for setup, settings, and destroy */
+int PRE(socket)(int domain, int type, int protocol);
+int PRE(listen)(int sockfd, int backlog);
+int PRE(bind)(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
+int PRE(accept)(int sockfd, struct sockaddr *addr, socklen_t *addrlen);
+int PRE(accept4)(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags);
+int PRE(connect)(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
+int PRE(getsockopt)(int sockfd, int level, int optname,
+ void *optval, socklen_t *optlen);
+int PRE(setsockopt)(int sockfd, int level, int optname,
+ const void *optval, socklen_t optlen);
+int PRE(getsockname)(int sockfd, struct sockaddr *addr, socklen_t *addrlen);
+int PRE(getpeername)(int sockfd, struct sockaddr *addr, socklen_t *addrlen);
+int PRE(fcntl)(int fd, int cmd, ... /* arg */ );
+int PRE(ioctl)(int d, unsigned long int request, ...);
+int PRE(shutdown)(int sockfd, int how);
+int PRE(close)(int fd);
+
+/* for recv */
+ssize_t PRE(recv)(int sockfd, void *buf, size_t len, int flags);
+ssize_t PRE(recvfrom)(int sockfd, void *buf, size_t len, int flags,
+ struct sockaddr *src_addr, socklen_t *addrlen);
+ssize_t PRE(recvmsg)(int sockfd, struct msghdr *msg, int flags);
+ssize_t PRE(read)(int fd, void *buf, size_t count);
+ssize_t PRE(readv)(int fd, const struct iovec *iov, int iovcnt);
+
+/* for send */
+ssize_t PRE(send)(int sockfd, const void *buf, size_t len, int flags);
+ssize_t PRE(sendto)(int sockfd, const void *buf, size_t len, int flags,
+ const struct sockaddr *dest_addr, socklen_t addrlen);
+ssize_t PRE(sendmsg)(int sockfd, const struct msghdr *msg, int flags);
+ssize_t PRE(write)(int fd, const void *buf, size_t count);
+ssize_t PRE(writev)(int fd, const struct iovec *iov, int iovcnt);
+
+/* advanced functions */
+ssize_t PRE(splice)(int fd_in, loff_t *off_in, int fd_out,
+ loff_t *off_out, size_t len, unsigned int flags);
+ssize_t PRE(sendfile)(int out_fd, int in_fd, off_t *offset, size_t count);
+
+/* poll */
+int PRE(poll)(struct pollfd *fds, nfds_t nfds, int timeout);
+int PRE(ppoll)(struct pollfd *fds, nfds_t nfds,
+ const struct timespec *tmo_p, const sigset_t *sigmask);
+
+/* select */
+int PRE(select)(int nfds, fd_set *readfds, fd_set *writefds,
+ fd_set *exceptfds, struct timeval *timeout);
+int PRE(pselect)(int nfds, fd_set *readfds, fd_set *writefds,
+ fd_set *exceptfds, const struct timespec *timeout,
+ const sigset_t *sigmask);
+
+/* non-posix APIs */
+int fd_ready(int fd, int events);
+void v_get_stats_snmp(unsigned long mibs[]);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _TLE_GLUE_H_ */
diff --git a/lib/libtle_glue/udp.c b/lib/libtle_glue/udp.c
new file mode 100644
index 0000000..9f199bc
--- /dev/null
+++ b/lib/libtle_glue/udp.c
@@ -0,0 +1,419 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdarg.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+
+#include <rte_ethdev.h>
+#include <tle_udp.h>
+
+#include "sym.h"
+#include "fd.h"
+#include "log.h"
+#include "util.h"
+#include "internal.h"
+#include "sock.h"
+
+/*
+ * Stub: every UDP-level socket option is accepted and ignored.
+ * Always returns 0; none of the arguments is inspected.
+ */
+static int
+udp_setsockopt(__rte_unused struct sock *sk, __rte_unused int optname,
+ __rte_unused const void *optval, __rte_unused socklen_t optlen)
+{
+ return 0;
+}
+
+/*
+ * Stub: always returns 0 without writing to optval or optlen.
+ * NOTE(review): a caller that reads *optval after this "success" sees
+ * whatever its buffer held before the call - confirm that is acceptable.
+ */
+static int
+udp_getsockopt(__rte_unused struct sock *sk, __rte_unused int optname,
+ __rte_unused void *optval, __rte_unused socklen_t *optlen)
+{
+ return 0;
+}
+
+/*
+ * Fill addr with the local (peer == 0) or remote (peer != 0) address of
+ * the socket's UDP stream.
+ *
+ * The number of bytes copied comes from get_sockaddr_len(sk->domain); the
+ * reported sa_family is always that of the local address.
+ * Returns 0 on success; on failure returns -1 with errno set to the negated
+ * return code of tle_udp_stream_get_param().
+ */
+static int
+udp_getname(struct sock *sk, struct sockaddr *addr, int peer)
+{
+	struct tle_udp_stream_param prm;
+	const void *from;
+	size_t len;
+	int err;
+
+	err = tle_udp_stream_get_param(sk->s, &prm);
+	if (err) {
+		errno = -err;
+		return -1;
+	}
+
+	len = get_sockaddr_len(sk->domain);
+	from = peer ? (const void *)&prm.remote_addr :
+		      (const void *)&prm.local_addr;
+
+	memcpy(addr, from, len);
+	addr->sa_family = prm.local_addr.ss_family;
+	return 0;
+}
+
+/*
+ * Bind a UDP socket to addr.
+ *
+ * A socket that is already bound is rejected with EINVAL. On success the
+ * socket is flagged as bound, and additionally as bound to the wildcard
+ * address when is_any_addr() says so.
+ * Returns 0 on success, -1 on failure.
+ */
+static int
+udp_bind(struct sock *sk, const struct sockaddr *addr)
+{
+	if (sk->ubind) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	sk->s = open_bind(sk, addr, NULL);
+	if (sk->s == NULL)
+		return -1;
+
+	sk->ubind = 1;
+	if (is_any_addr(addr))
+		sk->ubindany = 1;
+	return 0;
+}
+
+/*
+ * Associate a UDP socket with a remote address, or dissolve the
+ * association.
+ *
+ * According to the Linux manual, a connectionless socket may dissolve its
+ * association by connecting to an address whose sa_family is AF_UNSPEC
+ * (supported on Linux since kernel 2.2).
+ *
+ * Unbound socket: AF_UNSPEC closes the stream and succeeds; any other
+ * address opens a stream towards it.
+ * Bound socket: the stream is re-opened on its current local address
+ * (reset to the wildcard address when the original bind was to "any" and
+ * the association is being dissolved), with or without a remote address.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int
+udp_connect(struct sock *sk, const struct sockaddr *addr)
+{
+	struct sockaddr_storage laddr;
+
+	if (!sk->ubind) {
+		if (addr->sa_family != AF_UNSPEC) {
+			sk->s = open_bind(sk, NULL, addr);
+			return (sk->s != NULL) ? 0 : -1;
+		}
+
+		/* NOTE(review): sk->s may still be NULL here - confirm
+		 * tle_udp_stream_close() tolerates that.
+		 */
+		tle_udp_stream_close(sk->s);
+		sk->s = NULL;
+		return 0;
+	}
+
+	if (udp_getname(sk, (struct sockaddr *)&laddr, 0))
+		return -1;
+
+	if (addr->sa_family == AF_UNSPEC) {
+		addr = NULL;
+		if (sk->ubindany)
+			set_any_addr((struct sockaddr *)&laddr);
+	}
+
+	sk->s = open_bind(sk, (const struct sockaddr *)&laddr, addr);
+	return (sk->s != NULL) ? 0 : -1;
+}
+
+/*
+ * Prepare the socket and the destination address for a UDP send.
+ *
+ * p_dst_addr: in/out pointer to the destination; when the destination is
+ *             an IPv4-mapped IPv6 address it is copied into *addr, passed
+ *             through retrans_4mapped6_addr() and *p_dst_addr is redirected
+ *             to that copy.
+ * addr:       caller-provided scratch storage for the copy.
+ *
+ * A socket without a stream gets one opened towards the destination.
+ * Returns 0 on success, -1 with errno set on failure.
+ */
+static int
+udp_addr_prepare(struct sock *sk, const struct sockaddr **p_dst_addr,
+ struct sockaddr_storage *addr)
+{
+ const struct sockaddr *dst_addr = *p_dst_addr;
+
+ /* work on a private copy of a v4-mapped v6 destination */
+ if (dst_addr != NULL &&
+ dst_addr->sa_family == AF_INET6 &&
+ IN6_IS_ADDR_V4MAPPED(&((const struct sockaddr_in6 *)dst_addr)->sin6_addr)) {
+ rte_memcpy(addr, dst_addr, sizeof(struct sockaddr_in6));
+ dst_addr = (const struct sockaddr*)(addr);
+ *p_dst_addr = dst_addr;
+ retrans_4mapped6_addr((struct sockaddr_storage*)(addr));
+ }
+
+ if (sk->s == NULL) {
+ /* no stream yet: a destination is mandatory */
+ if (dst_addr == NULL) {
+ errno = EDESTADDRREQ;
+ return -1;
+ }
+
+ sk->s = open_bind(sk, NULL, dst_addr);
+ if (sk->s == NULL) /* errno is set */
+ return -1;
+ } else if (dst_addr != NULL) {
+ /* an IPv4 socket cannot send to an IPv6 destination */
+ if (dst_addr->sa_family == AF_INET6 && sk->domain == AF_INET) {
+ errno = EINVAL;
+ return -1;
+ }
+ /* IPv6 socket sending to an IPv4 destination: only allowed while
+ * the stream's IPv6 destination is unspecified, in which case the
+ * stream is switched to IPv4 with a cleared destination
+ */
+ if (dst_addr->sa_family == AF_INET && sk->domain == AF_INET6) {
+ if (IN6_IS_ADDR_UNSPECIFIED(&sk->s->ipv6.addr.dst)) {
+ sk->s->type = TLE_V4;
+ sk->s->ipv4.addr.dst = 0;
+ } else {
+ errno = ENETUNREACH;
+ return -1;
+ }
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Extract the sender's address from a received datagram into addr.
+ *
+ * The L3 and UDP headers are located just in front of the mbuf's current
+ * data pointer, using the mbuf's l3_len / l4_len. The IP version nibble
+ * selects between filling a sockaddr_in (version 4) and a sockaddr_in6
+ * (anything else). family is unused.
+ */
+static inline void
+udp_pkt_addr(const struct rte_mbuf *m, struct sockaddr *addr,
+	     __rte_unused uint16_t family)
+{
+	const struct ipv4_hdr *ip4h;
+	const struct udp_hdr *udph;
+	int off = -(m->l4_len + m->l3_len);
+
+	udph = rte_pktmbuf_mtod_offset(m, struct udp_hdr *, -m->l4_len);
+	ip4h = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *, off);
+
+	if ((ip4h->version_ihl >> 4) != 4) {
+		const struct ipv6_hdr *ip6h = (const struct ipv6_hdr *)ip4h;
+		struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr;
+
+		addr->sa_family = AF_INET6;
+		in6->sin6_port = udph->src_port;
+		rte_memcpy(&in6->sin6_addr, ip6h->src_addr,
+			   sizeof(in6->sin6_addr));
+		return;
+	}
+
+	struct sockaddr_in *in4 = (struct sockaddr_in *)addr;
+
+	addr->sa_family = AF_INET;
+	in4->sin_port = udph->src_port;
+	in4->sin_addr.s_addr = ip4h->src_addr;
+}
+
+/*
+ * Send pkt[0..num) as a single UDP datagram.
+ *
+ * The mbufs are linked into one chain headed by pkt[0], whose pkt_len and
+ * nb_segs are updated accordingly, and the chain is handed to the stream
+ * as one packet.
+ * Returns num on success and 0 on failure. When the stream refuses the
+ * packet, errno is set from rte_errno.
+ */
+static ssize_t
+udp_send(struct sock *sk, struct rte_mbuf *pkt[],
+	 uint16_t num, const struct sockaddr *dst_addr)
+{
+	struct sockaddr_storage scratch;
+	struct rte_mbuf *head;
+	uint16_t idx;
+
+	if (udp_addr_prepare(sk, &dst_addr, &scratch) != 0)
+		return 0;
+
+	/* chain them together as *one* message */
+	head = pkt[0];
+	for (idx = 1; idx < num; idx++) {
+		pkt[idx - 1]->next = pkt[idx];
+		head->pkt_len += pkt[idx]->pkt_len;
+	}
+	head->nb_segs = num;
+
+	if (tle_udp_stream_send(sk->s, &pkt[0], 1, dst_addr) != 0)
+		return num;
+
+	errno = rte_errno;
+	return 0;
+}
+
+/*
+ * Receive one UDP datagram into the buffers described by msg.
+ *
+ * msg must not be NULL (it is dereferenced unconditionally). When present,
+ * msg_control receives the packet timestamp if the stream has the
+ * timestamp option enabled (otherwise msg_controllen is cleared), and
+ * msg_name receives the sender's address with msg_namelen set to match its
+ * family.
+ *
+ * Data that does not fit into the iovecs is dropped and MSG_TRUNC is set
+ * in msg_flags. This includes the case of an empty iovec array, which used
+ * to read an uninitialized completion flag.
+ *
+ * Returns the number of bytes stored (plus the length of the dropped
+ * remainder when the caller passed MSG_TRUNC in flags), or -1 with errno
+ * set from rte_errno when nothing could be received.
+ */
+static ssize_t
+udp_readv(struct tle_stream *s, struct msghdr *msg, int flags)
+{
+	int i;
+	ssize_t sz;
+	uint16_t rc;
+	uint32_t fin = 0;	/* stays 0 until the mbuf is fully consumed */
+	struct iovec iv;
+	struct rte_mbuf *m;
+	const struct iovec *iov = msg->msg_iov;
+	int iovcnt = msg->msg_iovlen;
+
+	rc = tle_udp_stream_recv(s, &m, 1);
+	if (rc == 0) {
+		errno = rte_errno;
+		return -1;
+	}
+
+	if (!s->option.timestamp)
+		s->timestamp = m->timestamp;
+	if (msg->msg_control != NULL) {
+		if (s->option.timestamp)
+			tle_set_timestamp(msg, m);
+		else
+			msg->msg_controllen = 0;
+	}
+
+	if (msg->msg_name != NULL) {
+		udp_pkt_addr(m, (struct sockaddr *)msg->msg_name, 0);
+		if (((struct sockaddr *)msg->msg_name)->sa_family == AF_INET)
+			msg->msg_namelen = sizeof(struct sockaddr_in);
+		else
+			msg->msg_namelen = sizeof(struct sockaddr_in6);
+	}
+
+	/* NOTE(review): relies on _mbus_to_iovec() returning 1 once the mbuf
+	 * is drained and leaving the unused space of the iovec copy in
+	 * iv.iov_len - confirm against its definition.
+	 */
+	for (i = 0, sz = 0; i != iovcnt; i++) {
+		iv = iov[i];
+		sz += iv.iov_len;
+		fin = _mbus_to_iovec(&iv, &m, 1);
+		if (fin == 1) {
+			sz -= iv.iov_len;
+			break;
+		}
+	}
+	if (fin == 0) {
+		/* buffers exhausted before the datagram was */
+		if (flags & MSG_TRUNC)
+			sz += m->pkt_len;
+		rte_pktmbuf_free_seg(m);
+		msg->msg_flags |= MSG_TRUNC;
+	}
+	return sz;
+}
+
+/*
+ * Send the data described by iov[0..iovcnt) as one UDP datagram.
+ *
+ * The payload is copied into freshly allocated mbufs of at most slen bytes
+ * each. The first one carries 8 bytes less to leave room for the UDP
+ * header. The mbufs are then chained into a single packet and handed to
+ * the stream.
+ *
+ * The iovec array is never read outside [0, iovcnt): an empty array sends
+ * an empty datagram, and the cursor is not advanced past the last entry.
+ *
+ * Returns the number of payload bytes sent, or -1 with errno set (ENOMEM
+ * when mbufs cannot be allocated, rte_errno when the stream rejects the
+ * packet, or whatever udp_addr_prepare() reported).
+ */
+static ssize_t
+udp_writev(struct sock *sk, const struct iovec *iov,
+	   int iovcnt, const struct sockaddr *dst_addr)
+{
+	struct rte_mempool *mp = get_mempool_by_socket(0); /* fix me */
+	struct sockaddr_storage addr;
+	uint32_t slen, left_m, left_b, copy_len, left;
+	uint16_t i, rc, nb_mbufs;
+	char *dst, *src;
+	uint64_t ufo;
+	size_t total;
+	int j;
+
+	if (udp_addr_prepare(sk, &dst_addr, &addr) != 0)
+		return -1;
+
+	for (j = 0, total = 0; j < iovcnt; ++j)
+		total += iov[j].iov_len;
+
+	ufo = tx_offload & DEV_TX_OFFLOAD_UDP_TSO;
+	if (ufo)
+		slen = RTE_MBUF_DEFAULT_DATAROOM;
+	else
+		slen = 1500 - 20; /* mtu - ip_hdr_len */
+
+	/* + 8: the UDP header shares the first segment with the payload */
+	nb_mbufs = (total + 8 + slen - 1) / slen;
+	struct rte_mbuf *mbufs[nb_mbufs];
+	if (unlikely(rte_pktmbuf_alloc_bulk(mp, mbufs, nb_mbufs) != 0)) {
+		errno = ENOMEM;
+		return -1;
+	}
+
+	/* left_b: bytes not yet copied from iov[j] */
+	left_b = (iovcnt > 0) ? iov[0].iov_len : 0;
+	for (i = 0, j = 0; i < nb_mbufs && j < iovcnt; ++i) {
+		/* first frag has udp hdr, its payload is 8 bytes less */
+		if (i == 0)
+			slen -= 8;
+		else if (i == 1)
+			slen += 8;
+		/* left_m: free room left in mbufs[i] */
+		left_m = slen;
+		while (left_m > 0 && j < iovcnt) {
+			copy_len = RTE_MIN(left_m, left_b);
+			dst = rte_pktmbuf_mtod_offset(mbufs[i], char *,
+						      slen - left_m);
+			src = (char *)iov[j].iov_base + iov[j].iov_len - left_b;
+			rte_memcpy(dst, src, copy_len);
+
+			left_m -= copy_len;
+			left_b -= copy_len;
+			if (left_b == 0) {
+				/* move on to the next iovec, if there is one */
+				j++;
+				if (j < iovcnt)
+					left_b = iov[j].iov_len;
+			}
+		}
+		mbufs[i]->data_len = slen;
+		mbufs[i]->pkt_len = slen;
+	}
+
+	/* last seg: holds whatever the full segments before it did not */
+	if (nb_mbufs == 1) {
+		mbufs[nb_mbufs - 1]->data_len = total;
+		mbufs[nb_mbufs - 1]->pkt_len = total;
+	} else {
+		mbufs[nb_mbufs - 1]->data_len = total - (nb_mbufs - 1) * slen + 8;
+		mbufs[nb_mbufs - 1]->pkt_len = total - (nb_mbufs - 1) * slen + 8;
+	}
+
+	/* chain as *one* message */
+	for (i = 1; i < nb_mbufs; ++i)
+		mbufs[i-1]->next = mbufs[i];
+	mbufs[0]->nb_segs = nb_mbufs;
+	mbufs[0]->pkt_len = total;
+	nb_mbufs = 1;
+
+	rc = tle_udp_stream_send(sk->s, mbufs, nb_mbufs, dst_addr);
+	/* free what the stream did not take (the whole chain when rc == 0) */
+	for (i = rc, left = 0; i < nb_mbufs; ++i) {
+		left += mbufs[i]->pkt_len;
+		rte_pktmbuf_free(mbufs[i]);
+	}
+
+	if (rc == 0) {
+		errno = rte_errno;
+		return -1;
+	}
+
+	return total - left;
+}
+
+/*
+ * Receive up to num datagrams from a UDP stream into pkt[].
+ *
+ * The sender's address is written to addr only when exactly one packet was
+ * requested and exactly one was received. Returns the count reported by
+ * tle_udp_stream_recv(); when that count is 0, errno is set from rte_errno.
+ */
+static ssize_t
+udp_recv(struct tle_stream *s, struct rte_mbuf *pkt[], uint16_t num,
+	 struct sockaddr *addr)
+{
+	uint16_t got = tle_udp_stream_recv(s, pkt, num);
+
+	if (got == 0) {
+		errno = rte_errno;
+		return got;
+	}
+
+	if (addr != NULL && num == 1 && got == 1)
+		udp_pkt_addr(pkt[0], addr, 0);
+	return got;
+}
+
+/*
+ * Point the stream's receive and send events at the event objects embedded
+ * in sk. Every other parameter field is passed as zero.
+ */
+static void
+udp_update_cfg(struct sock *sk)
+{
+ struct tle_udp_stream_param prm;
+ /* start from an all-zero parameter block */
+ memset(&prm, 0, sizeof(prm));
+
+ prm.recv_ev = &sk->rxev;
+ prm.send_ev = &sk->txev;
+
+ /* one stream, one parameter entry */
+ tle_udp_stream_update_cfg(&sk->s, &prm, 1);
+}
+
+/*
+ * Shut down a UDP socket in the direction given by how.
+ *
+ * Returns 0 on success. Fails with -1 and errno = ENOTCONN when the socket
+ * has no stream, or with errno set to the negated return code of
+ * tle_udp_stream_shutdown().
+ */
+static int
+udp_shutdown(struct sock *sk, int how)
+{
+	int err;
+
+	if (sk->s == NULL) {
+		errno = ENOTCONN;
+		return -1;
+	}
+
+	err = tle_udp_stream_shutdown(sk->s, how);
+	if (err >= 0)
+		return 0;
+
+	errno = -err;
+	return -1;
+}
+
+/*
+ * Operations table for UDP sockets. Entries point at the udp_* handlers
+ * defined in this file, except .close, which is served directly by
+ * tle_udp_stream_close(). No .listen or .accept handler is provided.
+ */
+struct proto udp_prot = {
+ .name = "UDP",
+ .setsockopt = udp_setsockopt,
+ .getsockopt = udp_getsockopt,
+ .getname = udp_getname,
+ .bind = udp_bind,
+ .connect = udp_connect,
+ .recv = udp_recv,
+ .send = udp_send,
+ .readv = udp_readv,
+ .writev = udp_writev,
+ .shutdown = udp_shutdown,
+ .close = tle_udp_stream_close,
+ .update_cfg = udp_update_cfg,
+};
diff --git a/lib/libtle_glue/util.c b/lib/libtle_glue/util.c
new file mode 100644
index 0000000..69fc555
--- /dev/null
+++ b/lib/libtle_glue/util.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <pthread.h>
+#include <sched.h>
+#include <unistd.h>
+
+#include "util.h"
+
+#define NUMA_NODE_PATH "/sys/devices/system/node"
+
+/*
+ * Find the NUMA node that owns lcore_id by probing
+ * NUMA_NODE_PATH/node<N>/cpu<lcore_id> for each N below
+ * RTE_MAX_NUMA_NODES.
+ *
+ * Returns the first node whose entry exists, or 0 when none does.
+ */
+static unsigned
+eal_cpu_socket_id(unsigned lcore_id)
+{
+	char path[PATH_MAX];
+	unsigned node = 0;
+
+	while (node < RTE_MAX_NUMA_NODES) {
+		snprintf(path, sizeof(path), "%s/node%u/cpu%u", NUMA_NODE_PATH,
+			 node, lcore_id);
+		if (access(path, F_OK) == 0)
+			return node;
+		node++;
+	}
+	return 0;
+}
+
+/*
+ * Return the NUMA node of the lowest-numbered CPU in the calling thread's
+ * affinity mask, as resolved by eal_cpu_socket_id().
+ *
+ * Returns 0 when the affinity mask cannot be read.
+ */
+uint32_t
+get_socket_id(void)
+{
+	cpu_set_t mask;
+	uint32_t cpu = 0;
+
+	CPU_ZERO(&mask);
+	if (pthread_getaffinity_np(pthread_self(), sizeof(mask), &mask))
+		return 0;
+
+	/* stop at the first CPU that is part of the mask */
+	while (cpu < CPU_SETSIZE && !CPU_ISSET(cpu, &mask))
+		cpu++;
+
+	return eal_cpu_socket_id(cpu);
+}
diff --git a/lib/libtle_glue/util.h b/lib/libtle_glue/util.h
new file mode 100644
index 0000000..ac67d8b
--- /dev/null
+++ b/lib/libtle_glue/util.h
@@ -0,0 +1,377 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _TLE_GLUE_UTIL_H_
+#define _TLE_GLUE_UTIL_H_
+
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <tle_tcp.h>
+#include <tle_udp.h>
+
+#include "../libtle_l4p/tcp_stream.h"
+
+#include "fd.h"
+#include "ctx.h"
+#include "sock.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * strdup() wrapper that never returns NULL: the process is terminated
+ * through rte_panic() when the copy cannot be allocated.
+ * Returns the pointer obtained from strdup().
+ */
+static inline void *
+xstrdup(const void *old)
+{
+	/* not named "new": that is a C++ keyword and this header is
+	 * wrapped in extern "C" for C++ users */
+	void *copy = strdup(old);
+
+	if (unlikely(copy == NULL))
+		rte_panic("Failed to strdup");
+	return copy;
+}
+
+/*
+ * malloc() wrapper that calls rte_panic() instead of returning NULL.
+ * A request for zero bytes is served as a one-byte allocation.
+ */
+static inline void *
+xmalloc(size_t size)
+{
+	void *mem;
+
+	if (size == 0)
+		size = 1;
+
+	mem = malloc(size);
+	if (mem == NULL)
+		rte_panic("Failed to malloc");
+
+	return mem;
+}
+
+/*
+ * Format into a freshly allocated string (va_list flavour).
+ * A first vsnprintf() pass computes the length, the buffer is taken
+ * from xmalloc(), and a second pass writes the text.
+ * Calls rte_panic() when vsnprintf() reports a formatting error.
+ */
+static inline char *
+xvasprintf(const char *format, va_list args)
+{
+	va_list args2;
+	int needed;
+	char *s;
+
+	/* args is consumed by the sizing pass; keep a copy for the real one */
+	va_copy(args2, args);
+	needed = vsnprintf(NULL, 0, format, args);
+	if (needed < 0) {
+		/* a negative length must not be turned into a huge size_t */
+		va_end(args2);
+		rte_panic("Failed to format string");
+	}
+
+	s = xmalloc((size_t)needed + 1);
+
+	vsnprintf(s, (size_t)needed + 1, format, args2);
+	va_end(args2);
+
+	return s;
+}
+
+/*
+ * printf-style front end of xvasprintf(): gathers the variadic
+ * arguments and returns the string xvasprintf() builds from them.
+ */
+static inline char *
+xasprintf(const char *format, ...)
+{
+	va_list ap;
+	char *formatted;
+
+	va_start(ap, format);
+	formatted = xvasprintf(format, ap);
+	va_end(ap);
+
+	return formatted;
+}
+
+/*
+ * Resize an argv-style array of string pointers so that it can hold
+ * cur_siz + grow_by entries. Calls rte_panic() when realloc() fails;
+ * otherwise returns the pointer handed back by realloc().
+ */
+static inline char **
+grow_argv(char **argv, size_t cur_siz, size_t grow_by)
+{
+	size_t nb_slots = cur_siz + grow_by;
+	char **resized = realloc(argv, sizeof(char *) * nb_slots);
+
+	if (unlikely(resized == NULL))
+		rte_panic("Failed to grow argv");
+
+	return resized;
+}
+
+/*
+ * Free the first argc strings of argv_to_release, then the
+ * argv_to_release array itself, then the argv array.
+ */
+static inline void
+release_argv(int argc, char **argv_to_release, char **argv)
+{
+	char **entry = argv_to_release;
+	int left;
+
+	for (left = argc; left > 0; left--)
+		free(*entry++);
+
+	free(argv_to_release);
+	free(argv);
+}
+
+/*
+ * Point an event at its event queue and record the user data pointer
+ * on it. Only the head and data fields of the event are written.
+ */
+static inline void
+tle_event_attach(struct tle_event *ev, struct tle_evq *evq, const void *data)
+{
+	ev->data = data;
+	ev->head = evq;
+}
+
+/*
+ * Attach the socket's error, receive and transmit events to the ereq,
+ * rxeq and txeq queues of its context, with the socket itself as event
+ * data, then pass them to tle_event_active() with TLE_SEV_DOWN.
+ * When LOOK_ASIDE_BACKEND is defined only the error event gets that
+ * second call.
+ */
+static inline void
+sock_alloc_events(struct sock *so)
+{
+	struct glue_ctx *ctx = CTX(so);
+
+	tle_event_attach(&so->erev, ctx->ereq, so);
+	tle_event_attach(&so->rxev, ctx->rxeq, so);
+	tle_event_attach(&so->txev, ctx->txeq, so);
+
+	tle_event_active(&so->erev, TLE_SEV_DOWN);
+#ifndef LOOK_ASIDE_BACKEND
+	tle_event_active(&so->rxev, TLE_SEV_DOWN);
+	tle_event_active(&so->txev, TLE_SEV_DOWN);
+#endif
+}
+
+/*
+ * Call tle_event_active() with TLE_SEV_DOWN on the socket's error,
+ * receive and transmit events, in that order.
+ */
+static inline void
+sock_active_events(struct sock *so)
+{
+	struct tle_event *events[] = { &so->erev, &so->rxev, &so->txev };
+	size_t idx;
+
+	for (idx = 0; idx < sizeof(events) / sizeof(events[0]); idx++)
+		tle_event_active(events[idx], TLE_SEV_DOWN);
+}
+
+/*
+ * Choose the local IPv6 address used to reach remote: the loopback
+ * address when remote is loopback, otherwise the IPv6 address stored
+ * in the context.
+ */
+static inline const struct in6_addr*
+select_local_addr_v6(const struct sockaddr *remote, struct glue_ctx *ctx)
+{
+	const struct sockaddr_in6 *peer = (const struct sockaddr_in6 *)remote;
+
+	/* todo: implement route table to decide local address */
+	if (!IN6_IS_ADDR_LOOPBACK(&peer->sin6_addr))
+		return &ctx->ipv6;
+
+	return &in6addr_loopback;
+}
+
+/*
+ * Choose the local IPv4 address (network byte order) used to reach
+ * remote: loopback when remote is loopback, otherwise the IPv4 address
+ * stored in the context.
+ */
+static inline in_addr_t
+select_local_addr(const struct sockaddr *remote, struct glue_ctx *ctx)
+{
+	const struct sockaddr_in *peer = (const struct sockaddr_in *)remote;
+	const in_addr_t loopback = htonl(INADDR_LOOPBACK);
+
+	/* todo: implement route table to decide local address */
+	if (peer->sin_addr.s_addr != loopback)
+		return ctx->ipv4;
+
+	return loopback;
+}
+
+/*
+ * Tell whether addr holds the wildcard address of its family:
+ * INADDR_ANY for AF_INET, the unspecified address for AF_INET6.
+ * Any other family yields false.
+ */
+static inline bool
+is_any_addr(const struct sockaddr *addr)
+{
+	switch (addr->sa_family) {
+	case AF_INET:
+		return ((const struct sockaddr_in *)addr)->sin_addr.s_addr ==
+			htonl(INADDR_ANY);
+	case AF_INET6:
+		return IN6_IS_ADDR_UNSPECIFIED(
+			&((const struct sockaddr_in6 *)addr)->sin6_addr) != 0;
+	default:
+		return false;
+	}
+}
+
+/*
+ * Overwrite the address part of addr with the wildcard address of its
+ * family (INADDR_ANY or in6addr_any). Other families are left untouched.
+ */
+static inline void
+set_any_addr(struct sockaddr *addr)
+{
+	switch (addr->sa_family) {
+	case AF_INET:
+		((struct sockaddr_in *)addr)->sin_addr.s_addr =
+			htonl(INADDR_ANY);
+		break;
+	case AF_INET6:
+		((struct sockaddr_in6 *)addr)->sin6_addr = in6addr_any;
+		break;
+	default:
+		/* nothing to reset for any other family */
+		break;
+	}
+}
+
+/*
+ * Transform, in place, an IPv4 address (in struct sockaddr_in) into an
+ * IPv4-mapped IPv6 address (in struct sockaddr_in6). Addresses of any
+ * other family are left untouched. The storage behind addr is written
+ * as a struct sockaddr_in6, so it has to be large enough for one.
+ */
+static inline void
+trans_4mapped6_addr(struct sockaddr *addr)
+{
+	struct sockaddr_in6 *addr6;
+	in_addr_t v4;
+
+	if (addr->sa_family != AF_INET)
+		return;
+
+	/* read the IPv4 address before the IPv6 view of the buffer is written */
+	v4 = ((struct sockaddr_in *)addr)->sin_addr.s_addr;
+
+	addr6 = (struct sockaddr_in6 *)addr;
+	addr6->sin6_family = AF_INET6;
+	/* NOTE(review): with the Linux layouts sin6_flowinfo shares its
+	 * bytes with sin_addr, so it would otherwise keep the IPv4 address */
+	addr6->sin6_flowinfo = 0;
+	addr6->sin6_addr.s6_addr32[0] = 0;
+	addr6->sin6_addr.s6_addr32[1] = 0;
+	/* 00 00 ff ff prefix of ::ffff:a.b.c.d, correct on any byte order */
+	addr6->sin6_addr.s6_addr32[2] = htonl(0x0000ffff);
+	addr6->sin6_addr.s6_addr32[3] = v4;
+}
+
+/*
+ * Transform, in place, an IPv4-mapped IPv6 address (in struct
+ * sockaddr_in6) back into an IPv4 address (in struct sockaddr_in).
+ * Addresses that are not AF_INET6, or that are not IPv4-mapped, are
+ * left untouched.
+ */
+static inline void
+retrans_4mapped6_addr(struct sockaddr_storage * addr)
+{
+	struct sockaddr_in6 *addr6;
+	in_addr_t v4;
+
+	/* only an AF_INET6 sockaddr may be read through the IPv6 layout */
+	if (addr->ss_family != AF_INET6)
+		return;
+
+	addr6 = (struct sockaddr_in6 *)addr;
+	if (!IN6_IS_ADDR_V4MAPPED(&addr6->sin6_addr))
+		return;
+
+	/* s6_addr32 is the public accessor, also used by
+	 * trans_4mapped6_addr(), instead of glibc's __in6_u internals */
+	v4 = addr6->sin6_addr.s6_addr32[3];
+	addr->ss_family = AF_INET;
+	((struct sockaddr_in *)addr)->sin_addr.s_addr = v4;
+}
+
+/*
+ * Build the stream parameters for a socket from its options, events
+ * and the given addresses, then open the underlying tle stream. For a
+ * UDP socket that already has a stream, tle_udp_stream_set() is called
+ * on it instead of opening a new one.
+ *
+ * so     - socket whose options, events and context are used.
+ * local  - local address to bind to, or NULL.
+ * remote - remote address, or NULL.
+ *
+ * Returns the stream, or NULL with errno set: EINVAL or ENETUNREACH
+ * when local and remote families cannot be reconciled, EADDRNOTAVAIL
+ * when the local address is not usable, rte_errno when the stream call
+ * itself fails.
+ */
+static inline struct tle_stream *
+open_bind(struct sock *so, const struct sockaddr *local,
+ const struct sockaddr *remote)
+{
+ struct tle_stream *s;
+ struct sockaddr_storage *l, *r;
+ struct sockaddr_in *addr4;
+ struct sockaddr_in6 *addr6;
+ struct tle_tcp_stream_param pt = {0};
+ struct tle_udp_stream_param pu = {0};
+
+ /* l and r point into the parameter block matching the protocol */
+ if (IS_TCP(so)) {
+ pt.option = so->option.raw;
+ l = &pt.addr.local;
+ r = &pt.addr.remote;
+ pt.cfg.err_ev = &so->erev;
+ pt.cfg.recv_ev = &so->rxev;
+ pt.cfg.send_ev = &so->txev;
+ } else {
+ pu.option = so->option.raw;
+ l = &pu.local_addr;
+ r = &pu.remote_addr;
+ pu.recv_ev = &so->rxev;
+ pu.send_ev = &so->txev;
+ }
+
+ /* copy the remote address, turn an IPv4-mapped one back into IPv4,
+ * and replace an unspecified remote address by loopback */
+ if (remote) {
+ memcpy(r, remote, get_sockaddr_len(remote->sa_family));
+ retrans_4mapped6_addr(r);
+ if(r->ss_family == AF_INET) {
+ addr4 = (struct sockaddr_in*)r;
+ if (addr4->sin_addr.s_addr == 0)
+ addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+ } else {
+ addr6 = (struct sockaddr_in6*)r;
+ if (IN6_IS_ADDR_UNSPECIFIED(&addr6->sin6_addr))
+ rte_memcpy(&addr6->sin6_addr, &in6addr_loopback,
+ sizeof(struct in6_addr));
+ }
+ }
+
+ /* without a local address only the family is filled in: taken from
+ * the remote address if there is one, else from the socket domain */
+ if (local) {
+ memcpy(l, local, get_sockaddr_len(local->sa_family));
+ retrans_4mapped6_addr(l);
+ } else {
+ if (remote)
+ l->ss_family = r->ss_family;
+ else
+ l->ss_family = so->domain;
+ }
+
+ if (!remote)
+ r->ss_family = l->ss_family;
+
+ /* Endpoints of stream have different socket families */
+ if (r->ss_family != l->ss_family) {
+ if (l->ss_family == AF_INET) {
+ errno = EINVAL;
+ return NULL;
+ } else {
+ /* if local addr is unbound, convert into remote family */
+ if (IN6_IS_ADDR_UNSPECIFIED(&((struct sockaddr_in6*)l)->sin6_addr)) {
+ l->ss_family = AF_INET;
+ ((struct sockaddr_in*)l)->sin_addr.s_addr = 0;
+ } else {
+ errno = ENETUNREACH;
+ return NULL;
+ }
+ }
+ }
+
+ /* a wildcard local address with a remote is resolved through
+ * select_local_addr()/select_local_addr_v6(); any other local address
+ * has to be the context address, loopback or the wildcard */
+ if (l->ss_family == AF_INET) {
+ addr4 = (struct sockaddr_in*)l;
+ if (addr4->sin_addr.s_addr == htonl(INADDR_ANY) && remote) {
+ addr4->sin_addr.s_addr =
+ select_local_addr((struct sockaddr*)r, CTX(so));
+ if (addr4->sin_addr.s_addr == htonl(INADDR_ANY)) {
+ errno = EADDRNOTAVAIL;
+ return NULL;
+ }
+ }
+ else if (addr4->sin_addr.s_addr != CTX(so)->ipv4 &&
+ addr4->sin_addr.s_addr != htonl(INADDR_LOOPBACK) &&
+ addr4->sin_addr.s_addr != htonl(INADDR_ANY)) {
+ errno = EADDRNOTAVAIL;
+ return NULL;
+ }
+ } else {
+ addr6 = (struct sockaddr_in6 *)l;
+ if (IN6_IS_ADDR_UNSPECIFIED(&addr6->sin6_addr) && remote) {
+ memcpy(&addr6->sin6_addr,
+ select_local_addr_v6((struct sockaddr*)r, CTX(so)),
+ sizeof(struct in6_addr));
+ if (IN6_IS_ADDR_UNSPECIFIED(&addr6->sin6_addr)) {
+ errno = EADDRNOTAVAIL;
+ return NULL;
+ }
+ }
+ else if (memcmp(&addr6->sin6_addr, &CTX(so)->ipv6,
+ sizeof(struct in6_addr)) != 0 &&
+ (!IN6_IS_ADDR_LOOPBACK(&addr6->sin6_addr)) &&
+ (!IN6_IS_ADDR_UNSPECIFIED(&addr6->sin6_addr))) {
+ errno = EADDRNOTAVAIL;
+ return NULL;
+ }
+ }
+
+ /* TCP always opens a new stream; UDP opens one only when the socket
+ * has none yet (so->s == NULL) */
+ if (IS_TCP(so))
+ s = tle_tcp_stream_open(CTX(so)->tcp_ctx, &pt);
+ else {
+ if (so->s == NULL)
+ s = tle_udp_stream_open(CTX(so)->udp_ctx, &pu);
+ else
+ s = tle_udp_stream_set(so->s, CTX(so)->udp_ctx, &pu);
+ }
+
+ /* report the stream layer's error code through errno */
+ if (s == NULL)
+ errno = rte_errno;
+
+ return s;
+}
+
+/*
+ * Open a stream bound to local through open_bind() and put it into the
+ * listen state with tle_tcp_stream_listen().
+ * Returns the stream, or NULL with errno set; a stream that cannot be
+ * put into the listen state is closed before returning.
+ */
+static inline struct tle_stream *
+open_bind_listen(struct sock *so, const struct sockaddr *local)
+{
+	int rc;
+	struct tle_stream *s = open_bind(so, local, NULL);
+
+	if (s == NULL)
+		return NULL;
+
+	rc = tle_tcp_stream_listen(s);
+	if (rc != 0) {
+		tle_tcp_stream_close(s);
+		/* set errno after the close so the close cannot clobber it.
+		 * NOTE(review): assumes tle_tcp_stream_listen() returns a
+		 * negative errno-style code on failure - confirm */
+		errno = (rc < 0) ? -rc : rc;
+		return NULL;
+	}
+
+	return s;
+}
+
+uint32_t get_socket_id(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*_TLE_GLUE_UTIL_H_ */
diff --git a/lib/libtle_glue/zerocopy.h b/lib/libtle_glue/zerocopy.h
new file mode 100644
index 0000000..a37f8f5
--- /dev/null
+++ b/lib/libtle_glue/zerocopy.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _TLE_GLUE_ZEROCOPY_H_
+#define _TLE_GLUE_ZEROCOPY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * This API performs recv operation on specified socket, and it's
+ * optimized for zero copy, which means the caller does not need to
+ * prepare the buffer, instead, it will get a pointer on success.
+ * @param sockfd
+ * the file descriptor for the socket.
+ * @param buf
+ * after successfully receiving some payload, the pointer of the
+ * received buffer will be stored in *buf.
+ * @return
+ * the number of bytes received, or -1 if an error occurred, or 0
+ * if a stream socket peer has performed an orderly shutdown.
+ *
+ */
+ssize_t recv_zc(int sockfd, void **buf);
+
+/**
+ * This API performs a send operation on the specified socket, and it's
+ * optimized for zero copy, which means the caller does not need to
+ * free the buffer, nor even touch it, after calling this
+ * API; the buffer will be freed after an ack from the socket peer.
+ * @param sockfd
+ * the file descriptor for the socket.
+ * @param buf
+ * The pointer to the payload buffer to be sent.
+ * @param len
+ * The length of the payload buffer to be sent.
+ * @return
+ * the number of bytes sent, or -1 if an error occurred.
+ */
+ssize_t send_zc(int sockfd, const void *buf, size_t len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*_TLE_GLUE_ZEROCOPY_H_ */
diff --git a/lib/libtle_l4p/Makefile b/lib/libtle_l4p/Makefile
index e1357d1..ee81d4a 100644
--- a/lib/libtle_l4p/Makefile
+++ b/lib/libtle_l4p/Makefile
@@ -45,6 +45,7 @@ SYMLINK-y-include += tle_ctx.h
SYMLINK-y-include += tle_event.h
SYMLINK-y-include += tle_tcp.h
SYMLINK-y-include += tle_udp.h
+SYMLINK-y-include += tle_stats.h
# this lib dependencies
DEPDIRS-y += lib/libtle_misc
diff --git a/lib/libtle_l4p/ctx.c b/lib/libtle_l4p/ctx.c
index b8067f0..d6bde48 100644
--- a/lib/libtle_l4p/ctx.c
+++ b/lib/libtle_l4p/ctx.c
@@ -21,9 +21,14 @@
#include <rte_ip.h>
#include "stream.h"
+#include "stream_table.h"
#include "misc.h"
#include <halfsiphash.h>
+struct tle_mib default_mib;
+
+RTE_DEFINE_PER_LCORE(struct tle_mib *, mib) = &default_mib;
+
#define LPORT_START 0x8000
#define LPORT_END MAX_PORT_NUM
@@ -103,6 +108,16 @@ tle_ctx_create(const struct tle_ctx_param *ctx_prm)
ctx->prm = *ctx_prm;
+ rc = bhash_init(ctx);
+ if (rc != 0) {
+ UDP_LOG(ERR, "create bhash table (ctx=%p, proto=%u) failed "
+ "with error code: %d;\n",
+ ctx, ctx_prm->proto, rc);
+ tle_ctx_destroy(ctx);
+ rte_errno = -rc;
+ return NULL;
+ }
+
rc = tle_stream_ops[ctx_prm->proto].init_streams(ctx);
if (rc != 0) {
UDP_LOG(ERR, "init_streams(ctx=%p, proto=%u) failed "
@@ -114,9 +129,10 @@ tle_ctx_create(const struct tle_ctx_param *ctx_prm)
}
for (i = 0; i != RTE_DIM(ctx->use); i++)
- tle_pbm_init(ctx->use + i, LPORT_START_BLK);
+ tle_psm_init(ctx->use + i);
- ctx->streams.nb_free = ctx->prm.max_streams;
+ ctx->streams.nb_free = ctx->prm.min_streams;
+ ctx->streams.nb_cur = ctx->prm.min_streams;
/* Initialization of siphash state is done here to speed up the
* fastpath processing.
@@ -124,6 +140,11 @@ tle_ctx_create(const struct tle_ctx_param *ctx_prm)
if (ctx->prm.hash_alg == TLE_SIPHASH)
siphash_initialization(&ctx->prm.secret_key,
&ctx->prm.secret_key);
+
+ rte_spinlock_init(&ctx->dev_lock);
+ rte_spinlock_init(&ctx->bhash_lock[TLE_V4]);
+ rte_spinlock_init(&ctx->bhash_lock[TLE_V6]);
+
return ctx;
}
@@ -137,6 +158,8 @@ tle_ctx_destroy(struct tle_ctx *ctx)
return;
}
+ bhash_fini(ctx);
+
for (i = 0; i != RTE_DIM(ctx->dev); i++)
tle_del_dev(ctx->dev + i);
@@ -150,37 +173,6 @@ tle_ctx_invalidate(struct tle_ctx *ctx)
RTE_SET_USED(ctx);
}
-static void
-fill_pbm(struct tle_pbm *pbm, const struct tle_bl_port *blp)
-{
- uint32_t i;
-
- for (i = 0; i != blp->nb_port; i++)
- tle_pbm_set(pbm, blp->port[i]);
-}
-
-static int
-init_dev_proto(struct tle_dev *dev, uint32_t idx, int32_t socket_id,
- const struct tle_bl_port *blp)
-{
- size_t sz;
-
- sz = sizeof(*dev->dp[idx]);
- dev->dp[idx] = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE,
- socket_id);
-
- if (dev->dp[idx] == NULL) {
- UDP_LOG(ERR, "allocation of %zu bytes on "
- "socket %d for %u-th device failed\n",
- sz, socket_id, idx);
- return ENOMEM;
- }
-
- tle_pbm_init(&dev->dp[idx]->use, LPORT_START_BLK);
- fill_pbm(&dev->dp[idx]->use, blp);
- return 0;
-}
-
static struct tle_dev *
find_free_dev(struct tle_ctx *ctx)
{
@@ -214,27 +206,8 @@ tle_add_dev(struct tle_ctx *ctx, const struct tle_dev_param *dev_prm)
return NULL;
rc = 0;
- /* device can handle IPv4 traffic */
- if (dev_prm->local_addr4.s_addr != INADDR_ANY) {
- rc = init_dev_proto(dev, TLE_V4, ctx->prm.socket_id,
- &dev_prm->bl4);
- if (rc == 0)
- fill_pbm(&ctx->use[TLE_V4], &dev_prm->bl4);
- }
-
- /* device can handle IPv6 traffic */
- if (rc == 0 && memcmp(&dev_prm->local_addr6, &tle_ipv6_any,
- sizeof(tle_ipv6_any)) != 0) {
- rc = init_dev_proto(dev, TLE_V6, ctx->prm.socket_id,
- &dev_prm->bl6);
- if (rc == 0)
- fill_pbm(&ctx->use[TLE_V6], &dev_prm->bl6);
- }
-
if (rc != 0) {
/* cleanup and return an error. */
- rte_free(dev->dp[TLE_V4]);
- rte_free(dev->dp[TLE_V6]);
rte_errno = rc;
return NULL;
}
@@ -246,16 +219,19 @@ tle_add_dev(struct tle_ctx *ctx, const struct tle_dev_param *dev_prm)
if ((dev_prm->tx_offload & DEV_TX_OFFLOAD_UDP_CKSUM) != 0 &&
ctx->prm.proto == TLE_PROTO_UDP) {
- dev->tx.ol_flags[TLE_V4] |= PKT_TX_IPV4 | PKT_TX_UDP_CKSUM;
- dev->tx.ol_flags[TLE_V6] |= PKT_TX_IPV6 | PKT_TX_UDP_CKSUM;
+ dev->tx.ol_flags[TLE_V4] |= PKT_TX_UDP_CKSUM;
+ dev->tx.ol_flags[TLE_V6] |= PKT_TX_UDP_CKSUM;
} else if ((dev_prm->tx_offload & DEV_TX_OFFLOAD_TCP_CKSUM) != 0 &&
ctx->prm.proto == TLE_PROTO_TCP) {
- dev->tx.ol_flags[TLE_V4] |= PKT_TX_IPV4 | PKT_TX_TCP_CKSUM;
- dev->tx.ol_flags[TLE_V6] |= PKT_TX_IPV6 | PKT_TX_TCP_CKSUM;
+ dev->tx.ol_flags[TLE_V4] |= PKT_TX_TCP_CKSUM;
+ dev->tx.ol_flags[TLE_V6] |= PKT_TX_TCP_CKSUM;
}
if ((dev_prm->tx_offload & DEV_TX_OFFLOAD_IPV4_CKSUM) != 0)
- dev->tx.ol_flags[TLE_V4] |= PKT_TX_IPV4 | PKT_TX_IP_CKSUM;
+ dev->tx.ol_flags[TLE_V4] |= PKT_TX_IP_CKSUM;
+
+ dev->tx.ol_flags[TLE_V4] |= PKT_TX_IPV4;
+ dev->tx.ol_flags[TLE_V6] |= PKT_TX_IPV6;
dev->prm = *dev_prm;
dev->ctx = ctx;
@@ -300,220 +276,97 @@ tle_del_dev(struct tle_dev *dev)
ctx = dev->ctx;
p = dev - ctx->dev;
- if (p >= RTE_DIM(ctx->dev) ||
- (dev->dp[TLE_V4] == NULL &&
- dev->dp[TLE_V6] == NULL))
+ if (p >= RTE_DIM(ctx->dev))
return -EINVAL;
/* emtpy TX queues. */
empty_dring(&dev->tx.dr, ctx->prm.proto);
- rte_free(dev->dp[TLE_V4]);
- rte_free(dev->dp[TLE_V6]);
memset(dev, 0, sizeof(*dev));
ctx->nb_dev--;
return 0;
}
-static struct tle_dev *
-find_ipv4_dev(struct tle_ctx *ctx, const struct in_addr *addr)
-{
- uint32_t i;
-
- for (i = 0; i != RTE_DIM(ctx->dev); i++) {
- if (ctx->dev[i].prm.local_addr4.s_addr == addr->s_addr &&
- ctx->dev[i].dp[TLE_V4] != NULL)
- return ctx->dev + i;
- }
-
- return NULL;
-}
-
-static struct tle_dev *
-find_ipv6_dev(struct tle_ctx *ctx, const struct in6_addr *addr)
+int
+stream_fill_ctx(struct tle_ctx *ctx, struct tle_stream *s,
+ const struct sockaddr *laddr, const struct sockaddr *raddr)
{
- uint32_t i;
+ struct sockaddr_storage addr;
+ int32_t rc = 0;
- for (i = 0; i != RTE_DIM(ctx->dev); i++) {
- if (memcmp(&ctx->dev[i].prm.local_addr6, addr,
- sizeof(*addr)) == 0 &&
- ctx->dev[i].dp[TLE_V6] != NULL)
- return ctx->dev + i;
+ if (laddr->sa_family == AF_INET) {
+ s->type = TLE_V4;
+ } else if (laddr->sa_family == AF_INET6) {
+ s->type = TLE_V6;
}
- return NULL;
-}
-
-static int
-stream_fill_dev(struct tle_ctx *ctx, struct tle_stream *s,
- const struct sockaddr *addr)
-{
- struct tle_dev *dev;
- struct tle_pbm *pbm;
- const struct sockaddr_in *lin4;
- const struct sockaddr_in6 *lin6;
- uint32_t i, p, sp, t;
-
- if (addr->sa_family == AF_INET) {
- lin4 = (const struct sockaddr_in *)addr;
- t = TLE_V4;
- p = lin4->sin_port;
- } else if (addr->sa_family == AF_INET6) {
- lin6 = (const struct sockaddr_in6 *)addr;
- t = TLE_V6;
- p = lin6->sin6_port;
- } else
- return EINVAL;
-
+ uint16_t p = ((const struct sockaddr_in *)laddr)->sin_port;
p = ntohs(p);
-
- /* if local address is not wildcard, find device it belongs to. */
- if (t == TLE_V4 && lin4->sin_addr.s_addr != INADDR_ANY) {
- dev = find_ipv4_dev(ctx, &lin4->sin_addr);
- if (dev == NULL)
- return ENODEV;
- } else if (t == TLE_V6 && memcmp(&tle_ipv6_any, &lin6->sin6_addr,
- sizeof(tle_ipv6_any)) != 0) {
- dev = find_ipv6_dev(ctx, &lin6->sin6_addr);
- if (dev == NULL)
- return ENODEV;
- } else
- dev = NULL;
-
- if (dev != NULL)
- pbm = &dev->dp[t]->use;
- else
- pbm = &ctx->use[t];
-
+ struct tle_psm *psm = &ctx->use[s->type];
/* try to acquire local port number. */
+ rte_spinlock_lock(&ctx->dev_lock);
if (p == 0) {
- p = tle_pbm_find_range(pbm, pbm->blk, LPORT_END_BLK);
- if (p == 0 && pbm->blk > LPORT_START_BLK)
- p = tle_pbm_find_range(pbm, LPORT_START_BLK, pbm->blk);
- } else if (tle_pbm_check(pbm, p) != 0)
- return EEXIST;
-
- if (p == 0)
- return ENFILE;
-
- /* fill socket's dst port and type */
-
- sp = htons(p);
- s->type = t;
- s->port.dst = sp;
-
- /* mark port as in-use */
-
- tle_pbm_set(&ctx->use[t], p);
- if (dev != NULL) {
- tle_pbm_set(pbm, p);
- dev->dp[t]->streams[sp] = s;
- } else {
- for (i = 0; i != RTE_DIM(ctx->dev); i++) {
- if (ctx->dev[i].dp[t] != NULL) {
- tle_pbm_set(&ctx->dev[i].dp[t]->use, p);
- ctx->dev[i].dp[t]->streams[sp] = s;
- }
+ if (s->type == TLE_V6 && is_empty_addr(laddr) && !s->option.ipv6only)
+ p = tle_psm_alloc_dual_port(&ctx->use[TLE_V4], psm);
+ else
+ p = tle_psm_alloc_port(psm);
+ if (p == 0) {
+ rte_spinlock_unlock(&ctx->dev_lock);
+ return ENFILE;
}
+ rte_memcpy(&addr, laddr, sizeof(struct sockaddr_storage));
+ ((struct sockaddr_in *)&addr)->sin_port = htons(p);
+ laddr = (const struct sockaddr*)&addr;
}
- return 0;
-}
+ if (tle_psm_set(psm, p, s->option.reuseport) != 0) {
+ rte_spinlock_unlock(&ctx->dev_lock);
+ return EADDRINUSE;
+ }
-static int
-stream_clear_dev(struct tle_ctx *ctx, const struct tle_stream *s)
-{
- struct tle_dev *dev;
- uint32_t i, p, sp, t;
-
- t = s->type;
- sp = s->port.dst;
- p = ntohs(sp);
-
- /* if local address is not wildcard, find device it belongs to. */
- if (t == TLE_V4 && s->ipv4.addr.dst != INADDR_ANY) {
- dev = find_ipv4_dev(ctx,
- (const struct in_addr *)&s->ipv4.addr.dst);
- if (dev == NULL)
- return ENODEV;
- } else if (t == TLE_V6 && memcmp(&tle_ipv6_any, &s->ipv6.addr.dst,
- sizeof(tle_ipv6_any)) != 0) {
- dev = find_ipv6_dev(ctx,
- (const struct in6_addr *)&s->ipv6.addr.dst);
- if (dev == NULL)
- return ENODEV;
- } else
- dev = NULL;
-
- tle_pbm_clear(&ctx->use[t], p);
- if (dev != NULL) {
- if (dev->dp[t]->streams[sp] == s) {
- tle_pbm_clear(&dev->dp[t]->use, p);
- dev->dp[t]->streams[sp] = NULL;
- }
- } else {
- for (i = 0; i != RTE_DIM(ctx->dev); i++) {
- if (ctx->dev[i].dp[t] != NULL &&
- ctx->dev[i].dp[t]->streams[sp] == s) {
- tle_pbm_clear(&ctx->dev[i].dp[t]->use, p);
- ctx->dev[i].dp[t]->streams[sp] = NULL;
+ if (is_empty_addr(laddr)) {
+ if (s->type == TLE_V6 && !s->option.ipv6only) {
+ rc = tle_psm_set(&ctx->use[TLE_V4], p, s->option.reuseport);
+ if (rc != 0) {
+ tle_psm_clear(psm, p);
+ rte_spinlock_unlock(&ctx->dev_lock);
+ return EADDRINUSE;
}
}
}
- return 0;
-}
-
-static void
-fill_ipv4_am(const struct sockaddr_in *in, uint32_t *addr, uint32_t *mask)
-{
- *addr = in->sin_addr.s_addr;
- *mask = (*addr == INADDR_ANY) ? INADDR_ANY : INADDR_NONE;
-}
+ if (is_empty_addr(raddr))
+ rc = bhash_add_entry(ctx, laddr, s);
-static void
-fill_ipv6_am(const struct sockaddr_in6 *in, rte_xmm_t *addr, rte_xmm_t *mask)
-{
- const struct in6_addr *pm;
-
- memcpy(addr, &in->sin6_addr, sizeof(*addr));
- if (memcmp(&tle_ipv6_any, addr, sizeof(*addr)) == 0)
- pm = &tle_ipv6_any;
- else
- pm = &tle_ipv6_none;
-
- memcpy(mask, pm, sizeof(*mask));
-}
+ if (rc) {
+ tle_psm_clear(psm, p);
+ }
-int
-stream_fill_ctx(struct tle_ctx *ctx, struct tle_stream *s,
- const struct sockaddr *laddr, const struct sockaddr *raddr)
-{
- const struct sockaddr_in *rin;
- int32_t rc;
+ rte_spinlock_unlock(&ctx->dev_lock);
+ /* fill socket's dst (src actually) port */
+ s->port.dst = htons(p);
- /* setup ports and port mask fields (except dst port). */
- rin = (const struct sockaddr_in *)raddr;
- s->port.src = rin->sin_port;
- s->pmsk.src = (s->port.src == 0) ? 0 : UINT16_MAX;
- s->pmsk.dst = UINT16_MAX;
+ if (rc)
+ return rc;
- /* setup src and dst addresses. */
+ /* setup src, dst addresses, and src port. */
if (laddr->sa_family == AF_INET) {
fill_ipv4_am((const struct sockaddr_in *)laddr,
&s->ipv4.addr.dst, &s->ipv4.mask.dst);
fill_ipv4_am((const struct sockaddr_in *)raddr,
&s->ipv4.addr.src, &s->ipv4.mask.src);
+ s->port.src = ((const struct sockaddr_in *)raddr)->sin_port;
} else if (laddr->sa_family == AF_INET6) {
fill_ipv6_am((const struct sockaddr_in6 *)laddr,
&s->ipv6.addr.dst, &s->ipv6.mask.dst);
fill_ipv6_am((const struct sockaddr_in6 *)raddr,
&s->ipv6.addr.src, &s->ipv6.mask.src);
+ s->port.src = ((const struct sockaddr_in6 *)raddr)->sin6_port;
}
- rte_spinlock_lock(&ctx->dev_lock);
- rc = stream_fill_dev(ctx, s, laddr);
- rte_spinlock_unlock(&ctx->dev_lock);
+ /* setup port mask fields. */
+ s->pmsk.src = (s->port.src == 0) ? 0 : UINT16_MAX;
+ s->pmsk.dst = UINT16_MAX;
return rc;
}
@@ -522,11 +375,41 @@ stream_fill_ctx(struct tle_ctx *ctx, struct tle_stream *s,
int
stream_clear_ctx(struct tle_ctx *ctx, struct tle_stream *s)
{
- int32_t rc;
+ bool is_any = false;
+ struct sockaddr_storage addr;
+ struct sockaddr_in *addr4;
+ struct sockaddr_in6 *addr6;
+
+ if (s->type == TLE_V4) {
+ if (s->ipv4.addr.src == INADDR_ANY) {
+ is_any = true;
+ addr4 = (struct sockaddr_in *)&addr;
+ addr4->sin_addr.s_addr = s->ipv4.addr.dst;
+ addr4->sin_port = s->port.dst;
+ addr.ss_family = AF_INET;
+ bhash_del_entry(ctx, s, (struct sockaddr*)&addr);
+ }
+ } else {
+ if (IN6_IS_ADDR_UNSPECIFIED(&s->ipv6.addr.src)) {
+ is_any = true;
+ addr6 = (struct sockaddr_in6 *)&addr;
+ memcpy(&addr6->sin6_addr, &s->ipv6.addr.dst,
+ sizeof(tle_ipv6_any));
+ addr6->sin6_port = s->port.dst;
+ addr.ss_family = AF_INET6;
+ bhash_del_entry(ctx, s, (struct sockaddr*)&addr);
+ }
+ }
rte_spinlock_lock(&ctx->dev_lock);
- rc = stream_clear_dev(ctx, s);
+ /* strange behaviour to match linux stack */
+ if (is_any) {
+ if (s->type == TLE_V6 && !s->option.ipv6only)
+ tle_psm_clear(&ctx->use[TLE_V4], ntohs(s->port.dst));
+ }
+
+ tle_psm_clear(&ctx->use[s->type], ntohs(s->port.dst));
rte_spinlock_unlock(&ctx->dev_lock);
- return rc;
+ return 0;
}
diff --git a/lib/libtle_l4p/ctx.h b/lib/libtle_l4p/ctx.h
index f18060b..9483976 100644
--- a/lib/libtle_l4p/ctx.h
+++ b/lib/libtle_l4p/ctx.h
@@ -21,7 +21,7 @@
#include <tle_dring.h>
#include <tle_ctx.h>
-#include "port_bitmap.h"
+#include "port_statmap.h"
#include "osdep.h"
#include "net_misc.h"
@@ -29,11 +29,6 @@
extern "C" {
#endif
-struct tle_dport {
- struct tle_pbm use; /* ports in use. */
- struct tle_stream *streams[MAX_PORT_NUM]; /* port to stream. */
-};
-
struct tle_dev {
struct tle_ctx *ctx;
struct {
@@ -45,7 +40,6 @@ struct tle_dev {
struct tle_dring dr;
} tx;
struct tle_dev_param prm; /* copy of device parameters. */
- struct tle_dport *dp[TLE_VNUM]; /* device L4 ports */
};
struct tle_ctx {
@@ -54,18 +48,23 @@ struct tle_ctx {
struct {
rte_spinlock_t lock;
uint32_t nb_free; /* number of free streams. */
+ uint32_t nb_cur; /* number of allocated streams. */
STAILQ_HEAD(, tle_stream) free;
void *buf; /* space allocated for streams */
} streams;
- rte_spinlock_t dev_lock;
+ rte_spinlock_t bhash_lock[TLE_VNUM];
+ struct rte_hash *bhash[TLE_VNUM]; /* bind and listen hash table */
+
uint32_t nb_dev;
- struct tle_pbm use[TLE_VNUM]; /* all ports in use. */
+ rte_spinlock_t dev_lock;
+ struct tle_psm use[TLE_VNUM]; /* all ports in use. */
struct tle_dev dev[RTE_MAX_ETHPORTS];
};
struct stream_ops {
int (*init_streams)(struct tle_ctx *);
+ uint32_t (*more_streams)(struct tle_ctx *);
void (*fini_streams)(struct tle_ctx *);
void (*free_drbs)(struct tle_stream *, struct tle_drb *[], uint32_t);
};
@@ -77,6 +76,27 @@ int stream_fill_ctx(struct tle_ctx *ctx, struct tle_stream *s,
int stream_clear_ctx(struct tle_ctx *ctx, struct tle_stream *s);
+/*
+ * Store the IPv4 address of in into *addr and the matching mask into
+ * *mask: INADDR_ANY for a wildcard address, INADDR_NONE (all ones)
+ * for any other address.
+ */
+static inline void
+fill_ipv4_am(const struct sockaddr_in *in, uint32_t *addr, uint32_t *mask)
+{
+	const uint32_t ip = in->sin_addr.s_addr;
+
+	*addr = ip;
+	if (ip == INADDR_ANY)
+		*mask = INADDR_ANY;
+	else
+		*mask = INADDR_NONE;
+}
+
+/*
+ * Copy the IPv6 address of in into *addr and the matching mask into
+ * *mask: tle_ipv6_any for the unspecified address, tle_ipv6_none for
+ * any other address.
+ */
+static inline void
+fill_ipv6_am(const struct sockaddr_in6 *in, rte_xmm_t *addr, rte_xmm_t *mask)
+{
+	memcpy(addr, &in->sin6_addr, sizeof(*addr));
+
+	if (IN6_IS_ADDR_UNSPECIFIED(addr))
+		memcpy(mask, &tle_ipv6_any, sizeof(*mask));
+	else
+		memcpy(mask, &tle_ipv6_none, sizeof(*mask));
+}
+
#ifdef __cplusplus
}
#endif
diff --git a/lib/libtle_l4p/misc.h b/lib/libtle_l4p/misc.h
index 327296f..d39e5a1 100644
--- a/lib/libtle_l4p/misc.h
+++ b/lib/libtle_l4p/misc.h
@@ -16,12 +16,34 @@
#ifndef _MISC_H_
#define _MISC_H_
+#include <tle_stats.h>
#include <tle_dpdk_wrapper.h>
#ifdef __cplusplus
extern "C" {
#endif
+/*
+ * Stream type and TCP flags of a packet, packed into 16 bits:
+ * raw views the same storage as the type and flags bytes.
+ */
+union typflg {
+ uint16_t raw;
+ struct {
+ uint8_t type; /* TLE_V4/TLE_V6 */
+ uint8_t flags; /* TCP header flags */
+ };
+};
+
+/*
+ * Per-packet information: raw is an rte_xmm_t view over the same
+ * storage as the named fields. IPv4 addresses are stored inline,
+ * IPv6 addresses are referenced through a pointer.
+ */
+union pkt_info {
+ rte_xmm_t raw;
+ struct {
+ union typflg tf;
+ uint16_t csf; /* checksum flags */
+ union l4_ports port;
+ union {
+ union ipv4_addrs addr4;
+ const union ipv6_addrs *addr6;
+ };
+ };
+};
+
static inline int
xmm_cmp(const rte_xmm_t *da, const rte_xmm_t *sa)
{
@@ -286,43 +308,41 @@ _ipv4x_cksum(const void *iph, size_t len)
return (cksum == 0xffff) ? cksum : ~cksum;
}
-/*
- * helper function to check csum.
- */
static inline int
-check_pkt_csum(const struct rte_mbuf *m, uint64_t ol_flags, uint32_t type,
- uint32_t proto)
+check_pkt_csum(const struct rte_mbuf *m, uint32_t type, uint32_t proto)
{
const struct ipv4_hdr *l3h4;
const struct ipv6_hdr *l3h6;
const struct udp_hdr *l4h;
- uint64_t fl3, fl4;
- uint16_t csum;
int32_t ret;
-
- fl4 = ol_flags & PKT_RX_L4_CKSUM_MASK;
- fl3 = (type == TLE_V4) ?
- (ol_flags & PKT_RX_IP_CKSUM_MASK) : PKT_RX_IP_CKSUM_GOOD;
+ uint16_t csum;
+ uint64_t ol_flags = m->ol_flags;
/* case 0: both ip and l4 cksum is verified or data is valid */
- if ((fl3 | fl4) == (PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD))
+ if ((ol_flags & PKT_RX_IP_CKSUM_GOOD) &&
+ (ol_flags & PKT_RX_L4_CKSUM_GOOD))
return 0;
/* case 1: either ip or l4 cksum bad */
- if (fl3 == PKT_RX_IP_CKSUM_BAD || fl4 == PKT_RX_L4_CKSUM_BAD)
+ if ((ol_flags & PKT_RX_IP_CKSUM_MASK) == PKT_RX_IP_CKSUM_BAD)
+ return 1;
+
+ if ((ol_flags & PKT_RX_L4_CKSUM_MASK) == PKT_RX_L4_CKSUM_BAD)
return 1;
/* case 2: either ip or l4 or both cksum is unknown */
+ ret = 0;
l3h4 = rte_pktmbuf_mtod_offset(m, const struct ipv4_hdr *, m->l2_len);
l3h6 = rte_pktmbuf_mtod_offset(m, const struct ipv6_hdr *, m->l2_len);
- ret = 0;
- if (fl3 == PKT_RX_IP_CKSUM_UNKNOWN && l3h4->hdr_checksum != 0) {
+ if ((ol_flags & PKT_RX_IP_CKSUM_MASK) == PKT_RX_IP_CKSUM_UNKNOWN &&
+ l3h4->hdr_checksum != 0) {
csum = _ipv4x_cksum(l3h4, m->l3_len);
ret = (csum != UINT16_MAX);
}
- if (ret == 0 && fl4 == PKT_RX_L4_CKSUM_UNKNOWN) {
+ if (ret == 0 && (ol_flags & PKT_RX_L4_CKSUM_MASK) ==
+ PKT_RX_L4_CKSUM_UNKNOWN) {
/*
* for IPv4 it is allowed to have zero UDP cksum,
@@ -376,8 +396,20 @@ rwl_acquire(rte_atomic32_t *p)
static inline void
rwl_down(rte_atomic32_t *p)
{
- while (rte_atomic32_cmpset((volatile uint32_t *)p, 0, INT32_MIN) == 0)
+ while (rte_atomic32_cmpset((volatile uint32_t *)p, 0, INT32_MIN) == 0)
+ rte_pause();
+}
+
+static inline int
+rwl_try_down(rte_atomic32_t *p)
+{
+ while (rte_atomic32_cmpset((volatile uint32_t *)p, 0, INT32_MIN) == 0) {
+ /* Already down */
+ if (rte_atomic32_read(p) == INT32_MIN)
+ return -1;
rte_pause();
+ }
+ return 0;
}
static inline void
diff --git a/lib/libtle_l4p/net_misc.h b/lib/libtle_l4p/net_misc.h
index 2d8dac2..c1d946b 100644
--- a/lib/libtle_l4p/net_misc.h
+++ b/lib/libtle_l4p/net_misc.h
@@ -16,6 +16,7 @@
#ifndef _NET_MISC_H_
#define _NET_MISC_H_
+#include <stdbool.h>
#include <rte_ip.h>
#include <rte_udp.h>
#include "osdep.h"
@@ -71,6 +72,26 @@ union ip_addrs {
union ipv6_addrs v6;
};
+/*
+ * Tell whether addr carries the wildcard address of its family:
+ * INADDR_ANY for AF_INET, the unspecified address for AF_INET6.
+ * Any other family yields false.
+ */
+static inline bool
+is_empty_addr(const struct sockaddr *addr)
+{
+	switch (addr->sa_family) {
+	case AF_INET:
+		return ((const struct sockaddr_in *)addr)->sin_addr.s_addr ==
+			INADDR_ANY;
+	case AF_INET6:
+		return IN6_IS_ADDR_UNSPECIFIED(
+			&((const struct sockaddr_in6 *)addr)->sin6_addr) != 0;
+	default:
+		return false;
+	}
+}
+
#ifdef __cplusplus
}
#endif
diff --git a/lib/libtle_l4p/port_statmap.h b/lib/libtle_l4p/port_statmap.h
new file mode 100644
index 0000000..8bbb0ba
--- /dev/null
+++ b/lib/libtle_l4p/port_statmap.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2019 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _PORT_STATMAP_H_
+#define _PORT_STATMAP_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_PORT_NUM (UINT16_MAX + 1)
+#define ALLOC_PORT_START 0x8000
+
+struct tle_psm {
+ uint32_t nb_used; /* Number of ports already in use. */
+ uint32_t next_alloc; /* Next port to try allocate. */
+ uint8_t stat[MAX_PORT_NUM]; /* Status of the port:
+ * 1) the most significant bit indicates
+ * if SO_REUSEPORT is allowed;
+ * 2) lowest 7 bits indicate # of streams
+ * using the port.
+ */
+};
+
+static inline void
+tle_psm_init(struct tle_psm *psm)
+{
+ memset(psm, 0, sizeof(struct tle_psm));
+ psm->next_alloc = ALLOC_PORT_START;
+}
+
+/* Register one more user of port: 0 on success, -1 if it cannot be shared. */
+static inline int
+tle_psm_set(struct tle_psm *psm, uint16_t port, uint8_t reuseport)
+{
+	if (psm->stat[port] == 0) {
+		/* port has not been used: first user decides the reuseport bit */
+		psm->stat[port] = reuseport ? 0x81 : 0x01;
+		return 0;
+	}
+
+	/* port is in use: share it only if all users set reuseport and the
+	 * 7-bit user count is not saturated (0xff + 1 would wrap to "free")
+	 */
+	if (!reuseport || (psm->stat[port] & 0x80) == 0 ||
+			(psm->stat[port] & 0x7f) == 0x7f)
+		return -1;
+	psm->stat[port]++;
+	return 0;
+}
+
+/* Drop one user of the port. The last release also clears the reuseport
+ * bit; clearing a port that is not in use is a no-op (no 0 -> 0xff wrap). */
+static inline void
+tle_psm_clear(struct tle_psm *psm, uint16_t port)
+{
+	uint8_t users = psm->stat[port] & 0x7f;
+	psm->stat[port] = (users <= 1) ? 0 : (uint8_t)(psm->stat[port] - 1);
+}
+
+static inline uint8_t
+tle_psm_check(const struct tle_psm *psm, uint16_t port)
+{
+ return psm->stat[port];
+}
+
+static inline uint16_t
+tle_psm_alloc_port(struct tle_psm *psm)
+{
+ uint32_t i = psm->next_alloc;
+
+ for (; i < MAX_PORT_NUM; i++) {
+ if (psm->stat[i] == 0) {
+ psm->next_alloc = i + 1;
+ return (uint16_t)i;
+ }
+ }
+
+ for (i = ALLOC_PORT_START; i < psm->next_alloc; i++) {
+ if (psm->stat[i] == 0) {
+ psm->next_alloc = i + 1;
+ return (uint16_t)i;
+ }
+ }
+
+ return 0;
+}
+
+static inline uint16_t
+tle_psm_alloc_dual_port(struct tle_psm *psm4, struct tle_psm *psm6)
+{
+ uint32_t i = psm6->next_alloc;
+
+ for (; i < MAX_PORT_NUM; i++) {
+ if (psm6->stat[i] == 0 && psm4->stat[i] == 0) {
+ psm6->next_alloc = i + 1;
+ return (uint16_t)i;
+ }
+ }
+
+ for (i = ALLOC_PORT_START; i < psm6->next_alloc; i++) {
+ if (psm6->stat[i] == 0 && psm4->stat[i] == 0) {
+ psm6->next_alloc = i + 1;
+ return (uint16_t)i;
+ }
+ }
+
+ return 0;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _PORT_STATMAP_H_ */
diff --git a/lib/libtle_l4p/stream.h b/lib/libtle_l4p/stream.h
index 49a2809..9f2bbc1 100644
--- a/lib/libtle_l4p/stream.h
+++ b/lib/libtle_l4p/stream.h
@@ -31,7 +31,11 @@ struct tle_stream {
STAILQ_ENTRY(tle_stream) link;
struct tle_ctx *ctx;
- uint8_t type; /* TLE_V4 or TLE_V6 */
+ tle_stream_options_t option;
+ unsigned long timestamp;
+ uint16_t reuseport_seed;
+ uint8_t type; /* TLE_V4 or TLE_V6 */
+ uint8_t padding;
/* Stream address information. */
union l4_ports port;
@@ -53,15 +57,25 @@ static inline uint32_t
get_streams(struct tle_ctx *ctx, struct tle_stream *s[], uint32_t num)
{
struct tle_stream *p;
- uint32_t i, n;
+ uint32_t i, n, inc;
rte_spinlock_lock(&ctx->streams.lock);
- n = RTE_MIN(ctx->streams.nb_free, num);
- for (i = 0, p = STAILQ_FIRST(&ctx->streams.free);
- i != n;
- i++, p = STAILQ_NEXT(p, link))
+ n = ctx->streams.nb_free;
+ if (n < num) {
+ inc = tle_stream_ops[ctx->prm.proto].more_streams(ctx);
+ ctx->streams.nb_free += inc;
+ ctx->streams.nb_cur += inc;
+ n = ctx->streams.nb_free;
+ }
+ n = RTE_MIN(n, num);
+
+ for (i = 0, p = STAILQ_FIRST(&ctx->streams.free); i != n; ) {
s[i] = p;
+ p = STAILQ_NEXT(p, link);
+ s[i]->link.stqe_next = NULL;
+ i++;
+ }
if (p == NULL)
/* we retrieved all free entries */
@@ -80,9 +94,6 @@ get_stream(struct tle_ctx *ctx)
struct tle_stream *s;
s = NULL;
- if (ctx->streams.nb_free == 0)
- return s;
-
get_streams(ctx, &s, 1);
return s;
}
@@ -120,8 +131,8 @@ drb_nb_elem(const struct tle_ctx *ctx)
}
static inline int32_t
-stream_get_dest(struct tle_stream *s, const void *dst_addr,
- struct tle_dest *dst)
+stream_get_dest(uint8_t type, struct tle_stream *s, const void *src_addr,
+ const void *dst_addr, struct tle_dest *dst)
{
int32_t rc;
const struct in_addr *d4;
@@ -133,12 +144,13 @@ stream_get_dest(struct tle_stream *s, const void *dst_addr,
/* it is here just to keep gcc happy. */
d4 = NULL;
+ /* it is here just to keep gcc happy. */
d6 = NULL;
- if (s->type == TLE_V4) {
+ if (type == TLE_V4) {
d4 = dst_addr;
rc = ctx->prm.lookup4(ctx->prm.lookup4_data, d4, dst);
- } else if (s->type == TLE_V6) {
+ } else if (type == TLE_V6) {
d6 = dst_addr;
rc = ctx->prm.lookup6(ctx->prm.lookup6_data, d6, dst);
} else
@@ -148,18 +160,25 @@ stream_get_dest(struct tle_stream *s, const void *dst_addr,
return -ENOENT;
dev = dst->dev;
- dst->ol_flags = dev->tx.ol_flags[s->type];
+ dst->ol_flags = dev->tx.ol_flags[type];
- if (s->type == TLE_V4) {
+ if (type == TLE_V4) {
struct ipv4_hdr *l3h;
l3h = (struct ipv4_hdr *)(dst->hdr + dst->l2_len);
- l3h->src_addr = dev->prm.local_addr4.s_addr;
+ if (((const struct in_addr*)src_addr)->s_addr != INADDR_ANY)
+ l3h->src_addr = ((const struct in_addr*)src_addr)->s_addr;
+ else
+ l3h->src_addr = dev->prm.local_addr4.s_addr;
l3h->dst_addr = d4->s_addr;
} else {
struct ipv6_hdr *l3h;
l3h = (struct ipv6_hdr *)(dst->hdr + dst->l2_len);
- rte_memcpy(l3h->src_addr, &dev->prm.local_addr6,
- sizeof(l3h->src_addr));
+ if (!IN6_IS_ADDR_UNSPECIFIED(src_addr))
+ rte_memcpy(l3h->src_addr, src_addr,
+ sizeof(l3h->src_addr));
+ else
+ rte_memcpy(l3h->src_addr, &dev->prm.local_addr6,
+ sizeof(l3h->src_addr));
rte_memcpy(l3h->dst_addr, d6, sizeof(l3h->dst_addr));
}
diff --git a/lib/libtle_l4p/stream_table.c b/lib/libtle_l4p/stream_table.c
index 5a89553..e029306 100644
--- a/lib/libtle_l4p/stream_table.c
+++ b/lib/libtle_l4p/stream_table.c
@@ -13,68 +13,47 @@
* limitations under the License.
*/
#include <string.h>
-#include <rte_malloc.h>
#include <rte_errno.h>
#include "stream_table.h"
void
-stbl_fini(struct stbl *st)
+bhash_fini(struct tle_ctx *ctx)
{
uint32_t i;
- for (i = 0; i != RTE_DIM(st->ht); i++) {
- rte_hash_free(st->ht[i].t);
- rte_free(st->ht[i].ent);
- }
-
- memset(st, 0, sizeof(*st));
+ for (i = 0; i != RTE_DIM(ctx->bhash); i++)
+ rte_hash_free(ctx->bhash[i]);
}
int
-stbl_init(struct stbl *st, uint32_t num, int32_t socket)
+bhash_init(struct tle_ctx *ctx)
{
- int32_t rc;
- size_t i, sz;
- struct rte_hash_parameters hprm;
+ int rc = 0;
+ struct rte_hash_parameters hprm = {0};
+ bool ipv6 = ctx->prm.lookup6 != NULL;
char buf[RTE_HASH_NAMESIZE];
- num = RTE_MAX(5 * num / 4, 0x10U);
-
- memset(&hprm, 0, sizeof(hprm));
hprm.name = buf;
- hprm.entries = num;
- hprm.socket_id = socket;
-
- rc = 0;
-
- snprintf(buf, sizeof(buf), "stbl4@%p", st);
- hprm.key_len = sizeof(struct stbl4_key);
- st->ht[TLE_V4].t = rte_hash_create(&hprm);
- if (st->ht[TLE_V4].t == NULL)
+ hprm.entries = 4096;
+ hprm.extra_flag = RTE_HASH_EXTRA_FLAGS_EXT_TABLE;
+ hprm.socket_id = ctx->prm.socket_id;
+
+ snprintf(buf, sizeof(buf), "bhash4@%p", ctx);
+ hprm.key_len = sizeof(struct bhash4_key);
+ ctx->bhash[TLE_V4] = rte_hash_create(&hprm);
+ if (ctx->bhash[TLE_V4] == NULL)
rc = (rte_errno != 0) ? -rte_errno : -ENOMEM;
- if (rc == 0) {
- snprintf(buf, sizeof(buf), "stbl6@%p", st);
- hprm.key_len = sizeof(struct stbl6_key);
- st->ht[TLE_V6].t = rte_hash_create(&hprm);
- if (st->ht[TLE_V6].t == NULL)
+ if (rc == 0 && ipv6) {
+ snprintf(buf, sizeof(buf), "bhash6@%p", ctx);
+ hprm.key_len = sizeof(struct bhash6_key);
+ ctx->bhash[TLE_V6] = rte_hash_create(&hprm);
+ if (ctx->bhash[TLE_V6] == NULL) {
+ rte_hash_free(ctx->bhash[TLE_V4]);
rc = (rte_errno != 0) ? -rte_errno : -ENOMEM;
+ }
}
- for (i = 0; i != RTE_DIM(st->ht) && rc == 0; i++) {
-
- sz = sizeof(*st->ht[i].ent) * num;
- st->ht[i].ent = rte_zmalloc_socket(NULL, sz,
- RTE_CACHE_LINE_SIZE, socket);
- if (st->ht[i].ent == NULL)
- rc = -ENOMEM;
- else
- st->ht[i].nb_ent = num;
- }
-
- if (rc != 0)
- stbl_fini(st);
-
return rc;
}
diff --git a/lib/libtle_l4p/stream_table.h b/lib/libtle_l4p/stream_table.h
index 033c306..ba8d165 100644
--- a/lib/libtle_l4p/stream_table.h
+++ b/lib/libtle_l4p/stream_table.h
@@ -16,199 +16,415 @@
#ifndef _STREAM_TABLE_H_
#define _STREAM_TABLE_H_
+#include <string.h>
#include <rte_hash.h>
-#include "tcp_misc.h"
+#include "stream.h"
+#include "misc.h"
#ifdef __cplusplus
extern "C" {
#endif
+#define HASH_SIZE_32K 32771
+#define HASH_SIZE_64K 65537
+#define HASH_SIZE_128K 131071
+
+#define HASH_SIZE HASH_SIZE_64K
+
struct stbl_entry {
void *data;
};
-struct shtbl {
- uint32_t nb_ent; /* max number of entries in the table. */
- rte_spinlock_t l; /* lock to protect the hash table */
- struct rte_hash *t;
- struct stbl_entry *ent;
+struct stbl {
+ rte_spinlock_t l;
+ uint32_t need_lock;
+ struct stbl_entry head[HASH_SIZE];
} __rte_cache_aligned;
-struct stbl {
- struct shtbl ht[TLE_VNUM];
-};
+static inline int
+stbl_init(struct stbl *st, uint32_t lock)
+{
+ st->need_lock = lock;
+ return 0;
+}
-struct stbl4_key {
- union l4_ports port;
- union ipv4_addrs addr;
-} __attribute__((__packed__));
+static inline int
+stbl_fini(struct stbl *st)
+{
+ st->need_lock = 0;
+ return 0;
+}
-struct stbl6_key {
- union l4_ports port;
- union ipv6_addrs addr;
-} __attribute__((__packed__));
+static inline uint8_t
+compare_pkt(const struct tle_stream *s, const union pkt_info *pi)
+{
+ if (s->type != pi->tf.type)
+ return -1;
-struct stbl_key {
- union l4_ports port;
- union {
- union ipv4_addrs addr4;
- union ipv6_addrs addr6;
- };
-} __attribute__((__packed__));
+ if (s->port.raw != pi->port.raw)
+ return -1;
-extern void stbl_fini(struct stbl *st);
+ if (s->type == TLE_V4) {
+ if (s->ipv4.addr.raw != pi->addr4.raw)
+ return -1;
+ } else {
+ if (memcmp(&s->ipv6.addr, pi->addr6, sizeof(union ipv6_addrs)))
+ return -1;
+ }
-extern int stbl_init(struct stbl *st, uint32_t num, int32_t socket);
+ return 0;
+}
-static inline void
-stbl_pkt_fill_key(struct stbl_key *k, const union pkt_info *pi, uint32_t type)
+static inline uint32_t
+stbl_hash_stream(const struct tle_stream *s)
{
- static const struct stbl_key zero = {
- .port.raw = 0,
- };
-
- k->port = pi->port;
- if (type == TLE_V4)
- k->addr4 = pi->addr4;
- else if (type == TLE_V6)
- k->addr6 = *pi->addr6;
- else
- *k = zero;
+ int i;
+ unsigned int hash;
+
+ if (s->type == TLE_V4) {
+ hash = s->ipv4.addr.src ^ s->ipv4.addr.dst
+ ^ s->port.src ^ s->port.dst;
+ } else {
+ hash = s->port.src ^ s->port.dst;
+ for (i = 0; i < 4; i++) {
+ hash ^= s->ipv6.addr.src.u32[i];
+ hash ^= s->ipv6.addr.dst.u32[i];
+ }
+ }
+
+ return hash % HASH_SIZE;
}
-static inline void
-stbl_lock(struct stbl *st, uint32_t type)
+static inline uint32_t
+stbl_hash_pkt(const union pkt_info* pi)
{
- rte_spinlock_lock(&st->ht[type].l);
+ int i;
+ unsigned int hash;
+
+ if (pi->tf.type == TLE_V4) {
+ hash = pi->addr4.src ^ pi->addr4.dst ^ pi->port.src ^ pi->port.dst;
+ } else {
+ hash = pi->port.src ^ pi->port.dst;
+ for (i = 0; i < 4; i++) {
+ hash ^= pi->addr6->src.u32[i];
+ hash ^= pi->addr6->dst.u32[i];
+ }
+ }
+
+ return hash % HASH_SIZE;
}
-static inline void
-stbl_unlock(struct stbl *st, uint32_t type)
+static inline struct stbl_entry*
+stbl_add_stream(struct stbl *st, struct tle_stream *s)
{
- rte_spinlock_unlock(&st->ht[type].l);
+ struct stbl_entry* entry;
+
+ if (st->need_lock)
+ rte_spinlock_lock(&st->l);
+ entry = &st->head[stbl_hash_stream(s)];
+ s->link.stqe_next = (struct tle_stream*)entry->data;
+ entry->data = s;
+ if (st->need_lock)
+ rte_spinlock_unlock(&st->l);
+
+ return entry;
}
-static inline struct stbl_entry *
-stbl_add_entry(struct stbl *st, const union pkt_info *pi)
+static inline struct tle_stream *
+stbl_find_stream(struct stbl *st, const union pkt_info *pi)
{
- int32_t rc;
- uint32_t type;
- struct shtbl *ht;
- struct stbl_key k;
-
- type = pi->tf.type;
- stbl_pkt_fill_key(&k, pi, type);
- ht = st->ht + type;
-
- rc = rte_hash_add_key(ht->t, &k);
- if ((uint32_t)rc >= ht->nb_ent)
- return NULL;
- return ht->ent + rc;
+ struct tle_stream* head;
+
+ if (st->need_lock)
+ rte_spinlock_lock(&st->l);
+ head = (struct tle_stream*)st->head[stbl_hash_pkt(pi)].data;
+ while (head != NULL) {
+ if (compare_pkt(head, pi) == 0)
+ break;
+
+ head = head->link.stqe_next;
+ }
+ if (st->need_lock)
+ rte_spinlock_unlock(&st->l);
+ return head;
}
-static inline struct stbl_entry *
-stbl_add_stream(struct stbl *st, const union pkt_info *pi, const void *s)
+static inline void
+stbl_del_stream(struct stbl *st, struct stbl_entry *se,
+ struct tle_stream *s)
{
- struct stbl_entry *se;
+ struct tle_stream *prev, *current;
- se = stbl_add_entry(st, pi);
- if (se != NULL)
- se->data = (void *)(uintptr_t)s;
- return se;
+ if (st->need_lock)
+ rte_spinlock_lock(&st->l);
+ if (se == NULL)
+ se = &st->head[stbl_hash_stream(s)];
+ prev = NULL;
+ current = (struct tle_stream*)se->data;
+ while (current != NULL) {
+ if (current != s) {
+ prev = current;
+ current = current->link.stqe_next;
+ continue;
+ }
+
+ if (prev)
+ prev->link.stqe_next = current->link.stqe_next;
+ else
+ se->data = current->link.stqe_next;
+ break;
+ }
+ if (st->need_lock)
+ rte_spinlock_unlock(&st->l);
+
+ s->link.stqe_next = NULL;
}
-static inline struct stbl_entry *
-stbl_find_entry(struct stbl *st, const union pkt_info *pi)
+struct bhash4_key {
+ uint16_t port;
+ uint32_t addr;
+} __attribute__((__packed__));
+
+struct bhash6_key {
+ uint16_t port;
+ rte_xmm_t addr;
+} __attribute__((__packed__));
+
+struct bhash_key {
+ uint16_t port;
+ union {
+ uint32_t addr4;
+ rte_xmm_t addr6;
+ };
+} __attribute__((__packed__));
+
+void bhash_fini(struct tle_ctx *ctx);
+
+int bhash_init(struct tle_ctx *ctx);
+
+static inline int
+bhash_sockaddr2key(const struct sockaddr *addr, struct bhash_key *key)
{
- int32_t rc;
- uint32_t type;
- struct shtbl *ht;
- struct stbl_key k;
-
- type = pi->tf.type;
- stbl_pkt_fill_key(&k, pi, type);
- ht = st->ht + type;
-
- rc = rte_hash_lookup(ht->t, &k);
- if ((uint32_t)rc >= ht->nb_ent)
- return NULL;
- return ht->ent + rc;
+ int t;
+ const struct sockaddr_in *lin4;
+ const struct sockaddr_in6 *lin6;
+
+ if (addr->sa_family == AF_INET) {
+ lin4 = (const struct sockaddr_in *)addr;
+ key->port = lin4->sin_port;
+ key->addr4 = lin4->sin_addr.s_addr;
+ t = TLE_V4;
+ } else {
+ lin6 = (const struct sockaddr_in6 *)addr;
+ memcpy(&key->addr6, &lin6->sin6_addr, sizeof(key->addr6));
+ key->port = lin6->sin6_port;
+ t = TLE_V6;
+ }
+
+ return t;
}
-static inline void *
-stbl_find_data(struct stbl *st, const union pkt_info *pi)
+/* Return 0 on success;
+ * Return errno on failure.
+ */
+static inline int
+bhash_add_entry(struct tle_ctx *ctx, const struct sockaddr *addr,
+ struct tle_stream *s)
{
- struct stbl_entry *ent;
-
- ent = stbl_find_entry(st, pi);
- return (ent == NULL) ? NULL : ent->data;
+ int t;
+ int rc;
+ int is_first;
+ struct bhash_key key;
+ struct rte_hash *bhash;
+ struct tle_stream *old, *tmp;
+
+ is_first = 0;
+ t = bhash_sockaddr2key(addr, &key);
+
+ rte_spinlock_lock(&ctx->bhash_lock[t]);
+ bhash = ctx->bhash[t];
+ rc = rte_hash_lookup_data(bhash, &key, (void **)&old);
+ if (rc == -ENOENT) {
+ is_first = 1;
+ s->link.stqe_next = NULL; /* just to avoid follow */
+ rc = rte_hash_add_key_data(bhash, &key, s);
+ } else if (rc >= 0) {
+ if (t == TLE_V4 && old->type == TLE_V6) {
+ /* A V6 stream may listen on a V4 address; ensure the V4 stream
+ * is ahead of the V6 stream in the list
+ */
+ s->link.stqe_next = old;
+ rte_hash_add_key_data(bhash, &key, s);
+ } else {
+ tmp = old->link.stqe_next;
+ old->link.stqe_next = s;
+ s->link.stqe_next = tmp;
+ }
+ }
+ rte_spinlock_unlock(&ctx->bhash_lock[t]);
+
+ /* An IPv6 socket bound to the unspecified address can also receive
+ * IPv4 packets, so the stream must be recorded in the IPv4 table too.
+ * Only the first stream needs to be inserted into the V4 list; otherwise
+ * the V6 list is already following the V4 list.
+ */
+ if (t == TLE_V6 && !s->option.ipv6only && is_first &&
+ IN6_IS_ADDR_UNSPECIFIED(&key.addr6)) {
+ t = TLE_V4;
+ rte_spinlock_lock(&ctx->bhash_lock[t]);
+ bhash = ctx->bhash[t];
+ rc = rte_hash_lookup_data(bhash, &key, (void **)&old);
+ if (rc == -ENOENT)
+ rc = rte_hash_add_key_data(bhash, &key, s);
+ else if (rc >= 0) {
+ while(old->link.stqe_next != NULL)
+ old = old->link.stqe_next;
+ old->link.stqe_next = s;
+ s->link.stqe_next = NULL;
+ }
+ rte_spinlock_unlock(&ctx->bhash_lock[t]);
+ }
+
+ return (rc >= 0) ? 0 : (-rc);
}
-#include "tcp_stream.h"
-
static inline void
-stbl_stream_fill_key(struct stbl_key *k, const struct tle_stream *s,
- uint32_t type)
+bhash_del_entry(struct tle_ctx *ctx, struct tle_stream *s,
+ const struct sockaddr *addr)
{
- static const struct stbl_key zero = {
- .port.raw = 0,
- };
+ int t;
+ int rc;
+ struct bhash_key key;
+ struct tle_stream *f, *cur, *pre = NULL;
+
+ t = bhash_sockaddr2key(addr, &key);
+
+ rte_spinlock_lock(&ctx->bhash_lock[t]);
+ rc = rte_hash_lookup_data(ctx->bhash[t], &key, (void **)&f);
+ if (rc >= 0) {
+ cur = f;
+ pre = NULL;
+ while (cur != s) {
+ pre = cur;
+ cur = cur->link.stqe_next;
+ }
+
+ if (pre == NULL) {
+ cur = cur->link.stqe_next;
+ if (cur == NULL)
+ rte_hash_del_key(ctx->bhash[t], &key);
+ else /* change data */
+ rte_hash_add_key_data(ctx->bhash[t], &key, cur);
+ } else
+ pre->link.stqe_next = cur->link.stqe_next;
+ }
+
+ rte_spinlock_unlock(&ctx->bhash_lock[t]);
+
+ if (rc < 0)
+ return;
+
+ s->link.stqe_next = NULL;
+
+ /* An IPv6 socket bound to the unspecified address can also receive IPv4
+ * packets, so the stream may also be recorded in the IPv4 table: remove it */
+ if (t == TLE_V6 && !s->option.ipv6only && pre == NULL &&
+ IN6_IS_ADDR_UNSPECIFIED(&key.addr6)) {
+ t = TLE_V4;
+ rte_spinlock_lock(&ctx->bhash_lock[t]);
+ rc = rte_hash_lookup_data(ctx->bhash[t], &key, (void **)&f);
+ if (rc >= 0) {
+ cur = f;
+ pre = NULL;
+ while (cur != s) {
+ pre = cur;
+ cur = cur->link.stqe_next;
+ }
+
+ if (pre == NULL) {
+ cur = cur->link.stqe_next;
+ if (cur == NULL)
+ rte_hash_del_key(ctx->bhash[t], &key);
+ else /* change data */
+ rte_hash_add_key_data(ctx->bhash[t], &key, cur);
+ } else
+ pre->link.stqe_next = cur->link.stqe_next;
+ }
+
+ rte_spinlock_unlock(&ctx->bhash_lock[t]);
+ }
- k->port = s->port;
- if (type == TLE_V4)
- k->addr4 = s->ipv4.addr;
- else if (type == TLE_V6)
- k->addr6 = s->ipv6.addr;
- else
- *k = zero;
}
-static inline struct stbl_entry *
-stbl_add_stream_lock(struct stbl *st, const struct tle_tcp_stream *s)
+static inline void *
+bhash_reuseport_get_stream(struct tle_stream *s)
{
- uint32_t type;
- struct stbl_key k;
- struct stbl_entry *se;
- struct shtbl *ht;
- int32_t rc;
-
- type = s->s.type;
- stbl_stream_fill_key(&k, &s->s, type);
- ht = st->ht + type;
+ int n = 0;
+ struct tle_stream *e, *all[32];
+
+ e = s;
+ while(e && n < 32) {
+ all[n++] = e;
+ e = e->link.stqe_next;
+ }
+
+ /* for each connection, this function will be called twice:
+ * 1st time for the first step of the handshake: SYN
+ * 2nd time for the third step of the handshake: ACK
+ */
+ return all[(s->reuseport_seed++) % n];
+}
- stbl_lock(st, type);
- rc = rte_hash_add_key(ht->t, &k);
- stbl_unlock(st, type);
+static inline void *
+bhash_lookup4(struct rte_hash *t, uint32_t addr, uint16_t port, uint8_t reuse)
+{
+ int rc;
+ void *s = NULL;
+ struct bhash_key key = {
+ .port = port,
+ .addr4 = addr,
+ };
- if ((uint32_t)rc >= ht->nb_ent)
- return NULL;
+ rc = rte_hash_lookup_data(t, &key, &s);
+ if (rc == -ENOENT) {
+ key.addr4 = INADDR_ANY;
+ rc = rte_hash_lookup_data(t, &key, &s);
+ }
- se = ht->ent + rc;
- if (se != NULL)
- se->data = (void *)(uintptr_t)s;
+ if (rc >= 0) {
+ if (reuse)
+ return bhash_reuseport_get_stream(s);
+ else
+ return s;
+ }
- return se;
+ return NULL;
}
-static inline void
-stbl_del_stream(struct stbl *st, struct stbl_entry *se,
- const struct tle_tcp_stream *s, uint32_t lock)
+static inline void *
+bhash_lookup6(struct rte_hash *t, rte_xmm_t addr, uint16_t port, uint8_t reuse)
{
- uint32_t type;
- struct stbl_key k;
+ int rc;
+ void *s = NULL;
+ struct bhash_key key = {
+ .port = port,
+ .addr6 = addr,
+ };
- if (se == NULL)
- return;
+ rc = rte_hash_lookup_data(t, &key, &s);
+ if (rc == -ENOENT) {
+ memcpy(&key.addr6, &tle_ipv6_any, sizeof(key.addr6));
+ rc = rte_hash_lookup_data(t, &key, &s);
+ }
- se->data = NULL;
+ if (rc >= 0) {
+ if (reuse)
+ return bhash_reuseport_get_stream(s);
+ else
+ return s;
+ }
- type = s->s.type;
- stbl_stream_fill_key(&k, &s->s, type);
- if (lock != 0)
- stbl_lock(st, type);
- rte_hash_del_key(st->ht[type].t, &k);
- if (lock != 0)
- stbl_unlock(st, type);
+ return NULL;
}
#ifdef __cplusplus
diff --git a/lib/libtle_l4p/syncookie.h b/lib/libtle_l4p/syncookie.h
index 61bfce4..bf01e78 100644
--- a/lib/libtle_l4p/syncookie.h
+++ b/lib/libtle_l4p/syncookie.h
@@ -182,9 +182,12 @@ sync_fill_tcb(struct tcb *tcb, const union seg_info *si, const union tsopt *to)
{
uint32_t ack, mss, seq, wscale;
+ tcb->err = 0;
+
seq = si->seq;
tcb->rcv.nxt = seq;
+ tcb->rcv.cpy = seq;
tcb->rcv.irs = seq - 1;
tcb->snd.wu.wl1 = seq;
@@ -202,6 +205,7 @@ sync_fill_tcb(struct tcb *tcb, const union seg_info *si, const union tsopt *to)
tcb->so.mss = mss;
tcb->snd.ts = to->ecr;
+ tcb->snd.cork_ts = 0;
tcb->rcv.ts = to->val;
tcb->so.ts.raw = to->raw;
diff --git a/lib/libtle_l4p/tcp_ctl.h b/lib/libtle_l4p/tcp_ctl.h
index bec1e76..3196470 100644
--- a/lib/libtle_l4p/tcp_ctl.h
+++ b/lib/libtle_l4p/tcp_ctl.h
@@ -22,6 +22,7 @@
#include "tcp_stream.h"
#include "tcp_ofo.h"
+#include "tcp_timer.h"
#ifdef __cplusplus
extern "C" {
@@ -97,10 +98,10 @@ calc_rx_wnd(const struct tle_tcp_stream *s, uint32_t scale)
/* peer doesn't support WSCALE option, wnd size is limited to 64K */
if (scale == TCP_WSCALE_NONE) {
- wnd = _rte_ring_get_mask(s->rx.q) << TCP_WSCALE_DEFAULT;
+ wnd = rte_ring_free_count(s->rx.q) << TCP_WSCALE_DEFAULT;
return RTE_MIN(wnd, (uint32_t)UINT16_MAX);
} else
- return _rte_ring_get_mask(s->rx.q) << scale;
+ return rte_ring_free_count(s->rx.q) << scale;
}
/* empty stream's send queue */
@@ -144,31 +145,34 @@ static inline void
tcp_stream_reset(struct tle_ctx *ctx, struct tle_tcp_stream *s)
{
struct stbl *st;
- uint16_t uop;
+ uint16_t state;
+ uint8_t i;
st = CTX_TCP_STLB(ctx);
- /* reset TX armed */
- rte_atomic32_set(&s->tx.arm, 0);
+ for (i = 0; i < TIMER_NUM; i++)
+ timer_stop(s, i);
/* reset TCB */
- uop = s->tcb.uop & ~TCP_OP_CLOSE;
+ state = s->tcb.state;
memset(&s->tcb, 0, sizeof(s->tcb));
/* reset cached destination */
memset(&s->tx.dst, 0, sizeof(s->tx.dst));
- if (uop != TCP_OP_ACCEPT) {
+ /* state could be ESTABLISHED, CLOSED or LISTEN:
+ * a stream in CLOSED state has already been cleared by stream_term;
+ * a stream in ESTABLISHED state is an accepted stream and needs no clearing
+ */
+ if (state == TCP_ST_LISTEN) {
/* free stream's destination port */
stream_clear_ctx(ctx, &s->s);
- if (uop == TCP_OP_LISTEN)
- empty_lq(s);
+ empty_lq(s);
}
if (s->ste != NULL) {
/* remove entry from RX streams table */
- stbl_del_stream(st, s->ste, s,
- (s->flags & TLE_CTX_FLAG_ST) == 0);
+ stbl_del_stream(st, s->ste, &s->s);
s->ste = NULL;
empty_rq(s);
}
@@ -184,6 +188,48 @@ tcp_stream_reset(struct tle_ctx *ctx, struct tle_tcp_stream *s)
put_stream(ctx, &s->s, TCP_STREAM_TX_FINISHED(s));
}
+static inline void
+stream_term(struct tle_tcp_stream *s)
+{
+ struct sdr *dr;
+
+ /* 1) recv a RST packet; 2) keepalive timeout */
+ if (s->tcb.state == TCP_ST_ESTABLISHED) {
+ TCP_DEC_STATS_ATOMIC(TCP_MIB_CURRESTAB);
+ TCP_INC_STATS(TCP_MIB_ESTABRESETS);
+ }
+
+ s->tcb.state = TCP_ST_CLOSED;
+ rte_smp_wmb();
+
+ /* close() was already invoked, schedule final cleanup */
+ if ((s->tcb.uop & TCP_OP_CLOSE) != 0) {
+ if ((s->tcb.uop & TCP_OP_ACCEPT) == 0) {
+ /* free stream's destination port */
+ stream_clear_ctx(s->s.ctx, &s->s);
+ if ((s->tcb.uop & TCP_OP_LISTEN) != 0)
+ empty_lq(s);
+ }
+
+ if (s->ste != NULL) {
+ /* remove entry from RX streams table */
+ stbl_del_stream(CTX_TCP_STLB(s->s.ctx), s->ste, &s->s);
+ s->ste = NULL;
+ empty_rq(s);
+ }
+
+ dr = CTX_TCP_SDR(s->s.ctx);
+ rte_spinlock_lock(&dr->lock);
+ STAILQ_INSERT_TAIL(&dr->be, &s->s, link);
+ rte_spinlock_unlock(&dr->lock);
+
+ /* notify user that the stream needs to be closed */
+ } else if (s->err.ev != NULL)
+ tle_event_raise(s->err.ev);
+ else if (s->err.cb.func != NULL)
+ s->err.cb.func(s->err.cb.data, &s->s);
+}
+
#ifdef __cplusplus
}
#endif
diff --git a/lib/libtle_l4p/tcp_misc.h b/lib/libtle_l4p/tcp_misc.h
index 0cef8b2..1f7974e 100644
--- a/lib/libtle_l4p/tcp_misc.h
+++ b/lib/libtle_l4p/tcp_misc.h
@@ -30,7 +30,7 @@ extern "C" {
* of protocol related data.
*/
-#define TCP_WSCALE_DEFAULT 7
+#define TCP_WSCALE_DEFAULT 10
#define TCP_WSCALE_NONE 0
#define TCP_TX_HDR_MAX (sizeof(struct tcp_hdr) + TCP_TX_OPT_LEN_MAX)
@@ -71,27 +71,6 @@ extern "C" {
/* TCP flags mask. */
#define TCP_FLAG_MASK UINT8_MAX
-union typflg {
- uint16_t raw;
- struct {
- uint8_t type; /* TLE_V4/TLE_V6 */
- uint8_t flags; /* TCP header flags */
- };
-};
-
-union pkt_info {
- rte_xmm_t raw;
- struct {
- union typflg tf;
- uint16_t csf; /* checksum flags */
- union l4_ports port;
- union {
- union ipv4_addrs addr4;
- const union ipv6_addrs *addr6;
- };
- };
-};
-
union seg_info {
rte_xmm_t raw;
struct {
@@ -226,7 +205,7 @@ struct dack_info {
};
/* get current timestamp in ms */
-static inline uint32_t
+static inline uint64_t
tcp_get_tms(uint32_t mshift)
{
uint64_t ts;
@@ -344,7 +323,9 @@ fill_syn_opts(void *p, const struct syn_opts *so)
opt = (struct tcpopt *)to;
}
- to[0] = TCP_OPT_KIND_EOL;
+ to[0] = TCP_OPT_KIND_NOP;
+ to[1] = TCP_OPT_KIND_NOP;
+ to[2] = TCP_OPT_KIND_NOP;
}
/*
@@ -390,6 +371,8 @@ get_tms_opts(uintptr_t p, uint32_t len)
else if (kind == TCP_OPT_KIND_NOP)
i += sizeof(to->kl.kind);
else {
+ if (to->kl.len == 0)
+ break;
i += to->kl.len;
if (i <= len && to->kl.raw == TCP_OPT_KL_TMS) {
ts.val = rte_be_to_cpu_32(to->ts.val);
@@ -449,7 +432,6 @@ get_pkt_info(const struct rte_mbuf *m, union pkt_info *pi, union seg_info *si)
((uintptr_t)tcph + offsetof(struct tcp_hdr, src_port));
pi->tf.flags = tcph->tcp_flags;
pi->tf.type = type;
- pi->csf = m->ol_flags & (PKT_RX_IP_CKSUM_MASK | PKT_RX_L4_CKSUM_MASK);
pi->port.raw = prt->raw;
get_seg_info(tcph, si);
@@ -462,7 +444,7 @@ tcp_mbuf_seq_free(struct rte_mbuf *mb[], uint32_t num)
len = 0;
for (i = 0; i != num; i++) {
- len += mb[i]->pkt_len;
+ len += PKT_L4_PLEN(mb[i]);
rte_pktmbuf_free(mb[i]);
}
diff --git a/lib/libtle_l4p/tcp_ofo.c b/lib/libtle_l4p/tcp_ofo.c
index 1565445..b31f2b5 100644
--- a/lib/libtle_l4p/tcp_ofo.c
+++ b/lib/libtle_l4p/tcp_ofo.c
@@ -12,7 +12,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-#include <rte_malloc.h>
#include <rte_errno.h>
#include "tcp_stream.h"
@@ -28,12 +27,6 @@
#define OFO_OBJ_MAX (OFODB_OBJ_MAX * OFO_DB_MAX)
void
-tcp_ofo_free(struct ofo *ofo)
-{
- rte_free(ofo);
-}
-
-static void
calc_ofo_elems(uint32_t nbufs, uint32_t *nobj, uint32_t *ndb)
{
uint32_t n, nd, no;
@@ -51,35 +44,3 @@ calc_ofo_elems(uint32_t nbufs, uint32_t *nobj, uint32_t *ndb)
*nobj = no;
*ndb = nd;
}
-
-struct ofo *
-tcp_ofo_alloc(uint32_t nbufs, int32_t socket)
-{
- uint32_t i, ndb, nobj;
- size_t dsz, osz, sz;
- struct ofo *ofo;
- struct rte_mbuf **obj;
-
- calc_ofo_elems(nbufs, &nobj, &ndb);
- osz = sizeof(*ofo) + sizeof(ofo->db[0]) * ndb;
- dsz = sizeof(ofo->db[0].obj[0]) * nobj * ndb;
- sz = osz + dsz;
-
- ofo = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE, socket);
- if (ofo == NULL) {
- TCP_LOG(ERR, "%s: allocation of %zu bytes on socket %d "
- "failed with error code: %d\n",
- __func__, sz, socket, rte_errno);
- return NULL;
- }
-
- obj = (struct rte_mbuf **)&ofo->db[ndb];
- for (i = 0; i != ndb; i++) {
- ofo->db[i].nb_max = nobj;
- ofo->db[i].obj = obj + i * nobj;
- }
-
- ofo->nb_max = ndb;
- return ofo;
-}
-
diff --git a/lib/libtle_l4p/tcp_ofo.h b/lib/libtle_l4p/tcp_ofo.h
index 9d88266..0857f17 100644
--- a/lib/libtle_l4p/tcp_ofo.h
+++ b/lib/libtle_l4p/tcp_ofo.h
@@ -20,8 +20,6 @@
extern "C" {
#endif
-#include <stdbool.h>
-
struct ofodb {
uint32_t nb_elem;
uint32_t nb_max;
@@ -103,7 +101,7 @@ _ofo_insert_mbuf(struct ofo* ofo, uint32_t pos, union seqlen* sl,
db->obj[k + i] = mb[i];
}
if (tcp_seq_lt(end, seq))
- rte_pktmbuf_trim(mb[i - 1], seq - end);
+ _rte_pktmbuf_trim(mb[i - 1], seq - end);
db->nb_elem += i;
db->sl.len += tcp_seq_min(seq, end) - sl->seq;
@@ -157,7 +155,7 @@ _ofo_insert_right(struct ofo *ofo, uint32_t pos, union seqlen *sl,
plen = mb[i]->pkt_len;
if (n < plen) {
/* adjust partially overlapped packet. */
- rte_pktmbuf_adj(mb[i], n);
+ mb[i] = _rte_pktmbuf_adj(mb[i], n);
break;
}
}
@@ -258,7 +256,7 @@ static inline uint32_t
_ofodb_enqueue(struct rte_ring *r, const struct ofodb *db, uint32_t *seq)
{
uint32_t i, n, num, begin, end;
- struct rte_mbuf *pkt;
+ struct rte_mbuf* pkt;
n = 0;
num = db->nb_elem;
@@ -289,11 +287,7 @@ _ofodb_enqueue(struct rte_ring *r, const struct ofodb *db, uint32_t *seq)
return num - n;
}
-struct ofo *
-tcp_ofo_alloc(uint32_t nbufs, int32_t socket);
-
-void
-tcp_ofo_free(struct ofo *ofo);
+void calc_ofo_elems(uint32_t nbufs, uint32_t *nobj, uint32_t *ndb);
#ifdef __cplusplus
}
diff --git a/lib/libtle_l4p/tcp_rxq.h b/lib/libtle_l4p/tcp_rxq.h
index 2351ee6..be092f9 100644
--- a/lib/libtle_l4p/tcp_rxq.h
+++ b/lib/libtle_l4p/tcp_rxq.h
@@ -17,6 +17,7 @@
#define _TCP_RXQ_H_
#include "tcp_ofo.h"
+#include "tcp_ctl.h"
#ifdef __cplusplus
extern "C" {
@@ -74,6 +75,7 @@ rx_ofo_reduce(struct tle_tcp_stream *s)
s->tcb.rcv.nxt = seq;
_ofo_remove(ofo, 0, i);
+
return n;
}
@@ -133,6 +135,8 @@ rx_data_enqueue(struct tle_tcp_stream *s, uint32_t seq, uint32_t len,
}
n = rte_ring_count(s->rx.q);
+ /* update the receive window from the remaining receive buffer space */
+ s->tcb.rcv.wnd = calc_rx_wnd(s, s->tcb.rcv.wscale);
if (r != n) {
/* raise RX event */
if (s->rx.ev != NULL)
diff --git a/lib/libtle_l4p/tcp_rxtx.c b/lib/libtle_l4p/tcp_rxtx.c
index a519645..5d7e0d1 100644
--- a/lib/libtle_l4p/tcp_rxtx.c
+++ b/lib/libtle_l4p/tcp_rxtx.c
@@ -28,8 +28,30 @@
#include "tcp_rxq.h"
#include "tcp_txq.h"
#include "tcp_tx_seg.h"
+#include "tcp_rxtx.h"
-#define TCP_MAX_PKT_SEG 0x20
+/* Uncomment below line to debug cwnd */
+// #define DEBUG_CWND
+
+#ifdef DEBUG_CWND
+#define CWND_INFO(msg, value) printf("CWND: %s: %d\n", msg, value)
+#else
+#define CWND_INFO(msg, value) do {} while (0)
+#endif
+
+#define TCP_MAX_PKT_SEG 0x20
+#define DELAY_ACK_CHECK_INTERVAL 100
+
+/* must be larger than l2_len(14)+l3_len(20)+l4_len(20)+tms_option(12) */
+#define RESERVE_HEADER_LEN 128
+
+/* If we encounter exhaustion of the recv window, we use this threshold to
+ * update the recv window to the remote. It's not set to 1 or some other
+ * small value, to avoid too-frequent updates.
+ */
+#define RECV_WIN_NOTIFY_THRESH 64
+
+static inline int stream_fill_dest(struct tle_tcp_stream *s);
/*
* checks if input TCP ports and IP addresses match given stream.
@@ -54,11 +76,17 @@ rx_check_stream(const struct tle_tcp_stream *s, const union pkt_info *pi)
static inline struct tle_tcp_stream *
rx_obtain_listen_stream(const struct tle_dev *dev, const union pkt_info *pi,
- uint32_t type)
+ uint32_t type, uint8_t reuse)
{
struct tle_tcp_stream *s;
- s = (struct tle_tcp_stream *)dev->dp[type]->streams[pi->port.dst];
+ if (type == TLE_V4)
+ s = bhash_lookup4(dev->ctx->bhash[type],
+ pi->addr4.dst, pi->port.dst, reuse);
+ else
+ s = bhash_lookup6(dev->ctx->bhash[type],
+ pi->addr6->dst, pi->port.dst, reuse);
+
if (s == NULL || tcp_stream_acquire(s) < 0)
return NULL;
@@ -77,10 +105,10 @@ rx_obtain_stream(const struct tle_dev *dev, struct stbl *st,
{
struct tle_tcp_stream *s;
- s = stbl_find_data(st, pi);
+ s = TCP_STREAM(stbl_find_stream(st, pi));
if (s == NULL) {
- if (pi->tf.flags == TCP_FLAG_ACK)
- return rx_obtain_listen_stream(dev, pi, type);
+ if (pi->tf.flags & TCP_FLAG_ACK)
+ return rx_obtain_listen_stream(dev, pi, type, 1);
return NULL;
}
@@ -150,131 +178,6 @@ pkt_info_bulk_syneq(const union pkt_info pi[], uint32_t num)
return i;
}
-static inline void
-stream_drb_free(struct tle_tcp_stream *s, struct tle_drb *drbs[],
- uint32_t nb_drb)
-{
- _rte_ring_enqueue_burst(s->tx.drb.r, (void **)drbs, nb_drb);
-}
-
-static inline uint32_t
-stream_drb_alloc(struct tle_tcp_stream *s, struct tle_drb *drbs[],
- uint32_t nb_drb)
-{
- return _rte_ring_dequeue_burst(s->tx.drb.r, (void **)drbs, nb_drb);
-}
-
-static inline uint32_t
-get_ip_pid(struct tle_dev *dev, uint32_t num, uint32_t type, uint32_t st)
-{
- uint32_t pid;
- rte_atomic32_t *pa;
-
- pa = &dev->tx.packet_id[type];
-
- if (st == 0) {
- pid = rte_atomic32_add_return(pa, num);
- return pid - num;
- } else {
- pid = rte_atomic32_read(pa);
- rte_atomic32_set(pa, pid + num);
- return pid;
- }
-}
-
-static inline void
-fill_tcph(struct tcp_hdr *l4h, const struct tcb *tcb, union l4_ports port,
- uint32_t seq, uint8_t hlen, uint8_t flags)
-{
- uint16_t wnd;
-
- l4h->src_port = port.dst;
- l4h->dst_port = port.src;
-
- wnd = (flags & TCP_FLAG_SYN) ?
- RTE_MIN(tcb->rcv.wnd, (uint32_t)UINT16_MAX) :
- tcb->rcv.wnd >> tcb->rcv.wscale;
-
- /* ??? use sse shuffle to hton all remaining 16 bytes at once. ??? */
- l4h->sent_seq = rte_cpu_to_be_32(seq);
- l4h->recv_ack = rte_cpu_to_be_32(tcb->rcv.nxt);
- l4h->data_off = hlen / TCP_DATA_ALIGN << TCP_DATA_OFFSET;
- l4h->tcp_flags = flags;
- l4h->rx_win = rte_cpu_to_be_16(wnd);
- l4h->cksum = 0;
- l4h->tcp_urp = 0;
-
- if (flags & TCP_FLAG_SYN)
- fill_syn_opts(l4h + 1, &tcb->so);
- else if ((flags & TCP_FLAG_RST) == 0 && tcb->so.ts.raw != 0)
- fill_tms_opts(l4h + 1, tcb->snd.ts, tcb->rcv.ts);
-}
-
-static inline int
-tcp_fill_mbuf(struct rte_mbuf *m, const struct tle_tcp_stream *s,
- const struct tle_dest *dst, uint64_t ol_flags,
- union l4_ports port, uint32_t seq, uint32_t flags,
- uint32_t pid, uint32_t swcsm)
-{
- uint32_t l4, len, plen;
- struct tcp_hdr *l4h;
- char *l2h;
-
- len = dst->l2_len + dst->l3_len;
- plen = m->pkt_len;
-
- if (flags & TCP_FLAG_SYN)
- l4 = sizeof(*l4h) + TCP_TX_OPT_LEN_MAX;
- else if ((flags & TCP_FLAG_RST) == 0 && s->tcb.rcv.ts != 0)
- l4 = sizeof(*l4h) + TCP_TX_OPT_LEN_TMS;
- else
- l4 = sizeof(*l4h);
-
- /* adjust mbuf to put L2/L3/L4 headers into it. */
- l2h = rte_pktmbuf_prepend(m, len + l4);
- if (l2h == NULL)
- return -EINVAL;
-
- /* copy L2/L3 header */
- rte_memcpy(l2h, dst->hdr, len);
-
- /* setup TCP header & options */
- l4h = (struct tcp_hdr *)(l2h + len);
- fill_tcph(l4h, &s->tcb, port, seq, l4, flags);
-
- /* setup mbuf TX offload related fields. */
- m->tx_offload = _mbuf_tx_offload(dst->l2_len, dst->l3_len, l4, 0, 0, 0);
- m->ol_flags |= ol_flags;
-
- /* update proto specific fields. */
-
- if (s->s.type == TLE_V4) {
- struct ipv4_hdr *l3h;
- l3h = (struct ipv4_hdr *)(l2h + dst->l2_len);
- l3h->packet_id = rte_cpu_to_be_16(pid);
- l3h->total_length = rte_cpu_to_be_16(plen + dst->l3_len + l4);
-
- if ((ol_flags & PKT_TX_TCP_CKSUM) != 0)
- l4h->cksum = _ipv4x_phdr_cksum(l3h, m->l3_len,
- ol_flags);
- else if (swcsm != 0)
- l4h->cksum = _ipv4_udptcp_mbuf_cksum(m, len, l3h);
-
- if ((ol_flags & PKT_TX_IP_CKSUM) == 0 && swcsm != 0)
- l3h->hdr_checksum = _ipv4x_cksum(l3h, m->l3_len);
- } else {
- struct ipv6_hdr *l3h;
- l3h = (struct ipv6_hdr *)(l2h + dst->l2_len);
- l3h->payload_len = rte_cpu_to_be_16(plen + l4);
- if ((ol_flags & PKT_TX_TCP_CKSUM) != 0)
- l4h->cksum = rte_ipv6_phdr_cksum(l3h, ol_flags);
- else if (swcsm != 0)
- l4h->cksum = _ipv6_udptcp_mbuf_cksum(m, len, l3h);
- }
-
- return 0;
-}
-
/*
* That function supposed to be used only for data packets.
* Assumes that L2/L3/L4 headers and mbuf fields already setup properly.
@@ -355,6 +258,9 @@ tx_data_pkts(struct tle_tcp_stream *s, struct rte_mbuf *const m[], uint32_t num)
i = tle_dring_mp_enqueue(&dev->tx.dr, (const void * const*)m,
num, drb, &nb);
+ if (i > 0)
+ timer_stop(s, TIMER_DACK);
+
/* free unused drbs. */
if (nb != 0)
stream_drb_free(s, drb + nbm - nb, nb);
@@ -362,6 +268,113 @@ tx_data_pkts(struct tle_tcp_stream *s, struct rte_mbuf *const m[], uint32_t num)
return i;
}
+/*
+ * Build a new packet that carries up to sl->len bytes of the payload of 'm'
+ * through indirect mbufs, preceded by a private copy of m's L2/L3/L4 headers.
+ *
+ * The position to continue from is kept in the tcb:
+ *   s->tcb.snd.nxt_pkt    - segment of 'm' to start from (NULL: start of 'm')
+ *   s->tcb.snd.nxt_offset - payload bytes of that segment already sent
+ *
+ * Three situations are handled:
+ * case 0: 'm' was not split yet and its first segment exceeds sl->len
+ * case 1: 'm' was split already, and what is left of the current segment
+ *         still exceeds sl->len
+ * case 2: what is left of the current segment fits into sl->len; following
+ *         segments are chained in for as long as they fit completely
+ *
+ * @param s      stream the data belongs to; its snd.nxt_pkt/nxt_offset are
+ *               updated only when the function succeeds.
+ * @param m      original packet (headers are in its first segment).
+ * @param p_plen on success receives the number of payload bytes referenced
+ *               by the returned packet.
+ * @param sl     sl->len is the number of bytes allowed to be sent now.
+ * @param type   TLE_V4 or TLE_V6, selects which L3 length field is patched.
+ * @param mss    if the resulting payload fits in one MSS, PKT_TX_TCP_SEG
+ *               is cleared on the new packet.
+ * @return head of the new mbuf chain, or NULL if mbufs could not be
+ *         allocated or the header could not be placed into the head mbuf;
+ *         in that case the stream state is left untouched so that the
+ *         caller can simply retry later.
+ */
+static inline struct rte_mbuf *
+get_indirect_mbuf(struct tle_tcp_stream *s,
+		  struct rte_mbuf *m, uint32_t *p_plen,
+		  union seqlen *sl, uint32_t type,
+		  uint32_t mss)
+{
+	uint32_t hdr_len = PKT_L234_HLEN(m), plen, left;
+	struct rte_mbuf *f, *t, *nxt_pkt;
+	uint16_t i, nb_segs, adj;
+	void *hdr;
+
+	if (s->tcb.snd.nxt_pkt) {
+		f = s->tcb.snd.nxt_pkt;
+		plen = f->data_len - s->tcb.snd.nxt_offset;
+		if (f == m) /* 1st segment contains net headers */
+			plen -= hdr_len;
+	} else {
+		f = m;
+		plen = f->data_len - hdr_len;
+	}
+
+	TCP_LOG(DEBUG, "m(%p): pkt_len=%u, nb_segs=%u, sl->len = %u\n",
+		m, m->pkt_len, m->nb_segs, sl->len);
+
+	nb_segs = 1;
+	if (sl->len < plen) {
+		/* Segment split needed: sometimes, cwnd will be reset to
+		 * 1 or 2 mss. In this case, we send part of this seg, and
+		 * record which segment we've sent, and the offset of sent
+		 * data in tcb.
+		 */
+		left = plen - sl->len;
+		plen = sl->len;
+		nxt_pkt = f;
+	} else {
+		left = 0;
+		t = f->next;
+		while (t && plen + t->data_len <= sl->len) {
+			plen += t->data_len;
+			t = t->next;
+			nb_segs++;
+		}
+		nxt_pkt = t;
+	}
+
+	/* pkts[0] holds the headers, pkts[1..nb_segs] reference the payload */
+	struct rte_mbuf *pkts[1 + nb_segs];
+	if (rte_pktmbuf_alloc_bulk(s->tx.dst.head_mp, pkts, 1 + nb_segs) < 0)
+		return NULL;
+
+	/* prepare l2/l3/l4 header; done before anything is attached so that
+	 * a failure only has to give back the freshly allocated mbufs.
+	 */
+	hdr = rte_pktmbuf_append(pkts[0], hdr_len);
+	if (hdr == NULL) {
+		for (i = 0; i <= nb_segs; i++)
+			rte_pktmbuf_free(pkts[i]);
+		return NULL;
+	}
+	rte_memcpy(hdr, rte_pktmbuf_mtod(m, void *), hdr_len);
+
+	/* nothing can fail from here on: commit the resume position.
+	 * Doing it earlier would make a failed attempt skip unsent data.
+	 */
+	s->tcb.snd.nxt_pkt = nxt_pkt;
+
+	rte_pktmbuf_attach(pkts[1], f);
+
+	/* remove bytes in the beginning */
+	adj = s->tcb.snd.nxt_offset;
+	if (f == m)
+		adj += hdr_len;
+	if (adj)
+		rte_pktmbuf_adj(pkts[1], adj);
+
+	/* remove bytes in the end */
+	if (left > 0) {
+		rte_pktmbuf_trim(pkts[1], left);
+		s->tcb.snd.nxt_offset += plen;
+	} else
+		s->tcb.snd.nxt_offset = 0;
+
+	/* attach the following segments, if any were selected */
+	for (i = 1, t = f->next; i < nb_segs; ++i) {
+		rte_pktmbuf_attach(pkts[i+1], t);
+		pkts[i]->next = pkts[i+1];
+		t = t->next;
+	}
+
+	/* finish the head mbuf: packet level fields and L3 length */
+	pkts[0]->nb_segs = nb_segs + 1;
+	pkts[0]->pkt_len = plen + hdr_len;
+	pkts[0]->ol_flags = m->ol_flags;
+	pkts[0]->tx_offload = m->tx_offload;
+	if (type == TLE_V4) {
+		struct ipv4_hdr *l3h;
+
+		l3h = rte_pktmbuf_mtod_offset(pkts[0],
+			struct ipv4_hdr *, m->l2_len);
+		l3h->total_length =
+			rte_cpu_to_be_16(plen + m->l3_len + m->l4_len);
+	} else {
+		struct ipv6_hdr *l3h;
+
+		l3h = rte_pktmbuf_mtod_offset(pkts[0],
+			struct ipv6_hdr *, m->l2_len);
+		l3h->payload_len =
+			rte_cpu_to_be_16(plen + m->l4_len);
+	}
+	/* payload fits in a single MSS: no segmentation offload needed */
+	if (plen <= mss)
+		pkts[0]->ol_flags &= ~PKT_TX_TCP_SEG;
+	pkts[0]->next = pkts[1];
+
+	*p_plen = plen;
+	return pkts[0];
+}
+
+
static inline uint32_t
tx_data_bulk(struct tle_tcp_stream *s, union seqlen *sl, struct rte_mbuf *mi[],
uint32_t num)
@@ -371,11 +384,13 @@ tx_data_bulk(struct tle_tcp_stream *s, union seqlen *sl, struct rte_mbuf *mi[],
struct rte_mbuf *mb;
struct rte_mbuf *mo[MAX_PKT_BURST + TCP_MAX_PKT_SEG];
+ /* check stream has drb to send pkts */
+ if (stream_drb_empty(s))
+ return 0;
+
mss = s->tcb.snd.mss;
type = s->s.type;
-
dev = s->tx.dst.dev;
- pid = get_ip_pid(dev, num, type, (s->flags & TLE_CTX_FLAG_ST) != 0);
k = 0;
tn = 0;
@@ -383,26 +398,64 @@ tx_data_bulk(struct tle_tcp_stream *s, union seqlen *sl, struct rte_mbuf *mi[],
for (i = 0; i != num && sl->len != 0 && fail == 0; i++) {
mb = mi[i];
- sz = RTE_MIN(sl->len, mss);
plen = PKT_L4_PLEN(mb);
/*fast path, no need to use indirect mbufs. */
- if (plen <= sz) {
-
+ if (s->tcb.snd.nxt_pkt == NULL && plen <= sl->len) {
+ pid = get_ip_pid(dev, calc_seg_cnt(plen, s->tcb.snd.mss),
+ type, (s->flags & TLE_CTX_FLAG_ST) != 0);
/* update pkt TCP header */
- tcp_update_mbuf(mb, type, &s->tcb, sl->seq, pid + i);
+ tcp_update_mbuf(mb, type, &s->tcb, sl->seq, pid);
/* keep mbuf till ACK is received. */
rte_pktmbuf_refcnt_update(mb, 1);
sl->len -= plen;
sl->seq += plen;
mo[k++] = mb;
- /* remaining snd.wnd is less them MSS, send nothing */
- } else if (sz < mss)
+ if (sl->seq <= s->tcb.snd.rcvr)
+ TCP_INC_STATS(TCP_MIB_RETRANSSEGS);
+ /* remaining snd.wnd is less than MSS, send nothing */
+ } else if (sl->len < mss) {
+ break;
+ /* some data to send already */
+ } else if (k != 0 || tn != 0) {
break;
/* packet indirection needed */
- else
- RTE_VERIFY(0);
+ } else {
+ struct rte_mbuf *out;
+
+ out = get_indirect_mbuf(s, mb, &plen, sl, type, mss);
+ if (out == NULL)
+ return 0;
+
+ pid = get_ip_pid(dev, calc_seg_cnt(plen, s->tcb.snd.mss),
+ type, (s->flags & TLE_CTX_FLAG_ST) != 0);
+ /* update pkt TCP header */
+ tcp_update_mbuf(out, type, &s->tcb, sl->seq, pid);
+
+ /* no need to bump refcnt !!! */
+
+ sl->len -= plen;
+ sl->seq += plen;
+
+ if (tx_data_pkts(s, &out, 1) == 0) {
+ /* should not happen, we have checked at least one
+ * drb is available to send this mbuf
+ */
+ rte_pktmbuf_free(out);
+ return 0;
+ }
+
+ if (sl->seq <= s->tcb.snd.rcvr)
+ TCP_INC_STATS(TCP_MIB_RETRANSSEGS);
+
+ if (s->tcb.snd.nxt_pkt)
+ return 0;
+ else {
+ tn = 1;
+ continue;
+ }
+ }
if (k >= MAX_PKT_BURST) {
n = tx_data_pkts(s, mo, k);
@@ -466,14 +519,17 @@ tx_nxt_data(struct tle_tcp_stream *s, uint32_t tms)
tcp_txq_set_nxt_head(s, n);
} while (n == num);
- s->tcb.snd.nxt += sl.seq - (uint32_t)s->tcb.snd.nxt;
+ if (sl.seq != (uint32_t)s->tcb.snd.nxt) {
+ s->tcb.snd.nxt += sl.seq - (uint32_t)s->tcb.snd.nxt;
+ s->tcb.snd.ack = s->tcb.rcv.nxt;
+ }
return tn;
}
static inline void
free_una_data(struct tle_tcp_stream *s, uint32_t len)
{
- uint32_t i, num, plen;
+ uint32_t i, num, plen, una_data;
struct rte_mbuf **mi;
plen = 0;
@@ -487,14 +543,18 @@ free_una_data(struct tle_tcp_stream *s, uint32_t len)
/* free acked data */
for (i = 0; i != num && plen != len; i++) {
- uint32_t next_pkt_len = PKT_L4_PLEN(mi[i]);
- if (plen + next_pkt_len > len) {
- /* keep SND.UNA at the start of the packet */
- len = plen;
+ una_data = PKT_L4_PLEN(mi[i]) - s->tcb.snd.una_offset;
+
+ /* partial ack */
+ if (plen + una_data > len) {
+ s->tcb.snd.una_offset += len - plen;
+ plen = len;
break;
- } else {
- plen += next_pkt_len;
}
+
+ /* monolithic ack */
+ s->tcb.snd.una_offset = 0;
+ plen += una_data;
rte_pktmbuf_free(mi[i]);
}
@@ -503,6 +563,7 @@ free_una_data(struct tle_tcp_stream *s, uint32_t len)
} while (plen < len);
s->tcb.snd.una += len;
+ s->tcb.snd.waitlen -= len;
/*
* that could happen in case of retransmit,
@@ -519,7 +580,7 @@ calc_smss(uint16_t mss, const struct tle_dest *dst)
{
uint16_t n;
- n = dst->mtu - dst->l2_len - dst->l3_len - TCP_TX_HDR_DACK;
+ n = dst->mtu - dst->l3_len - sizeof(struct tcp_hdr);
mss = RTE_MIN(n, mss);
return mss;
}
@@ -537,71 +598,53 @@ initial_cwnd(uint32_t smss, uint32_t icw)
return RTE_MIN(10 * smss, RTE_MAX(2 * smss, icw));
}
-/*
- * queue standalone packet to he particular output device
- * It assumes that:
- * - L2/L3/L4 headers should be already set.
- * - packet fits into one segment.
- */
-static inline int
-send_pkt(struct tle_tcp_stream *s, struct tle_dev *dev, struct rte_mbuf *m)
+void
+tle_tcp_stream_kill(struct tle_stream *ts)
{
- uint32_t n, nb;
- struct tle_drb *drb;
-
- if (stream_drb_alloc(s, &drb, 1) == 0)
- return -ENOBUFS;
-
- /* enqueue pkt for TX. */
- nb = 1;
- n = tle_dring_mp_enqueue(&dev->tx.dr, (const void * const*)&m, 1,
- &drb, &nb);
-
- /* free unused drbs. */
- if (nb != 0)
- stream_drb_free(s, &drb, 1);
-
- return (n == 1) ? 0 : -ENOBUFS;
-}
+ struct tle_tcp_stream *s;
-static inline int
-send_ctrl_pkt(struct tle_tcp_stream *s, struct rte_mbuf *m, uint32_t seq,
- uint32_t flags)
-{
- const struct tle_dest *dst;
- uint32_t pid, type;
- int32_t rc;
+ s = TCP_STREAM(ts);
+ if (ts == NULL || s->s.type >= TLE_VNUM)
+ return;
- dst = &s->tx.dst;
- type = s->s.type;
- pid = get_ip_pid(dst->dev, 1, type, (s->flags & TLE_CTX_FLAG_ST) != 0);
+ if (s->tcb.state > TCP_ST_LISTEN)
+ send_rst(s, s->tcb.snd.nxt);
- rc = tcp_fill_mbuf(m, s, dst, 0, s->s.port, seq, flags, pid, 1);
- if (rc == 0)
- rc = send_pkt(s, dst->dev, m);
+ if (s->tcb.state == TCP_ST_ESTABLISHED)
+ TCP_DEC_STATS_ATOMIC(TCP_MIB_CURRESTAB);
- return rc;
+ s->tcb.state = TCP_ST_CLOSED;
+ rte_smp_wmb();
+ timer_stop(s, TIMER_RTO);
}
static inline int
-send_rst(struct tle_tcp_stream *s, uint32_t seq)
+send_ack(struct tle_tcp_stream *s, uint32_t tms, uint32_t flags)
{
struct rte_mbuf *m;
+ uint32_t seq;
int32_t rc;
m = rte_pktmbuf_alloc(s->tx.dst.head_mp);
if (m == NULL)
return -ENOMEM;
- rc = send_ctrl_pkt(s, m, seq, TCP_FLAG_RST);
- if (rc != 0)
+ seq = s->tcb.snd.nxt - ((flags & (TCP_FLAG_FIN | TCP_FLAG_SYN)) != 0);
+ s->tcb.snd.ts = tms;
+
+ rc = send_ctrl_pkt(s, m, seq, flags);
+ if (rc != 0) {
rte_pktmbuf_free(m);
+ return rc;
+ }
- return rc;
+ timer_stop(s, TIMER_DACK);
+ s->tcb.snd.ack = s->tcb.rcv.nxt;
+ return 0;
}
static inline int
-send_ack(struct tle_tcp_stream *s, uint32_t tms, uint32_t flags)
+send_keepalive(struct tle_tcp_stream *s)
{
struct rte_mbuf *m;
uint32_t seq;
@@ -611,20 +654,16 @@ send_ack(struct tle_tcp_stream *s, uint32_t tms, uint32_t flags)
if (m == NULL)
return -ENOMEM;
- seq = s->tcb.snd.nxt - ((flags & (TCP_FLAG_FIN | TCP_FLAG_SYN)) != 0);
- s->tcb.snd.ts = tms;
+ seq = s->tcb.snd.una - 1;
- rc = send_ctrl_pkt(s, m, seq, flags);
+ rc = send_ctrl_pkt(s, m, seq, TCP_FLAG_ACK);
if (rc != 0) {
rte_pktmbuf_free(m);
return rc;
}
-
- s->tcb.snd.ack = s->tcb.rcv.nxt;
return 0;
}
-
static int
sync_ack(struct tle_tcp_stream *s, const union pkt_info *pi,
const union seg_info *si, uint32_t ts, struct rte_mbuf *m)
@@ -633,19 +672,23 @@ sync_ack(struct tle_tcp_stream *s, const union pkt_info *pi,
int32_t rc;
uint32_t pid, seq, type;
struct tle_dev *dev;
- const void *da;
+ const void *sa, *da;
struct tle_dest dst;
const struct tcp_hdr *th;
- type = s->s.type;
+ type = pi->tf.type;
/* get destination information. */
- if (type == TLE_V4)
+ if (type == TLE_V4) {
da = &pi->addr4.src;
- else
+ sa = &pi->addr4.dst;
+ }
+ else {
da = &pi->addr6->src;
+ sa = &pi->addr6->dst;
+ }
- rc = stream_get_dest(&s->s, da, &dst);
+ rc = stream_get_dest(type, &s->s, sa, da, &dst);
if (rc < 0)
return rc;
@@ -654,11 +697,16 @@ sync_ack(struct tle_tcp_stream *s, const union pkt_info *pi,
get_syn_opts(&s->tcb.so, (uintptr_t)(th + 1), m->l4_len - sizeof(*th));
s->tcb.rcv.nxt = si->seq + 1;
+ s->tcb.rcv.cpy = si->seq + 1;
seq = sync_gen_seq(pi, s->tcb.rcv.nxt, ts, s->tcb.so.mss,
s->s.ctx->prm.hash_alg,
&s->s.ctx->prm.secret_key);
- s->tcb.so.ts.ecr = s->tcb.so.ts.val;
- s->tcb.so.ts.val = sync_gen_ts(ts, s->tcb.so.wscale);
+
+ if (s->tcb.so.ts.raw) {
+ s->tcb.so.ts.ecr = s->tcb.so.ts.val;
+ s->tcb.so.ts.val = sync_gen_ts(ts, s->tcb.so.wscale);
+ }
+
s->tcb.so.wscale = (s->tcb.so.wscale == TCP_WSCALE_NONE) ?
TCP_WSCALE_NONE : TCP_WSCALE_DEFAULT;
s->tcb.so.mss = calc_smss(dst.mtu, &dst);
@@ -672,11 +720,13 @@ sync_ack(struct tle_tcp_stream *s, const union pkt_info *pi,
dev = dst.dev;
pid = get_ip_pid(dev, 1, type, (s->flags & TLE_CTX_FLAG_ST) != 0);
- rc = tcp_fill_mbuf(m, s, &dst, 0, pi->port, seq,
- TCP_FLAG_SYN | TCP_FLAG_ACK, pid, 1);
+ rc = tcp_fill_mbuf(m, s, &dst, TCP_OLFLAGS_CKSUM(dst.ol_flags),
+ pi->port, seq, TCP_FLAG_SYN | TCP_FLAG_ACK, pid, 1);
if (rc == 0)
rc = send_pkt(s, dev, m);
+ TCP_INC_STATS(TCP_MIB_PASSIVEOPENS);
+
return rc;
}
@@ -800,43 +850,24 @@ restore_syn_opt(union seg_info *si, union tsopt *to,
return 0;
}
-static inline void
-stream_term(struct tle_tcp_stream *s)
-{
- struct sdr *dr;
-
- s->tcb.state = TCP_ST_CLOSED;
- rte_smp_wmb();
-
- timer_stop(s);
-
- /* close() was already invoked, schedule final cleanup */
- if ((s->tcb.uop & TCP_OP_CLOSE) != 0) {
-
- dr = CTX_TCP_SDR(s->s.ctx);
- STAILQ_INSERT_TAIL(&dr->be, &s->s, link);
-
- /* notify user that stream need to be closed */
- } else if (s->err.ev != NULL)
- tle_event_raise(s->err.ev);
- else if (s->err.cb.func != NULL)
- s->err.cb.func(s->err.cb.data, &s->s);
-}
-
static inline int
stream_fill_dest(struct tle_tcp_stream *s)
{
int32_t rc;
uint32_t type;
- const void *da;
+ const void *sa, *da;
- type = s->s.type;
- if (type == TLE_V4)
+ type = s->s.type;
+ if (type == TLE_V4) {
+ sa = &s->s.ipv4.addr.dst;
da = &s->s.ipv4.addr.src;
- else
+ }
+ else {
+ sa = &s->s.ipv6.addr.dst;
da = &s->s.ipv6.addr.src;
+ }
- rc = stream_get_dest(&s->s, da, &s->tx.dst);
+ rc = stream_get_dest(type, &s->s, sa, da, &s->tx.dst);
return (rc < 0) ? rc : 0;
}
@@ -851,19 +882,17 @@ accept_prep_stream(struct tle_tcp_stream *ps, struct stbl *st,
int32_t rc;
uint32_t rtt;
- /* some TX still pending for that stream. */
- if (TCP_STREAM_TX_PENDING(cs))
- return -EAGAIN;
-
/* setup L4 ports and L3 addresses fields. */
cs->s.port.raw = pi->port.raw;
cs->s.pmsk.raw = UINT32_MAX;
if (pi->tf.type == TLE_V4) {
+ cs->s.type = TLE_V4;
cs->s.ipv4.addr = pi->addr4;
cs->s.ipv4.mask.src = INADDR_NONE;
cs->s.ipv4.mask.dst = INADDR_NONE;
} else if (pi->tf.type == TLE_V6) {
+ cs->s.type = TLE_V6;
cs->s.ipv6.addr = *pi->addr6;
rte_memcpy(&cs->s.ipv6.mask.src, &tle_ipv6_none,
sizeof(cs->s.ipv6.mask.src));
@@ -887,7 +916,7 @@ accept_prep_stream(struct tle_tcp_stream *ps, struct stbl *st,
cs->tcb.snd.rto = TCP_RTO_DEFAULT;
/* copy streams type & flags. */
- cs->s.type = ps->s.type;
+ cs->s.type = pi->tf.type;
cs->flags = ps->flags;
/* retrive and cache destination information. */
@@ -897,16 +926,23 @@ accept_prep_stream(struct tle_tcp_stream *ps, struct stbl *st,
/* update snd.mss with SMSS value */
cs->tcb.snd.mss = calc_smss(cs->tcb.snd.mss, &cs->tx.dst);
+ if (cs->tcb.so.ts.raw != 0) {
+ cs->tcb.snd.mss -= TCP_TX_OPT_LEN_TMS;
+ }
/* setup congestion variables */
cs->tcb.snd.cwnd = initial_cwnd(cs->tcb.snd.mss, ps->tcb.snd.cwnd);
+ CWND_INFO("accept", cs->tcb.snd.cwnd);
+
cs->tcb.snd.ssthresh = cs->tcb.snd.wnd;
cs->tcb.snd.rto_tw = ps->tcb.snd.rto_tw;
+ cs->tcb.snd.rto_fw = ps->tcb.snd.rto_fw;
cs->tcb.state = TCP_ST_ESTABLISHED;
+ TCP_INC_STATS_ATOMIC(TCP_MIB_CURRESTAB);
/* add stream to the table */
- cs->ste = stbl_add_stream(st, pi, cs);
+ cs->ste = stbl_add_stream(st, &cs->s);
if (cs->ste == NULL)
return -ENOBUFS;
@@ -937,7 +973,7 @@ rx_ack_listen(struct tle_tcp_stream *s, struct stbl *st,
*csp = NULL;
- if (pi->tf.flags != TCP_FLAG_ACK || rx_check_stream(s, pi) != 0)
+ if ((pi->tf.flags & TCP_FLAG_ACK) == 0|| rx_check_stream(s, pi) != 0)
return -EINVAL;
ctx = s->s.ctx;
@@ -964,7 +1000,8 @@ rx_ack_listen(struct tle_tcp_stream *s, struct stbl *st,
/* cleanup on failure */
tcp_stream_down(cs);
- stbl_del_stream(st, cs->ste, cs, 0);
+ TCP_DEC_STATS_ATOMIC(TCP_MIB_CURRESTAB);
+ stbl_del_stream(st, cs->ste, &cs->s);
cs->ste = NULL;
}
@@ -982,6 +1019,10 @@ data_pkt_adjust(const struct tcb *tcb, struct rte_mbuf **mb, uint32_t hlen,
len = *plen;
rte_pktmbuf_adj(*mb, hlen);
+ /* header is removed, so we clear tx_offload here to make sure
+ * we can get correct payload length with PKT_L4_PLEN.
+ */
+ (*mb)->tx_offload = 0;
if (len == 0)
return -ENODATA;
/* cut off the start of the packet */
@@ -1018,7 +1059,8 @@ rx_ackdata(struct tle_tcp_stream *s, uint32_t ack)
tle_event_raise(s->tx.ev);
else if (k == 0 && s->tx.cb.func != NULL)
s->tx.cb.func(s->tx.cb.data, &s->s);
- }
+ } else
+ txs_enqueue(s->s.ctx, s);
}
return n;
@@ -1029,8 +1071,7 @@ stream_timewait(struct tle_tcp_stream *s, uint32_t rto)
{
if (rto != 0) {
s->tcb.state = TCP_ST_TIME_WAIT;
- s->tcb.snd.rto = rto;
- timer_reset(s);
+ timer_reset(s, TIMER_RTO, rto);
} else
stream_term(s);
}
@@ -1041,20 +1082,30 @@ rx_fin_state(struct tle_tcp_stream *s, struct resp_info *rsp)
uint32_t state;
int32_t ackfin;
+ s->tcb.rcv.frs.on = 2;
s->tcb.rcv.nxt += 1;
ackfin = (s->tcb.snd.una == s->tcb.snd.fss);
state = s->tcb.state;
if (state == TCP_ST_ESTABLISHED) {
+ TCP_DEC_STATS_ATOMIC(TCP_MIB_CURRESTAB);
s->tcb.state = TCP_ST_CLOSE_WAIT;
/* raise err.ev & err.cb */
- if (s->err.ev != NULL)
+ /* raise error event only when recvbuf is empty, to inform
+ * that the stream will not receive data any more.
+ */
+ if (rte_ring_count(s->rx.q) == 0 && s->err.ev != NULL)
tle_event_raise(s->err.ev);
else if (s->err.cb.func != NULL)
s->err.cb.func(s->err.cb.data, &s->s);
} else if (state == TCP_ST_FIN_WAIT_1 || state == TCP_ST_CLOSING) {
rsp->flags |= TCP_FLAG_ACK;
+
+ /* shutdown instead of close happens */
+ if (rte_ring_count(s->rx.q) == 0 && s->err.ev != NULL)
+ tle_event_raise(s->err.ev);
+
if (ackfin != 0)
stream_timewait(s, s->tcb.snd.rto_tw);
else
@@ -1089,8 +1140,10 @@ rx_fin(struct tle_tcp_stream *s, uint32_t state,
ts = rx_tms_opt(&s->tcb, mb);
ret = rx_check_seqack(&s->tcb, seq, si->ack, plen, ts);
- if (ret != 0)
+ if (ret != 0) {
+ rsp->flags |= TCP_FLAG_ACK;
return ret;
+ }
if (state < TCP_ST_ESTABLISHED)
return -EINVAL;
@@ -1108,9 +1161,10 @@ rx_fin(struct tle_tcp_stream *s, uint32_t state,
* fast-path: all data & FIN was already sent out
* and now is acknowledged.
*/
- if (s->tcb.snd.fss == s->tcb.snd.nxt &&
- si->ack == (uint32_t)s->tcb.snd.nxt) {
+ if (s->tcb.snd.fss >= s->tcb.snd.nxt &&
+ si->ack == (uint32_t)s->tcb.snd.fss) {
s->tcb.snd.una = s->tcb.snd.fss;
+ s->tcb.snd.nxt = s->tcb.snd.una;
empty_tq(s);
/* conventional ACK processiing */
} else
@@ -1148,8 +1202,25 @@ rx_rst(struct tle_tcp_stream *s, uint32_t state, uint32_t flags,
else
rc = check_seqn(&s->tcb, si->seq, 0);
- if (rc == 0)
+ if (rc == 0) {
+ /* RST received: the connection was closed abnormally,
+ * so record an errno for later operations to return.
+ */
+ switch (state) {
+ case TCP_ST_SYN_SENT:
+ TCP_INC_STATS(TCP_MIB_ATTEMPTFAILS);
+ s->tcb.err = ECONNREFUSED;
+ break;
+ case TCP_ST_CLOSE_WAIT:
+ s->tcb.err = EPIPE;
+ break;
+ case TCP_ST_CLOSED:
+ return rc;
+ default:
+ s->tcb.err = ECONNRESET;
+ }
stream_term(s);
+ }
return rc;
}
@@ -1222,6 +1293,7 @@ rto_cwnd_update(struct tcb *tcb)
* no more than 1 full-sized segment.
*/
tcb->snd.cwnd = tcb->snd.mss;
+ CWND_INFO("update", tcb->snd.cwnd);
}
static inline void
@@ -1330,13 +1402,17 @@ rx_data_ack(struct tle_tcp_stream *s, struct dack_info *tack,
ret = rx_check_seqack(&s->tcb, si[j].seq, si[j].ack,
plen, ts);
- if (ret != 0)
- break;
-
/* account for segment received */
ack_info_update(tack, &si[j], ret != 0, plen, ts);
+ if (ret != 0)
+ break;
+
rte_pktmbuf_adj(mb[j], hlen);
+ /* header is removed, so we clear tx_offload here to make sure
+ * we can get correct payload length with PKT_L4_PLEN.
+ */
+ mb[j]->tx_offload = 0;
}
n = j - i;
@@ -1377,6 +1453,7 @@ start_fast_retransmit(struct tle_tcp_stream *s)
tcp_txq_rst_nxt_head(s);
tcb->snd.nxt = tcb->snd.una;
tcb->snd.cwnd = tcb->snd.ssthresh + 3 * tcb->snd.mss;
+ CWND_INFO("start fast retrans", tcb->snd.cwnd);
}
static inline void
@@ -1389,6 +1466,7 @@ stop_fast_retransmit(struct tle_tcp_stream *s)
n = tcb->snd.nxt - tcb->snd.una;
tcb->snd.cwnd = RTE_MIN(tcb->snd.ssthresh,
RTE_MAX(n, tcb->snd.mss) + tcb->snd.mss);
+ CWND_INFO("stop fast retrans", tcb->snd.cwnd);
tcb->snd.fastack = 0;
}
@@ -1415,8 +1493,10 @@ in_fast_retransmit(struct tle_tcp_stream *s, uint32_t ack_len, uint32_t ack_num,
* during fast recovery, also reset the
* retransmit timer.
*/
- if (tcb->snd.fastack == 1)
- timer_reset(s);
+ if (tcb->snd.fastack == 1) {
+ timer_reset(s, TIMER_RTO, s->tcb.snd.rto);
+ s->tcb.snd.nb_retx = 0;
+ }
tcb->snd.fastack += ack_num;
return 1;
@@ -1456,7 +1536,8 @@ process_ack(struct tle_tcp_stream *s, uint32_t acked,
/* remain in normal mode */
} else if (acked != 0) {
ack_cwnd_update(&s->tcb, acked, tack);
- timer_stop(s);
+ timer_stop(s, TIMER_RTO);
+ s->tcb.snd.nb_retx = 0;
}
/* fast retransmit mode */
@@ -1470,7 +1551,7 @@ process_ack(struct tle_tcp_stream *s, uint32_t acked,
} else {
/* RFC 5682 3.2.3 full ACK */
stop_fast_retransmit(s);
- timer_stop(s);
+ timer_stop(s, TIMER_RTO);
/* if we have another series of dup ACKs */
if (tack->dup3.seg != 0 &&
@@ -1501,17 +1582,22 @@ rx_ackfin(struct tle_tcp_stream *s)
uint32_t state;
s->tcb.snd.una = s->tcb.snd.fss;
+ s->tcb.snd.nxt = s->tcb.snd.una;
empty_tq(s);
state = s->tcb.state;
if (state == TCP_ST_LAST_ACK)
stream_term(s);
else if (state == TCP_ST_FIN_WAIT_1) {
- timer_stop(s);
+ timer_stop(s, TIMER_RTO);
s->tcb.state = TCP_ST_FIN_WAIT_2;
- } else if (state == TCP_ST_CLOSING) {
+ /* if the user has already closed the stream, start the
+ * rto_fw timer so that it can be released even if the
+ * peer never sends its FIN
+ */
+ if (s->tcb.uop & TCP_OP_CLOSE)
+ timer_start(s, TIMER_RTO, s->tcb.snd.rto_fw);
+ } else if (state == TCP_ST_CLOSING)
stream_timewait(s, s->tcb.snd.rto_tw);
- }
}
static inline void
@@ -1532,7 +1618,7 @@ rx_process_ack(struct tle_tcp_stream *s, uint32_t ts,
/* restart RTO timer. */
if (s->tcb.snd.nxt != s->tcb.snd.una)
- timer_start(s);
+ timer_start(s, TIMER_RTO, s->tcb.snd.rto);
/* update rto, if fresh packet is here then calculate rtt */
if (tack->ts.ecr != 0)
@@ -1554,15 +1640,9 @@ rx_synack(struct tle_tcp_stream *s, uint32_t ts, uint32_t state,
if (state != TCP_ST_SYN_SENT)
return -EINVAL;
- /*
- * RFC 793 3.9: in the SYN-SENT state
- * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send a reset
- * <SEQ=SEG.ACK><CTL=RST>
- * and discard the segment.
- * The connection remains in the same state.
- */
+ /* invalid SEG.ACK: it must equal SND.NXT */
if (si->ack != (uint32_t)s->tcb.snd.nxt) {
- send_rst(s, si->ack);
+ rsp->flags = TCP_FLAG_RST;
return 0;
}
@@ -1574,18 +1654,25 @@ rx_synack(struct tle_tcp_stream *s, uint32_t ts, uint32_t state,
s->tcb.snd.una = s->tcb.snd.nxt;
s->tcb.snd.mss = calc_smss(so.mss, &s->tx.dst);
+ if (s->tcb.so.ts.raw != 0) {
+ s->tcb.snd.mss -= TCP_TX_OPT_LEN_TMS;
+ }
s->tcb.snd.wnd = si->wnd << so.wscale;
s->tcb.snd.wu.wl1 = si->seq;
s->tcb.snd.wu.wl2 = si->ack;
s->tcb.snd.wscale = so.wscale;
+ s->tcb.snd.cork_ts = 0;
/* setup congestion variables */
s->tcb.snd.cwnd = initial_cwnd(s->tcb.snd.mss, s->tcb.snd.cwnd);
+ CWND_INFO("synack", s->tcb.snd.cwnd);
+
s->tcb.snd.ssthresh = s->tcb.snd.wnd;
s->tcb.rcv.ts = so.ts.val;
s->tcb.rcv.irs = si->seq;
s->tcb.rcv.nxt = si->seq + 1;
+ s->tcb.rcv.cpy = si->seq + 1;
/* if peer doesn't support WSCALE opt, recalculate RCV.WND */
s->tcb.rcv.wscale = (so.wscale == TCP_WSCALE_NONE) ?
@@ -1597,9 +1684,14 @@ rx_synack(struct tle_tcp_stream *s, uint32_t ts, uint32_t state,
rsp->flags |= TCP_FLAG_ACK;
- timer_stop(s);
+ timer_stop(s, TIMER_RTO);
+ s->tcb.snd.nb_retx = 0;
s->tcb.state = TCP_ST_ESTABLISHED;
rte_smp_wmb();
+ TCP_INC_STATS_ATOMIC(TCP_MIB_CURRESTAB);
+
+ if (s->s.option.keepalive)
+ timer_start(s, TIMER_KEEPALIVE, s->s.option.keepidle * MS_PER_S);
if (s->tx.ev != NULL)
tle_event_raise(s->tx.ev);
@@ -1689,8 +1781,8 @@ rx_stream(struct tle_tcp_stream *s, uint32_t ts,
* fast-path: all data & FIN was already sent out
* and now is acknowledged.
*/
- if (s->tcb.snd.fss == s->tcb.snd.nxt &&
- tack.ack == (uint32_t)s->tcb.snd.nxt)
+ if (s->tcb.snd.fss >= s->tcb.snd.nxt &&
+ tack.ack == (uint32_t)s->tcb.snd.fss)
rx_ackfin(s);
else
rx_process_ack(s, ts, &tack);
@@ -1702,27 +1794,44 @@ rx_stream(struct tle_tcp_stream *s, uint32_t ts,
* - received segment with INO data and no TX is scheduled
* for that stream.
*/
- if (tack.segs.badseq != 0 || tack.segs.ofo != 0 ||
- (tack.segs.data != 0 &&
- rte_atomic32_read(&s->tx.arm) == 0))
+ if (tack.segs.badseq != 0 || tack.segs.ofo != 0)
+ rsp.flags |= TCP_FLAG_ACK;
+ else if (tack.segs.data != 0 &&
+ rte_atomic32_read(&s->tx.arm) == 0 &&
+ (s->s.option.tcpquickack ||
+ s->tcb.rcv.nxt - s->tcb.snd.ack > 8 * s->tcb.so.mss)) {
rsp.flags |= TCP_FLAG_ACK;
+ if (s->s.option.tcpquickack > 0)
+ s->s.option.tcpquickack--;
+ }
+ else if (tack.segs.data && rsp.flags == 0)
+ timer_start(s, TIMER_DACK, DELAY_ACK_CHECK_INTERVAL);
rx_ofo_fin(s, &rsp);
k += num - n;
i = num;
+ if (s->s.option.keepalive) {
+ s->tcb.snd.nb_keepalive = 0;
+ timer_reset(s, TIMER_KEEPALIVE, s->s.option.keepidle * MS_PER_S);
+ }
/* unhandled state, drop all packets. */
} else
i = 0;
/* we have a response packet to send. */
- if (rsp.flags != 0) {
+ if (rsp.flags == TCP_FLAG_RST) {
+ send_rst(s, si[i].ack);
+ stream_term(s);
+ } else if (rsp.flags != 0) {
send_ack(s, ts, rsp.flags);
/* start the timer for FIN packet */
- if ((rsp.flags & TCP_FLAG_FIN) != 0)
- timer_reset(s);
+ if ((rsp.flags & TCP_FLAG_FIN) != 0) {
+ timer_reset(s, TIMER_RTO, s->tcb.snd.rto);
+ s->tcb.snd.nb_retx = 0;
+ }
}
/* unprocessed packets */
@@ -1778,7 +1887,6 @@ rx_postsyn(struct tle_dev *dev, struct stbl *st, uint32_t type, uint32_t ts,
state = s->tcb.state;
if (state == TCP_ST_LISTEN) {
-
/* one connection per flow */
cs = NULL;
ret = -EINVAL;
@@ -1835,6 +1943,74 @@ rx_postsyn(struct tle_dev *dev, struct stbl *st, uint32_t type, uint32_t ts,
return num - k;
}
+/*
+ * Refuse a SYN for which no listening stream was found: the received
+ * packet is rewritten in place into a <RST,ACK> reply and sent back
+ * through 'dev'.
+ *
+ * The reply is built by:
+ *  - trimming (l4_len - sizeof(tcp_hdr)) bytes from the tail of the packet,
+ *    i.e. the TCP options (NOTE(review): this assumes the SYN carries no
+ *    payload after the options - confirm);
+ *  - swapping ethernet addresses, IP addresses and TCP ports;
+ *  - setting SEQ=0, ACK=SEG.SEQ+1, flags RST|ACK and a 20 byte data offset;
+ *  - recalculating the L3 length and the checksums in software;
+ *  - padding the frame up to ETHER_MIN_LEN with zero bytes.
+ *
+ * @param s   stream passed to send_pkt() for queueing the reply.
+ * @param dev device the reply is sent through.
+ * @param pi  packet info of the received SYN; pi->tf.type selects v4/v6.
+ * @param m   the received SYN; always consumed: either queued for TX or
+ *            freed when send_pkt() fails.
+ */
+static inline void
+sync_refuse(struct tle_tcp_stream *s, struct tle_dev *dev,
+	const union pkt_info *pi, struct rte_mbuf *m)
+{
+	struct ether_hdr *eth_h;
+	struct ether_addr eth_addr;
+	struct ipv4_hdr *ip_h;
+	uint32_t ip_addr;
+	struct ipv6_hdr *ipv6_h;
+	struct in6_addr ipv6_addr;
+	struct tcp_hdr *th;
+	uint16_t port;
+	uint16_t i, pad_len;
+	char *pad;
+
+	/* rst pkt should not contain options for syn */
+	rte_pktmbuf_trim(m, m->l4_len - sizeof(*th));
+
+	/* send the frame back to where it came from */
+	eth_h = rte_pktmbuf_mtod(m, struct ether_hdr*);
+	ether_addr_copy(&eth_h->s_addr, &eth_addr);
+	ether_addr_copy(&eth_h->d_addr, &eth_h->s_addr);
+	ether_addr_copy(&eth_addr, &eth_h->d_addr);
+
+	th = rte_pktmbuf_mtod_offset(m, struct tcp_hdr*,
+		m->l2_len + m->l3_len);
+	port = th->src_port;
+	th->src_port = th->dst_port;
+	th->dst_port = port;
+	th->tcp_flags = TCP_FLAG_RST | TCP_FLAG_ACK;
+	/* acknowledge the SYN itself */
+	th->recv_ack = rte_cpu_to_be_32(rte_be_to_cpu_32(th->sent_seq) + 1);
+	th->sent_seq = 0;
+	/* header length is now that of a plain TCP header, in 32-bit words */
+	th->data_off &= 0x0f;
+	th->data_off |= (sizeof(*th) / 4) << 4;
+	th->cksum = 0;
+
+	if (pi->tf.type == TLE_V4) {
+		ip_h = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr*,
+			m->l2_len);
+		ip_addr = ip_h->src_addr;
+		ip_h->src_addr = ip_h->dst_addr;
+		ip_h->dst_addr = ip_addr;
+		/* account for the removed TCP options */
+		ip_h->total_length = rte_cpu_to_be_16(
+			rte_be_to_cpu_16(ip_h->total_length) -
+			(m->l4_len - sizeof(*th)));
+		ip_h->hdr_checksum = 0;
+		th->cksum = rte_ipv4_udptcp_cksum(ip_h, th);
+		ip_h->hdr_checksum = rte_ipv4_cksum(ip_h);
+	} else {
+		ipv6_h = rte_pktmbuf_mtod_offset(m, struct ipv6_hdr*,
+			m->l2_len);
+		rte_memcpy(&ipv6_addr, ipv6_h->src_addr,
+			sizeof(struct in6_addr));
+		rte_memcpy(ipv6_h->src_addr, ipv6_h->dst_addr,
+			sizeof(struct in6_addr));
+		rte_memcpy(ipv6_h->dst_addr, &ipv6_addr,
+			sizeof(struct in6_addr));
+		/* account for the removed TCP options */
+		ipv6_h->payload_len = rte_cpu_to_be_16(
+			rte_be_to_cpu_16(ipv6_h->payload_len) -
+			(m->l4_len - sizeof(*th)));
+		th->cksum = rte_ipv6_udptcp_cksum(ipv6_h, th);
+	}
+
+	if (m->pkt_len < ETHER_MIN_LEN) {
+		pad_len = ETHER_MIN_LEN - m->pkt_len;
+		pad = rte_pktmbuf_append(m, pad_len);
+		/* the appended area is not initialised by the mbuf API;
+		 * clear it so that stale buffer contents are not sent out.
+		 * If there is no tailroom the frame is sent unpadded.
+		 */
+		for (i = 0; pad != NULL && i != pad_len; i++)
+			pad[i] = 0;
+	}
+
+	if (send_pkt(s, dev, m) != 0)
+		rte_pktmbuf_free(m);
+	else
+		TCP_INC_STATS(TCP_MIB_OUTRSTS);
+}
static inline uint32_t
rx_syn(struct tle_dev *dev, uint32_t type, uint32_t ts,
@@ -1846,20 +2022,35 @@ rx_syn(struct tle_dev *dev, uint32_t type, uint32_t ts,
uint32_t i, k;
int32_t ret;
- s = rx_obtain_listen_stream(dev, &pi[0], type);
+ s = rx_obtain_listen_stream(dev, &pi[0], type, 0);
if (s == NULL) {
- for (i = 0; i != num; i++) {
- rc[i] = ENOENT;
- rp[i] = mb[i];
+ /* no socket is listening for this SYN; send an RST to refuse the connection */
+ s = TCP_STREAM(get_stream(dev->ctx));
+ if (s != NULL) {
+ sync_refuse(s, dev, &pi[0], mb[0]);
+ put_stream(dev->ctx, &s->s, 0);
+ i = 1;
+ } else {
+ i = 0;
}
- return 0;
+ k = 0;
+ for (; i != num; i++) {
+ rc[k] = ENOENT;
+ rp[k] = mb[i];
+ k++;
+ }
+ return num - k;
}
k = 0;
for (i = 0; i != num; i++) {
-
+ /* check if stream has space to maintain new connection */
+ if (rte_ring_free_count(s->rx.q) == 0 ||
+ (s->s.ctx->streams.nb_free == 0 &&
+ s->s.ctx->streams.nb_cur >= s->s.ctx->prm.max_streams - 1))
+ ret = -ENOSPC;
/* check that this remote is allowed to connect */
- if (rx_check_stream(s, &pi[i]) != 0)
+ else if (rx_check_stream(s, &pi[i]) != 0)
ret = -ENOENT;
else
/* syncokie: reply with <SYN,ACK> */
@@ -1882,43 +2073,34 @@ tle_tcp_rx_bulk(struct tle_dev *dev, struct rte_mbuf *pkt[],
{
struct stbl *st;
struct tle_ctx *ctx;
- uint32_t i, j, k, mt, n, t, ts;
+ uint32_t i, j, k, n, t;
+ uint64_t ts;
union pkt_info pi[num];
union seg_info si[num];
- union {
- uint8_t t[TLE_VNUM];
- uint32_t raw;
- } stu;
+
+ TCP_ADD_STATS(TCP_MIB_INSEGS, num);
ctx = dev->ctx;
ts = tcp_get_tms(ctx->cycles_ms_shift);
st = CTX_TCP_STLB(ctx);
- mt = ((ctx->prm.flags & TLE_CTX_FLAG_ST) == 0);
-
- stu.raw = 0;
/* extract packet info and check the L3/L4 csums */
for (i = 0; i != num; i++) {
get_pkt_info(pkt[i], &pi[i], &si[i]);
-
t = pi[i].tf.type;
- pi[i].csf = check_pkt_csum(pkt[i], pi[i].csf, t, IPPROTO_TCP);
- stu.t[t] = mt;
+ pi[i].csf = check_pkt_csum(pkt[i], t, IPPROTO_TCP);
}
- if (stu.t[TLE_V4] != 0)
- stbl_lock(st, TLE_V4);
- if (stu.t[TLE_V6] != 0)
- stbl_lock(st, TLE_V6);
-
k = 0;
for (i = 0; i != num; i += j) {
-
t = pi[i].tf.type;
/*basic checks for incoming packet */
- if (t >= TLE_VNUM || pi[i].csf != 0 || dev->dp[t] == NULL) {
+ if (t >= TLE_VNUM || pi[i].csf != 0) {
+ TCP_INC_STATS(TCP_MIB_INERRS);
+ if (t < TLE_VNUM)
+ TCP_INC_STATS(TCP_MIB_CSUMERRORS);
rc[k] = EINVAL;
rp[k] = pkt[i];
j = 1;
@@ -1937,11 +2119,6 @@ tle_tcp_rx_bulk(struct tle_dev *dev, struct rte_mbuf *pkt[],
}
}
- if (stu.t[TLE_V4] != 0)
- stbl_unlock(st, TLE_V4);
- if (stu.t[TLE_V6] != 0)
- stbl_unlock(st, TLE_V6);
-
return num - k;
}
@@ -1953,21 +2130,37 @@ tle_tcp_stream_accept(struct tle_stream *ts, struct tle_stream *rs[],
struct tle_tcp_stream *s;
s = TCP_STREAM(ts);
- n = _rte_ring_dequeue_burst(s->rx.q, (void **)rs, num);
- if (n == 0)
- return 0;
- /*
- * if we still have packets to read,
- * then rearm stream RX event.
- */
- if (n == num && rte_ring_count(s->rx.q) != 0) {
- if (tcp_stream_try_acquire(s) > 0 && s->rx.ev != NULL)
- tle_event_raise(s->rx.ev);
+ if (tcp_stream_try_acquire(s) > 0) {
+ if (s->tcb.state != TCP_ST_LISTEN) {
+ tcp_stream_release(s);
+ rte_errno = EINVAL;
+ return 0;
+ }
+
+ n = _rte_ring_dequeue_burst(s->rx.q, (void **)rs, num);
+ if (n == 0)
+ {
+ tcp_stream_release(s);
+ rte_errno = EAGAIN;
+ return 0;
+ }
+
+ /*
+ * if we still have packets to read,
+ * then rearm stream RX event.
+ */
+ if (n == num && rte_ring_count(s->rx.q) != 0) {
+ if (s->rx.ev != NULL)
+ tle_event_raise(s->rx.ev);
+ }
+ tcp_stream_release(s);
+ return n;
+ } else {
tcp_stream_release(s);
+ rte_errno = EINVAL;
+ return 0;
}
-
- return n;
}
uint16_t
@@ -1995,6 +2188,7 @@ tle_tcp_tx_bulk(struct tle_dev *dev, struct rte_mbuf *pkt[], uint16_t num)
stream_drb_free(s, drb + i, j - i);
}
+ TCP_ADD_STATS(TCP_MIB_OUTSEGS, n);
return n;
}
@@ -2010,73 +2204,17 @@ stream_fill_pkt_info(const struct tle_tcp_stream *s, union pkt_info *pi)
pi->tf.type = s->s.type;
}
-static int
-stream_fill_addr(struct tle_tcp_stream *s, const struct sockaddr *addr)
-{
- const struct sockaddr_in *in4;
- const struct sockaddr_in6 *in6;
- const struct tle_dev_param *prm;
- int32_t rc;
-
- rc = 0;
- s->s.pmsk.raw = UINT32_MAX;
-
- /* setup L4 src ports and src address fields. */
- if (s->s.type == TLE_V4) {
- in4 = (const struct sockaddr_in *)addr;
- if (in4->sin_addr.s_addr == INADDR_ANY || in4->sin_port == 0)
- return -EINVAL;
-
- s->s.port.src = in4->sin_port;
- s->s.ipv4.addr.src = in4->sin_addr.s_addr;
- s->s.ipv4.mask.src = INADDR_NONE;
- s->s.ipv4.mask.dst = INADDR_NONE;
-
- } else if (s->s.type == TLE_V6) {
- in6 = (const struct sockaddr_in6 *)addr;
- if (memcmp(&in6->sin6_addr, &tle_ipv6_any,
- sizeof(tle_ipv6_any)) == 0 ||
- in6->sin6_port == 0)
- return -EINVAL;
-
- s->s.port.src = in6->sin6_port;
- rte_memcpy(&s->s.ipv6.addr.src, &in6->sin6_addr,
- sizeof(s->s.ipv6.addr.src));
- rte_memcpy(&s->s.ipv6.mask.src, &tle_ipv6_none,
- sizeof(s->s.ipv6.mask.src));
- rte_memcpy(&s->s.ipv6.mask.dst, &tle_ipv6_none,
- sizeof(s->s.ipv6.mask.dst));
- }
-
- /* setup the destination device. */
- rc = stream_fill_dest(s);
- if (rc != 0)
- return rc;
-
- /* setup L4 dst address from device param */
- prm = &s->tx.dst.dev->prm;
- if (s->s.type == TLE_V4) {
- if (s->s.ipv4.addr.dst == INADDR_ANY)
- s->s.ipv4.addr.dst = prm->local_addr4.s_addr;
- } else if (memcmp(&s->s.ipv6.addr.dst, &tle_ipv6_any,
- sizeof(tle_ipv6_any)) == 0)
- memcpy(&s->s.ipv6.addr.dst, &prm->local_addr6,
- sizeof(s->s.ipv6.addr.dst));
-
- return rc;
-}
-
static inline int
-tx_syn(struct tle_tcp_stream *s, const struct sockaddr *addr)
+tx_syn(struct tle_tcp_stream *s)
{
int32_t rc;
- uint32_t tms, seq;
+ uint32_t seq;
+ uint64_t tms;
union pkt_info pi;
struct stbl *st;
struct stbl_entry *se;
- /* fill stream address */
- rc = stream_fill_addr(s, addr);
+ rc = stream_fill_dest(s);
if (rc != 0)
return rc;
@@ -2107,7 +2245,7 @@ tx_syn(struct tle_tcp_stream *s, const struct sockaddr *addr)
/* add the stream in stream table */
st = CTX_TCP_STLB(s->s.ctx);
- se = stbl_add_stream_lock(st, s);
+ se = stbl_add_stream(st, &s->s);
if (se == NULL)
return -ENOBUFS;
s->ste = se;
@@ -2115,6 +2253,7 @@ tx_syn(struct tle_tcp_stream *s, const struct sockaddr *addr)
/* put stream into the to-send queue */
txs_enqueue(s->s.ctx, s);
+ TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
return 0;
}
@@ -2147,7 +2286,7 @@ tle_tcp_stream_connect(struct tle_stream *ts, const struct sockaddr *addr)
/* fill stream, prepare and transmit syn pkt */
s->tcb.uop |= TCP_OP_CONNECT;
- rc = tx_syn(s, addr);
+ rc = tx_syn(s);
tcp_stream_release(s);
/* error happened, do a cleanup */
@@ -2160,13 +2299,29 @@ tle_tcp_stream_connect(struct tle_stream *ts, const struct sockaddr *addr)
uint16_t
tle_tcp_stream_recv(struct tle_stream *ts, struct rte_mbuf *pkt[], uint16_t num)
{
- uint32_t n;
+ uint32_t n, i;
+ uint32_t free_slots;
struct tle_tcp_stream *s;
s = TCP_STREAM(ts);
+
+ free_slots = rte_ring_free_count(s->rx.q);
+
n = _rte_ring_mcs_dequeue_burst(s->rx.q, (void **)pkt, num);
- if (n == 0)
+ if (n == 0) {
+ if (s->tcb.err != 0) {
+ rte_errno = s->tcb.err;
+ } else {
+ rte_errno = EAGAIN;
+ }
return 0;
+ }
+
+ for (i = 0; i < n; ++i)
+ s->tcb.rcv.cpy += rte_pktmbuf_pkt_len(pkt[i]);
+
+ /* update receive window with left recv buffer*/
+ s->tcb.rcv.wnd = calc_rx_wnd(s, s->tcb.rcv.wscale);
/*
* if we still have packets to read,
@@ -2176,28 +2331,99 @@ tle_tcp_stream_recv(struct tle_stream *ts, struct rte_mbuf *pkt[], uint16_t num)
if (tcp_stream_try_acquire(s) > 0 && s->rx.ev != NULL)
tle_event_raise(s->rx.ev);
tcp_stream_release(s);
+ /* if we have received fin, no more data will come, raise err event. */
+ } else if (s->tcb.rcv.frs.on == 2) {
+ if (tcp_stream_try_acquire(s) > 0 && s->err.ev != NULL)
+ tle_event_raise(s->err.ev);
+ tcp_stream_release(s);
+ }
+
+ /* update recv win to the remote */
+ if (free_slots < RECV_WIN_NOTIFY_THRESH &&
+ rte_ring_free_count(s->rx.q) >= RECV_WIN_NOTIFY_THRESH) {
+ s->tcb.snd.update_rcv = true;
+ txs_enqueue(s->s.ctx, s);
}
return n;
}
+/*
+ * Report how many received bytes are still waiting to be read: the gap
+ * between tcb.rcv.nxt and tcb.rcv.cpy (the counter the recv/recvmsg paths
+ * advance by the number of bytes they hand to the caller).
+ * The gap is computed in 32 bits while the return type is 16 bits wide, so
+ * the result saturates at UINT16_MAX instead of wrapping; a wrapped value
+ * could read as 0 while data is still pending.
+ */
+uint16_t
+tle_tcp_stream_inq(struct tle_stream *ts)
+{
+ struct tle_tcp_stream *s;
+ uint32_t pending;
+
+ s = TCP_STREAM(ts);
+ pending = s->tcb.rcv.nxt - s->tcb.rcv.cpy;
+ return (uint16_t)RTE_MIN(pending, (uint32_t)UINT16_MAX);
+}
+
+#define DECONST(type, var) ((type)(uintptr_t)(const void *)(var))
+
+/*
+ * Scatter-read wrapper: packs the caller's iovec array into an otherwise
+ * zeroed msghdr and delegates to tle_tcp_stream_recvmsg().
+ */
+ssize_t
+tle_tcp_stream_readv(struct tle_stream *ts, const struct iovec *iov, int iovcnt)
+{
+ /*
+ * msg_iov is not const-qualified, so the qualifier is stripped here;
+ * tle_tcp_stream_recvmsg() reads it back through a const pointer.
+ */
+ struct msghdr hdr = {
+ .msg_iov = DECONST(struct iovec *, iov),
+ .msg_iovlen = iovcnt,
+ };
+
+ return tle_tcp_stream_recvmsg(ts, &hdr);
+}
+
ssize_t
-tle_tcp_stream_readv(struct tle_stream *ts, const struct iovec *iov,
- int iovcnt)
+tle_tcp_stream_recvmsg(struct tle_stream *ts, struct msghdr *msg)
{
+ size_t sz;
int32_t i;
uint32_t mn, n, tn;
- size_t sz;
+ uint32_t free_slots;
struct tle_tcp_stream *s;
struct iovec iv;
struct rxq_objs mo[2];
+ struct sockaddr_in *addr;
+ struct sockaddr_in6 *addr6;
+ const struct iovec *iov = msg->msg_iov;
+ int iovcnt = msg->msg_iovlen;
s = TCP_STREAM(ts);
+ free_slots = rte_ring_free_count(s->rx.q);
+
/* get group of packets */
mn = tcp_rxq_get_objs(s, mo);
- if (mn == 0)
- return 0;
+ if (mn == 0) {
+ if (s->tcb.err != 0)
+ rte_errno = s->tcb.err;
+ else
+ rte_errno = EAGAIN;
+ return -1;
+ }
+
+ if (!ts->option.timestamp)
+ ts->timestamp = mo[0].mb[0]->timestamp;
+
+ if (msg->msg_control != NULL) {
+ if (ts->option.timestamp)
+ tle_set_timestamp(msg, mo[0].mb[0]);
+ else
+ msg->msg_controllen = 0;
+ }
+
+ /* fill in the sender address from the stream's src address/port fields */
+ if (msg->msg_name != NULL) {
+ if (s->s.type == TLE_V4) {
+ addr = (struct sockaddr_in*)msg->msg_name;
+ addr->sin_family = AF_INET;
+ addr->sin_addr.s_addr = s->s.ipv4.addr.src;
+ addr->sin_port = s->s.port.src;
+ msg->msg_namelen = sizeof(struct sockaddr_in);
+ } else {
+ addr6 = (struct sockaddr_in6*)msg->msg_name;
+ addr6->sin6_family = AF_INET6;
+ /* copy only the address itself: sizing the copy by the whole
+ * sockaddr_in6 read past the source field and wrote past
+ * sin6_addr in the caller's buffer
+ */
+ rte_memcpy(&addr6->sin6_addr, &s->s.ipv6.addr.src,
+ sizeof(addr6->sin6_addr));
+ addr6->sin6_port = s->s.port.src;
+ msg->msg_namelen = sizeof(struct sockaddr_in6);
+ }
+ }
sz = 0;
n = 0;
@@ -2229,6 +2455,8 @@ tle_tcp_stream_readv(struct tle_stream *ts, const struct iovec *iov,
}
tcp_rxq_consume(s, tn);
+ /* update receive window with left recv buffer*/
+ s->tcb.rcv.wnd = calc_rx_wnd(s, s->tcb.rcv.wscale);
/*
* if we still have packets to read,
@@ -2238,6 +2466,20 @@ tle_tcp_stream_readv(struct tle_stream *ts, const struct iovec *iov,
if (tcp_stream_try_acquire(s) > 0 && s->rx.ev != NULL)
tle_event_raise(s->rx.ev);
tcp_stream_release(s);
+ /* if we have received fin, no more data will come, raise err event. */
+ } else if (s->tcb.rcv.frs.on == 2) {
+ if (tcp_stream_try_acquire(s) > 0 && s->err.ev != NULL)
+ tle_event_raise(s->err.ev);
+ tcp_stream_release(s);
+ }
+
+ s->tcb.rcv.cpy += sz;
+
+ /* update recv win to the remote */
+ if (free_slots < RECV_WIN_NOTIFY_THRESH &&
+ rte_ring_free_count(s->rx.q) >= RECV_WIN_NOTIFY_THRESH) {
+ s->tcb.snd.update_rcv = true;
+ txs_enqueue(s->s.ctx, s);
}
return sz;
@@ -2263,48 +2505,35 @@ tx_segments(struct tle_tcp_stream *s, uint64_t ol_flags,
if (i == num) {
/* queue packets for further transmission. */
rc = _rte_ring_enqueue_bulk(s->tx.q, (void **)segs, num);
- if (rc != 0)
+ if (rc != 0) {
+ rc = -EAGAIN;
free_mbufs(segs, num);
+ }
}
return rc;
}
-uint16_t
-tle_tcp_stream_send(struct tle_stream *ts, struct rte_mbuf *pkt[], uint16_t num)
+static inline uint16_t
+stream_send(struct tle_tcp_stream *s, struct rte_mbuf *pkt[],
+ uint16_t num, uint16_t mss, uint64_t ol_flags)
{
- uint32_t i, j, k, mss, n, state;
+ uint16_t i, j, k;
int32_t rc;
- uint64_t ol_flags;
- struct tle_tcp_stream *s;
+ uint32_t n, free_slots;
struct rte_mbuf *segs[TCP_MAX_PKT_SEG];
-
- s = TCP_STREAM(ts);
-
- /* mark stream as not closable. */
- if (tcp_stream_acquire(s) < 0) {
- rte_errno = EAGAIN;
- return 0;
- }
-
- state = s->tcb.state;
- if (state != TCP_ST_ESTABLISHED && state != TCP_ST_CLOSE_WAIT) {
- rte_errno = ENOTCONN;
- tcp_stream_release(s);
- return 0;
- }
-
- mss = s->tcb.snd.mss;
- ol_flags = s->tx.dst.ol_flags;
+ int32_t pkt_len;
k = 0;
rc = 0;
+ pkt_len = 0;
while (k != num) {
/* prepare and check for TX */
for (i = k; i != num; i++) {
if (pkt[i]->pkt_len > mss ||
pkt[i]->nb_segs > TCP_MAX_PKT_SEG)
break;
+ pkt_len += pkt[i]->pkt_len;
rc = tcp_fill_mbuf(pkt[i], s, &s->tx.dst, ol_flags,
s->s.port, 0, TCP_FLAG_ACK, 0, 0);
if (rc != 0)
@@ -2328,6 +2557,7 @@ tle_tcp_stream_send(struct tle_stream *ts, struct rte_mbuf *pkt[], uint16_t num)
pkt[j]->l3_len +
pkt[j]->l4_len);
pkt[j]->ol_flags &= ol_flags;
+ pkt_len -= pkt[j]->pkt_len;
}
break;
}
@@ -2339,8 +2569,10 @@ tle_tcp_stream_send(struct tle_stream *ts, struct rte_mbuf *pkt[], uint16_t num)
/* segment large packet and enqueue for sending */
} else if (i != num) {
+ free_slots = rte_ring_free_count(s->tx.q);
+ free_slots = RTE_MIN(free_slots, RTE_DIM(segs));
/* segment the packet. */
- rc = tcp_segmentation(pkt[i], segs, RTE_DIM(segs),
+ rc = tcp_segmentation(pkt[i], segs, free_slots,
&s->tx.dst, mss);
if (rc < 0) {
rte_errno = -rc;
@@ -2351,19 +2583,161 @@ tle_tcp_stream_send(struct tle_stream *ts, struct rte_mbuf *pkt[], uint16_t num)
if (rc == 0) {
+ /* account the length first: the mbuf must not be read once freed */
+ pkt_len += pkt[i]->pkt_len;
/* free the large mbuf */
rte_pktmbuf_free(pkt[i]);
/* set the mbuf as consumed */
k++;
- } else
+ } else {
/* no space left in tx queue */
+ RTE_VERIFY(0);
break;
+ }
}
}
+ s->tcb.snd.waitlen += pkt_len;
+ return k;
+}
+/*
+ * TSO send path: merge runs of consecutive single-segment mbufs from pkt[]
+ * into one chained mbuf and enqueue only the chain head on s->tx.q.
+ * A run is cut when adding the next mbuf would exceed TCP_MAX_PKT_SEG
+ * segments or 65535 - RESERVE_HEADER_LEN bytes. An mbuf that cannot start
+ * a run on its own is handed to stream_send() instead.
+ * Returns the number of entries of pkt[] that were consumed.
+ * Panics (rte_panic) on an already-chained input mbuf or when
+ * tcp_fill_mbuf() fails on the chain head.
+ */
+static inline uint16_t
+stream_send_tso(struct tle_tcp_stream *s, struct rte_mbuf *pkt[],
+ uint16_t num, uint16_t mss, uint64_t ol_flags)
+{
+ uint16_t i, k, nb_segs;
+ int32_t rc, pkt_len;
+ uint64_t ol_flags1;
+ struct rte_mbuf *pre_tail;
+
+ k = 0; /* index of the first mbuf not yet consumed */
+ rc = 0;
+ while (k != num) {
+ /* Make sure there is at least one slot available */
+ if (rte_ring_free_count(s->tx.q) == 0)
+ break;
+
+ /* prepare and check for TX */
+ nb_segs = 0;
+ pkt_len = 0;
+ pre_tail = NULL; /* last segment of the chain built so far */
+ for (i = k; i != num; i++) {
+ if (pkt[i]->nb_segs != 1)
+ rte_panic("chained mbuf: %p\n", pkt[i]);
+ /* We shall consider cwnd and snd wnd when limit len */
+ if (nb_segs + pkt[i]->nb_segs <= TCP_MAX_PKT_SEG &&
+ pkt_len + pkt[i]->pkt_len <= 65535 - RESERVE_HEADER_LEN) {
+ nb_segs += pkt[i]->nb_segs;
+ pkt_len += pkt[i]->pkt_len;
+ if (pre_tail) /* link pkt[i] behind the previous mbuf of this run */
+ pre_tail->next = pkt[i];
+ pre_tail = rte_pktmbuf_lastseg(pkt[i]);
+ } else {
+ /* enqueue this one now */
+ break;
+ }
+ }
+ /* i == k means not even pkt[k] alone passed the limits above */
+ if (unlikely(i == k)) {
+ /* pkt[k] is a too big packet, now we fall back to
+ * non-tso send; we can optimize it later by
+ * splitting the mbuf.
+ */
+ if (stream_send(s, &pkt[k], 1, mss, ol_flags) == 1) {
+ k++;
+ continue;
+ } else
+ break;
+ }
+ /* the chain head carries the totals for the whole run pkt[k..i) */
+ pkt[k]->nb_segs = nb_segs;
+ pkt[k]->pkt_len = pkt_len;
+ /* request TCP segmentation offload only when the run exceeds one MSS */
+ ol_flags1 = ol_flags;
+ if (pkt_len > mss)
+ ol_flags1 |= PKT_TX_TCP_SEG;
+
+ rc = tcp_fill_mbuf(pkt[k], s, &s->tx.dst, ol_flags1,
+ s->s.port, 0, TCP_FLAG_ACK, 0, 0);
+ if (rc != 0) /* hard to recover */
+ rte_panic("failed to fill mbuf: %p\n", pkt[k]);
+
+ /* correct mss */
+ pkt[k]->tso_segsz = mss;
+
+ s->tcb.snd.waitlen += pkt_len; /* payload bytes of this run, headers excluded */
+ /* We already make sure there is at least one slot */
+ if (_rte_ring_enqueue_burst(s->tx.q, (void **)pkt + k, 1) < 1)
+ RTE_VERIFY(0);
+ /* the whole run pkt[k..i) has been consumed by the single enqueue above */
+ k = i;
+ }
+
+ return k;
+}
+
+uint16_t
+tle_tcp_stream_send(struct tle_stream *ts, struct rte_mbuf *pkt[], uint16_t num)
+{
+ uint16_t k, mss, state;
+ uint64_t ol_flags;
+ struct tle_tcp_stream *s;
+
+ s = TCP_STREAM(ts);
+
+ if (s->tcb.err != 0) {
+ rte_errno = s->tcb.err;
+ return 0;
+ }
+
+ /* mark stream as not closable. */
+ if (tcp_stream_acquire(s) < 0) {
+ rte_errno = EAGAIN;
+ return 0;
+ }
+
+ state = s->tcb.state;
+ switch (state) {
+ case TCP_ST_ESTABLISHED:
+ case TCP_ST_CLOSE_WAIT:
+ break;
+ case TCP_ST_FIN_WAIT_1:
+ case TCP_ST_FIN_WAIT_2:
+ case TCP_ST_CLOSING:
+ case TCP_ST_LAST_ACK:
+ rte_errno = EPIPE;
+ tcp_stream_release(s);
+ return 0;
+ default:
+ rte_errno = ENOTCONN;
+ tcp_stream_release(s);
+ return 0;
+ }
+
+ mss = s->tcb.snd.mss;
+
+ ol_flags = s->tx.dst.ol_flags;
+
+ /* Some reference number on the case:
+ * "<netperf with uss> - tap - <kernel stack> - <netserver>"
+ * ~2Gbps with tso disabled;
+ * ~16Gbps with tso enabled.
+ */
+ if (rte_ring_free_count(s->tx.q) == 0) {
+ /* Block send may try without waiting for tx event (raised by acked
+ * data), so here we will still put this stream for further process
+ */
+ txs_enqueue(s->s.ctx, s);
+ rte_errno = EAGAIN;
+ k = 0;
+ } else if (s->tx.dst.dev->prm.tx_offload & DEV_TX_OFFLOAD_TCP_TSO)
+ k = stream_send_tso(s, pkt, num, mss, ol_flags);
+ else
+ k = stream_send(s, pkt, num, mss, ol_flags);
+
/* notify BE about more data to send */
if (k != 0)
txs_enqueue(s->s.ctx, s);
+
/* if possible, re-arm stream write event. */
- if (rte_ring_free_count(s->tx.q) != 0 && s->tx.ev != NULL)
+ if (rte_ring_free_count(s->tx.q) && s->tx.ev != NULL && k == num)
tle_event_raise(s->tx.ev);
tcp_stream_release(s);
@@ -2382,9 +2756,15 @@ tle_tcp_stream_writev(struct tle_stream *ts, struct rte_mempool *mp,
struct tle_tcp_stream *s;
struct iovec iv;
struct rte_mbuf *mb[2 * MAX_PKT_BURST];
+ uint16_t mss;
s = TCP_STREAM(ts);
+ if (s->tcb.err != 0) {
+ rte_errno = s->tcb.err;
+ return -1;
+ }
+
/* mark stream as not closable. */
if (tcp_stream_acquire(s) < 0) {
rte_errno = EAGAIN;
@@ -2392,7 +2772,18 @@ tle_tcp_stream_writev(struct tle_stream *ts, struct rte_mempool *mp,
}
state = s->tcb.state;
- if (state != TCP_ST_ESTABLISHED && state != TCP_ST_CLOSE_WAIT) {
+ switch (state) {
+ case TCP_ST_ESTABLISHED:
+ case TCP_ST_CLOSE_WAIT:
+ break;
+ case TCP_ST_FIN_WAIT_1:
+ case TCP_ST_FIN_WAIT_2:
+ case TCP_ST_CLOSING:
+ case TCP_ST_LAST_ACK:
+ rte_errno = EPIPE;
+ tcp_stream_release(s);
+ return -1;
+ default:
rte_errno = ENOTCONN;
tcp_stream_release(s);
return -1;
@@ -2403,11 +2794,24 @@ tle_tcp_stream_writev(struct tle_stream *ts, struct rte_mempool *mp,
for (i = 0; i != iovcnt; i++)
tsz += iov[i].iov_len;
+ if (tsz == 0) {
+ tcp_stream_release(s);
+ return 0;
+ }
+
slen = rte_pktmbuf_data_room_size(mp);
- slen = RTE_MIN(slen, s->tcb.snd.mss);
+ mss = s->tcb.snd.mss;
+
+ slen = RTE_MIN(slen, mss);
num = (tsz + slen - 1) / slen;
n = rte_ring_free_count(s->tx.q);
+
+ if (n == 0) {
+ tcp_stream_release(s);
+ return 0;
+ }
+
num = RTE_MIN(num, n);
n = RTE_MIN(num, RTE_DIM(mb));
@@ -2451,7 +2855,6 @@ tle_tcp_stream_writev(struct tle_stream *ts, struct rte_mempool *mp,
k = 0;
if (k != j) {
-
/* free pkts that were not enqueued */
free_mbufs(mb + k, j - k);
@@ -2466,14 +2869,16 @@ tle_tcp_stream_writev(struct tle_stream *ts, struct rte_mempool *mp,
}
}
- if (k != 0) {
-
+ if (k != 0) {
/* notify BE about more data to send */
txs_enqueue(s->s.ctx, s);
/* if possible, re-arm stream write event. */
if (rte_ring_free_count(s->tx.q) != 0 && s->tx.ev != NULL)
tle_event_raise(s->tx.ev);
+ } else {
+ rte_errno = EAGAIN;
+ sz = -1;
}
tcp_stream_release(s);
@@ -2485,7 +2890,7 @@ static inline void
tx_data_fin(struct tle_tcp_stream *s, uint32_t tms, uint32_t state)
{
/* try to send some data */
- tx_nxt_data(s, tms);
+ uint32_t tn = tx_nxt_data(s, tms);
/* we also have to send a FIN */
if (state != TCP_ST_ESTABLISHED &&
@@ -2495,6 +2900,13 @@ tx_data_fin(struct tle_tcp_stream *s, uint32_t tms, uint32_t state)
s->tcb.snd.fss = ++s->tcb.snd.nxt;
send_ack(s, tms, TCP_FLAG_FIN | TCP_FLAG_ACK);
}
+
+ if (s->tcb.snd.update_rcv) {
+ if (tn == 0)
+ send_ack(s, tms, TCP_FLAG_ACK); /* update recv window */
+
+ s->tcb.snd.update_rcv = false;
+ }
}
static inline void
@@ -2507,7 +2919,7 @@ tx_stream(struct tle_tcp_stream *s, uint32_t tms)
if (state == TCP_ST_SYN_SENT) {
/* send the SYN, start the rto timer */
send_ack(s, tms, TCP_FLAG_SYN);
- timer_start(s);
+ timer_start(s, TIMER_RTO, s->tcb.snd.rto);
} else if (state >= TCP_ST_ESTABLISHED && state <= TCP_ST_LAST_ACK) {
@@ -2515,7 +2927,7 @@ tx_stream(struct tle_tcp_stream *s, uint32_t tms)
/* start RTO timer. */
if (s->tcb.snd.nxt != s->tcb.snd.una)
- timer_start(s);
+ timer_start(s, TIMER_RTO, s->tcb.snd.rto);
}
}
@@ -2544,7 +2956,6 @@ rto_stream(struct tle_tcp_stream *s, uint32_t tms)
if (s->tcb.snd.nb_retx < s->tcb.snd.nb_retm) {
if (state >= TCP_ST_ESTABLISHED && state <= TCP_ST_LAST_ACK) {
-
/* update SND.CWD and SND.SSTHRESH */
rto_cwnd_update(&s->tcb);
@@ -2570,50 +2981,131 @@ rto_stream(struct tle_tcp_stream *s, uint32_t tms)
* than one SYN or SYN/ACK retransmissions or true loss
* detection has been made.
*/
- if (s->tcb.snd.nb_retx != 0)
+ if (s->tcb.snd.nb_retx != 0) {
s->tcb.snd.cwnd = s->tcb.snd.mss;
+ CWND_INFO("synsent", s->tcb.snd.cwnd);
+ }
send_ack(s, tms, TCP_FLAG_SYN);
-
- } else if (state == TCP_ST_TIME_WAIT) {
- stream_term(s);
+ TCP_INC_STATS(TCP_MIB_RETRANSSEGS);
}
/* RFC6298:5.5 back off the timer */
s->tcb.snd.rto = rto_roundup(2 * s->tcb.snd.rto);
s->tcb.snd.nb_retx++;
- timer_restart(s);
+ timer_restart(s, TIMER_RTO, s->tcb.snd.rto);
} else {
- send_rst(s, s->tcb.snd.nxt);
+ if (state == TCP_ST_SYN_SENT) {
+ if (stream_fill_dest(s) != 0 ||
+ is_broadcast_ether_addr((struct ether_addr *)s->tx.dst.hdr))
+ s->tcb.err = EHOSTUNREACH;
+ else
+ /* TODO: do we send rst on this */
+ s->tcb.err = ENOTCONN;
+ } else
+ send_rst(s, s->tcb.snd.una);
stream_term(s);
}
}
+/*
+ * (Re)program the keepalive timer of a stream from its socket options.
+ * Keepalive off: stop the timer and clear the probe counter.
+ * Keepalive on: only an ESTABLISHED stream is touched; the timer is reset
+ * to keepidle seconds while no probe has been sent, keepintvl otherwise.
+ */
+static inline void
+set_keepalive_timer(struct tle_tcp_stream *s)
+{
+ if (s->s.option.keepalive == 0) {
+ timer_stop(s, TIMER_KEEPALIVE);
+ s->tcb.snd.nb_keepalive = 0;
+ return;
+ }
+
+ /* other states leave the timer as it is */
+ if (s->tcb.state != TCP_ST_ESTABLISHED)
+ return;
+
+ if (s->tcb.snd.nb_keepalive != 0) {
+ /* probing already started: use the inter-probe interval */
+ timer_reset(s, TIMER_KEEPALIVE,
+ s->s.option.keepintvl * MS_PER_S);
+ return;
+ }
+
+ /* no probe sent yet: wait for the idle period */
+ timer_reset(s, TIMER_KEEPALIVE,
+ s->s.option.keepidle * MS_PER_S);
+}
+
int
tle_tcp_process(struct tle_ctx *ctx, uint32_t num)
{
- uint32_t i, k, tms;
+ uint8_t type;
+ uint32_t i, k;
+ uint64_t tms;
struct sdr *dr;
struct tle_timer_wheel *tw;
struct tle_stream *p;
struct tle_tcp_stream *s, *rs[num];
- /* process streams with RTO exipred */
+ tms = tcp_get_tms(ctx->cycles_ms_shift);
+ /* process streams with expired timers (RTO, delayed ACK, keepalive) */
tw = CTX_TCP_TMWHL(ctx);
- tms = tcp_get_tms(ctx->cycles_ms_shift);
tle_timer_expire(tw, tms);
k = tle_timer_get_expired_bulk(tw, (void **)rs, RTE_DIM(rs));
for (i = 0; i != k; i++) {
-
- s = rs[i];
- s->timer.handle = NULL;
- if (tcp_stream_try_acquire(s) > 0)
- rto_stream(s, tms);
- tcp_stream_release(s);
+ s = timer_stream(rs[i]);
+ type = timer_type(rs[i]);
+ s->timer.handle[type] = NULL;
+
+ switch (type) {
+ case TIMER_RTO:
+ /* FE cannot change stream into below states,
+ * that's why we don't put it into lock
+ */
+ if (s->tcb.state == TCP_ST_TIME_WAIT ||
+ s->tcb.state == TCP_ST_FIN_WAIT_2) {
+ tcp_stream_down(s);
+ stream_term(s);
+ tcp_stream_up(s);
+ } else if (tcp_stream_acquire(s) > 0) {
+ /*
+ * The stream may be closed by the frontend concurrently.
+ * If it has already been closed, there is nothing left
+ * to retransmit.
+ */
+ if (s->tcb.state != TCP_ST_CLOSED)
+ rto_stream(s, tms);
+ tcp_stream_release(s);
+ }
+ /* Failed to acquire the lock? The FE is shutting down or
+ * closing this stream, so either a FIN or an RST needs to be
+ * sent; that means the stream is in tsq and will be processed later.
+ */
+ break;
+ case TIMER_DACK:
+ if (rte_atomic32_read(&s->tx.arm) == 0 &&
+ s->tcb.rcv.nxt != s->tcb.snd.ack &&
+ tcp_stream_acquire(s) > 0) {
+ s->s.option.tcpquickack = 8;
+ send_ack(s, tms, TCP_FLAG_ACK);
+ tcp_stream_release(s);
+ }
+ break;
+ case TIMER_KEEPALIVE:
+ if (s->tcb.snd.nb_keepalive < s->s.option.keepcnt) {
+ if (tcp_stream_try_acquire(s) > 0 &&
+ s->tcb.state == TCP_ST_ESTABLISHED) {
+ send_keepalive(s);
+ s->tcb.snd.nb_keepalive++;
+ timer_start(s, TIMER_KEEPALIVE,
+ s->s.option.keepintvl * MS_PER_S);
+ }
+ tcp_stream_release(s);
+ } else {
+ tcp_stream_down(s);
+ send_rst(s, s->tcb.snd.nxt);
+ s->tcb.err = ETIMEDOUT;
+ stream_term(s);
+ tcp_stream_up(s);
+ }
+ break;
+ default:
+ rte_panic("Invalid timer type: %d\n", type);
+ }
}
/* process streams from to-send queue */
@@ -2621,20 +3113,63 @@ tle_tcp_process(struct tle_ctx *ctx, uint32_t num)
k = txs_dequeue_bulk(ctx, rs, RTE_DIM(rs));
for (i = 0; i != k; i++) {
-
s = rs[i];
- rte_atomic32_set(&s->tx.arm, 0);
- if (tcp_stream_try_acquire(s) > 0)
+ if (s->tcb.uop & TCP_OP_RESET) {
+ /* already put into death row in close() */
+ send_rst(s, s->tcb.snd.nxt);
+ continue;
+ }
+
+ if (tcp_stream_acquire(s) > 0) {
+ if (s->tcb.uop & TCP_OP_KEEPALIVE) {
+ s->tcb.uop &= ~TCP_OP_KEEPALIVE;
+ set_keepalive_timer(s);
+ }
+
+ if (s->tcb.state == TCP_ST_FIN_WAIT_2 &&
+ s->tcb.uop & TCP_OP_CLOSE) {
+ /* This could happen after:
+ * 1) shutdown;
+ * 2) FIN sent;
+ * 3) ack received;
+ * 4) close;
+ */
+ timer_start(s, TIMER_RTO, s->tcb.snd.rto_fw);
+ tcp_stream_release(s);
+ continue;
+ }
+
+ if (s->tcb.state == TCP_ST_ESTABLISHED &&
+ s->s.option.tcpcork) {
+ if (s->tcb.snd.cork_ts == 0)
+ s->tcb.snd.cork_ts = (uint32_t)tms;
+
+ if (s->tcb.snd.waitlen < s->tcb.snd.mss &&
+ (uint32_t)tms - s->tcb.snd.cork_ts < 200) {
+ txs_enqueue(s->s.ctx, s);
+ tcp_stream_release(s);
+ continue;
+ }
+
+ s->tcb.snd.cork_ts = 0;
+ }
+
tx_stream(s, tms);
- else
+ tcp_stream_release(s);
+ continue;
+ }
+
+ if (s->tcb.state != TCP_ST_CLOSED)
txs_enqueue(s->s.ctx, s);
- tcp_stream_release(s);
+
+ /* TCP_ST_CLOSED? See close with TCP_ST_CLOSED state */
}
/* collect streams to close from the death row */
dr = CTX_TCP_SDR(ctx);
+ rte_spinlock_lock(&dr->lock);
for (k = 0, p = STAILQ_FIRST(&dr->be);
k != num && p != NULL;
k++, p = STAILQ_NEXT(p, link))
@@ -2645,9 +3180,21 @@ tle_tcp_process(struct tle_ctx *ctx, uint32_t num)
else
STAILQ_FIRST(&dr->be) = p;
+ /* if stream still in tsq, wait one more round */
+ for (i = 0; i != k; i++) {
+ if (rte_atomic32_read(&rs[i]->tx.arm) > 0) {
+ STAILQ_INSERT_TAIL(&dr->be, &rs[i]->s, link);
+ rs[i] = NULL;
+ }
+ }
+
+ rte_spinlock_unlock(&dr->lock);
+
/* cleanup closed streams */
for (i = 0; i != k; i++) {
s = rs[i];
+ if (s == NULL)
+ continue;
tcp_stream_down(s);
tcp_stream_reset(ctx, s);
}
diff --git a/lib/libtle_l4p/tcp_rxtx.h b/lib/libtle_l4p/tcp_rxtx.h
new file mode 100644
index 0000000..e7f8e3e
--- /dev/null
+++ b/lib/libtle_l4p/tcp_rxtx.h
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2016-2017 Intel Corporation.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _TCP_RXTX_H_
+#define _TCP_RXTX_H_
+
+#include "tcp_stream.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Number of segments needed to carry plen bytes with the given mss:
+ * ceil(plen / mss), but never less than 1 (a payload that fits into one
+ * mss, including an empty one, counts as a single segment).
+ */
+static inline uint32_t
+calc_seg_cnt(uint32_t plen, uint32_t mss)
+{
+ return (plen <= mss) ? 1 : (plen + mss - 1) / mss;
+}
+
+/*
+ * Reserve num consecutive IP packet ids from the device counter selected by
+ * type and return the first id of the reserved range.
+ * st == 0: the counter is advanced with one atomic add-and-return.
+ * st != 0: the counter is advanced with a separate read and set
+ * (NOTE(review): presumably only valid without concurrent users - confirm).
+ */
+static inline uint32_t
+get_ip_pid(struct tle_dev *dev, uint32_t num, uint32_t type, uint32_t st)
+{
+ rte_atomic32_t *cnt = &dev->tx.packet_id[type];
+ uint32_t first;
+
+ if (st != 0) {
+ first = rte_atomic32_read(cnt);
+ rte_atomic32_set(cnt, first + num);
+ return first;
+ }
+
+ /* add_return yields the value after the add; step back to the start */
+ first = rte_atomic32_add_return(cnt, num);
+ return first - num;
+}
+/*
+ * Fill the TCP header at l4h, plus the options that directly follow it.
+ * l4h   - header to write; option bytes are written at l4h + 1.
+ * tcb   - supplies rcv.nxt (ack number), rcv.wnd/rcv.wscale (window) and
+ *         the option state (so, snd.ts, rcv.ts).
+ * port  - stream port pair; note that port.dst becomes the header's source
+ *         port and port.src its destination port.
+ * seq   - sequence number, host byte order.
+ * hlen  - total TCP header length in bytes, options included.
+ * flags - TCP flags byte to send.
+ * The checksum and urgent pointer are left as 0.
+ */
+static inline void
+fill_tcph(struct tcp_hdr *l4h, const struct tcb *tcb, union l4_ports port,
+ uint32_t seq, uint8_t hlen, uint8_t flags)
+{
+ uint16_t wnd;
+
+ l4h->src_port = port.dst;
+ l4h->dst_port = port.src;
+ /* SYN: window sent unscaled, capped at 16 bits; otherwise shifted by wscale */
+ wnd = (flags & TCP_FLAG_SYN) ?
+ RTE_MIN(tcb->rcv.wnd, (uint32_t)UINT16_MAX) :
+ tcb->rcv.wnd >> tcb->rcv.wscale;
+
+ /* ??? use sse shuffle to hton all remaining 16 bytes at once. ??? */
+ l4h->sent_seq = rte_cpu_to_be_32(seq);
+ l4h->recv_ack = rte_cpu_to_be_32(tcb->rcv.nxt);
+ l4h->data_off = hlen / TCP_DATA_ALIGN << TCP_DATA_OFFSET;
+ l4h->tcp_flags = flags;
+ l4h->rx_win = rte_cpu_to_be_16(wnd);
+ l4h->cksum = 0;
+ l4h->tcp_urp = 0;
+ /* options: SYN options on a SYN, else timestamps on non-RST when so.ts is set */
+ if (flags & TCP_FLAG_SYN)
+ fill_syn_opts(l4h + 1, &tcb->so);
+ else if ((flags & TCP_FLAG_RST) == 0 && tcb->so.ts.raw != 0)
+ fill_tms_opts(l4h + 1, tcb->snd.ts, tcb->rcv.ts);
+}
+/*
+ * Prepend L2/L3/TCP headers to mbuf m and fill them in.
+ * m        - packet; its pkt_len on entry is taken as the payload length.
+ * s        - stream whose tcb drives the TCP header and options.
+ * dst      - destination info: prebuilt L2/L3 header template (hdr) and
+ *            the l2_len/l3_len of that template.
+ * ol_flags - TX offload flags OR-ed into m->ol_flags; also select between
+ *            pseudo-header checksums (offload) and software checksums.
+ * port, seq, flags - passed through to fill_tcph().
+ * pid      - IPv4 packet id (unused for IPv6).
+ * swcsm    - non-zero: compute checksums in software where no offload
+ *            flag covers them.
+ * Returns 0 on success, -EINVAL when rte_pktmbuf_prepend() fails.
+ */
+static inline int
+tcp_fill_mbuf(struct rte_mbuf *m, const struct tle_tcp_stream *s,
+ const struct tle_dest *dst, uint64_t ol_flags,
+ union l4_ports port, uint32_t seq, uint32_t flags,
+ uint32_t pid, uint32_t swcsm)
+{
+ uint32_t l4, len, plen;
+ struct tcp_hdr *l4h;
+ char *l2h, *l3;
+
+ len = dst->l2_len + dst->l3_len;
+ plen = m->pkt_len; /* captured before the headers are prepended */
+ /* work out the TCP header length (l4), options included */
+ if (flags & TCP_FLAG_SYN) {
+ /* basic length */
+ l4 = sizeof(*l4h) + TCP_OPT_LEN_MSS;
+
+ /* add wscale space and nop */
+ if (s->tcb.so.wscale) {
+ l4 += TCP_OPT_LEN_WSC + TCP_OPT_LEN_NOP;
+ }
+
+ /* add timestamp space and nop */
+ if (s->tcb.so.ts.raw) {
+ l4 += TCP_TX_OPT_LEN_TMS;
+ }
+ } else if ((flags & TCP_FLAG_RST) == 0 && s->tcb.rcv.ts != 0) { /* NOTE(review): keyed on rcv.ts, while fill_tcph() writes the option when so.ts.raw != 0 - confirm both always agree */
+ l4 = sizeof(*l4h) + TCP_TX_OPT_LEN_TMS;
+ } else {
+ l4 = sizeof(*l4h);
+ }
+
+ /* adjust mbuf to put L2/L3/L4 headers into it. */
+ l2h = rte_pktmbuf_prepend(m, len + l4);
+ if (l2h == NULL)
+ return -EINVAL;
+
+ /* copy L2/L3 header */
+ rte_memcpy(l2h, dst->hdr, len);
+
+ /* setup TCP header & options */
+ l4h = (struct tcp_hdr *)(l2h + len);
+ fill_tcph(l4h, &s->tcb, port, seq, l4, flags);
+
+ /* setup mbuf TX offload related fields. */
+ m->tx_offload = _mbuf_tx_offload(dst->l2_len, dst->l3_len, l4, 0, 0, 0);
+ m->ol_flags |= ol_flags;
+
+ /* update proto specific fields. */
+ /* the IP version is read from the first nibble of the copied L3 header */
+ l3 = l2h + dst->l2_len;
+ if (((struct ipv4_hdr*)l3)->version_ihl>>4 == 4) {
+ struct ipv4_hdr *l3h;
+ l3h = (struct ipv4_hdr *)l3;
+ l3h->packet_id = rte_cpu_to_be_16(pid);
+ l3h->total_length = rte_cpu_to_be_16(plen + dst->l3_len + l4);
+ /* TCP checksum: pseudo-header only when offloaded, else software if asked */
+ if ((ol_flags & PKT_TX_TCP_CKSUM) != 0)
+ l4h->cksum = _ipv4x_phdr_cksum(l3h, m->l3_len,
+ ol_flags);
+ else if (swcsm != 0)
+ l4h->cksum = _ipv4_udptcp_mbuf_cksum(m, len, l3h);
+ /* IPv4 header checksum in software only when not offloaded */
+ if ((ol_flags & PKT_TX_IP_CKSUM) == 0 && swcsm != 0)
+ l3h->hdr_checksum = _ipv4x_cksum(l3h, m->l3_len);
+ } else {
+ struct ipv6_hdr *l3h;
+ l3h = (struct ipv6_hdr *)l3;
+ l3h->payload_len = rte_cpu_to_be_16(plen + l4);
+ if ((ol_flags & PKT_TX_TCP_CKSUM) != 0)
+ l4h->cksum = rte_ipv6_phdr_cksum(l3h, ol_flags);
+ else if (swcsm != 0)
+ l4h->cksum = _ipv6_udptcp_mbuf_cksum(m, len, l3h);
+ }
+
+ return 0;
+}
+/* Returns non-zero when the stream's drb ring (s->tx.drb.r) holds no entries. */
+static inline int
+stream_drb_empty(struct tle_tcp_stream *s)
+{
+ return rte_ring_empty(s->tx.drb.r);
+}
+/* Put nb_drb drbs back on the stream's drb ring; the enqueue result is not checked. */
+static inline void
+stream_drb_free(struct tle_tcp_stream *s, struct tle_drb *drbs[],
+ uint32_t nb_drb)
+{
+ _rte_ring_enqueue_burst(s->tx.drb.r, (void **)drbs, nb_drb);
+}
+/* Take up to nb_drb drbs from the stream's drb ring into drbs[]; returns how many were obtained. */
+static inline uint32_t
+stream_drb_alloc(struct tle_tcp_stream *s, struct tle_drb *drbs[],
+ uint32_t nb_drb)
+{
+ return _rte_ring_dequeue_burst(s->tx.drb.r, (void **)drbs, nb_drb);
+}
+
+/*
+ * Queue a standalone packet to the particular output device.
+ * It assumes that:
+ * - L2/L3/L4 headers should be already set.
+ * - packet fits into one segment.
+ * One drb is taken from the stream for the enqueue and handed back if the
+ * dring did not use it.
+ * Returns 0 when the packet was enqueued, -ENOBUFS when no drb could be
+ * obtained or the enqueue did not accept the packet; the mbuf is not freed
+ * here on failure.
+ */
+static inline int
+send_pkt(struct tle_tcp_stream *s, struct tle_dev *dev, struct rte_mbuf *m)
+{
+ uint32_t n, nb;
+ struct tle_drb *drb;
+
+ if (stream_drb_alloc(s, &drb, 1) == 0)
+ return -ENOBUFS;
+
+ /* enqueue pkt for TX. */
+ nb = 1; /* in: drbs offered; presumably out: drbs left unused - confirm */
+ n = tle_dring_mp_enqueue(&dev->tx.dr, (const void * const*)&m, 1,
+ &drb, &nb);
+
+ /* free unused drbs. */
+ if (nb != 0)
+ stream_drb_free(s, &drb, 1);
+
+ return (n == 1) ? 0 : -ENOBUFS;
+}
+
+/* keep only the IP/TCP checksum offload bits; argument parenthesized so any expression is safe */
+#define TCP_OLFLAGS_CKSUM(flags) ((flags) & (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM))
+
+/*
+ * Build and queue a control segment (no payload handling here) on the
+ * stream's destination device: reserves one IP packet id, fills the headers
+ * with software checksums where the checksum offload bits do not cover
+ * them, then hands the mbuf to send_pkt().
+ * Returns 0 on success, otherwise the error from tcp_fill_mbuf() or
+ * send_pkt(); the caller still owns m on failure.
+ */
+static inline int
+send_ctrl_pkt(struct tle_tcp_stream *s, struct rte_mbuf *m, uint32_t seq,
+ uint32_t flags)
+{
+ const struct tle_dest *dest = &s->tx.dst;
+ uint32_t ip_id;
+ int32_t err;
+
+ ip_id = get_ip_pid(dest->dev, 1, s->s.type,
+ (s->flags & TLE_CTX_FLAG_ST) != 0);
+
+ err = tcp_fill_mbuf(m, s, dest, TCP_OLFLAGS_CKSUM(dest->ol_flags),
+ s->s.port, seq, flags, ip_id, 1);
+ if (err != 0)
+ return err;
+
+ return send_pkt(s, dest->dev, m);
+}
+
+/*
+ * Send an <RST,ACK> segment with sequence number seq for stream s.
+ * The mbuf comes from the destination's head_mp pool and is freed here if
+ * it could not be queued. Returns 0 on success (and bumps the OUTRSTS
+ * counter), -ENOMEM when no mbuf is available, or the send_ctrl_pkt() error.
+ */
+static inline int
+send_rst(struct tle_tcp_stream *s, uint32_t seq)
+{
+ struct rte_mbuf *rst;
+ int32_t err;
+
+ rst = rte_pktmbuf_alloc(s->tx.dst.head_mp);
+ if (rst == NULL)
+ return -ENOMEM;
+
+ err = send_ctrl_pkt(s, rst, seq, TCP_FLAG_RST | TCP_FLAG_ACK);
+ if (err == 0) {
+ TCP_INC_STATS(TCP_MIB_OUTRSTS);
+ return err;
+ }
+
+ /* not queued: the mbuf is still ours to release */
+ rte_pktmbuf_free(rst);
+ return err;
+}
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _TCP_RXTX_H_ */
diff --git a/lib/libtle_l4p/tcp_stream.c b/lib/libtle_l4p/tcp_stream.c
index 676521b..4a65053 100644
--- a/lib/libtle_l4p/tcp_stream.c
+++ b/lib/libtle_l4p/tcp_stream.c
@@ -20,6 +20,8 @@
#include <rte_ip.h>
#include <rte_tcp.h>
+#include <netinet/tcp.h>
+
#include "tcp_stream.h"
#include "tcp_timer.h"
#include "stream_table.h"
@@ -27,6 +29,7 @@
#include "tcp_ctl.h"
#include "tcp_ofo.h"
#include "tcp_txq.h"
+#include "tcp_rxtx.h"
static void
unuse_stream(struct tle_tcp_stream *s)
@@ -38,25 +41,27 @@ unuse_stream(struct tle_tcp_stream *s)
static void
fini_stream(struct tle_tcp_stream *s)
{
- if (s != NULL) {
- rte_free(s->rx.q);
- tcp_ofo_free(s->rx.ofo);
- rte_free(s->tx.q);
- rte_free(s->tx.drb.r);
- }
+ rte_free(s); /* single free: per the layout note above add_stream(), rx.q, rx.ofo, tx.q and tx.drb.r follow the stream in the same memory block */
}
static void
tcp_fini_streams(struct tle_ctx *ctx)
{
- uint32_t i;
struct tcp_streams *ts;
+ struct tle_stream *s;
ts = CTX_TCP_STREAMS(ctx);
if (ts != NULL) {
stbl_fini(&ts->st);
- for (i = 0; i != ctx->prm.max_streams; i++)
- fini_stream(&ts->s[i]);
+
+ /* TODO: free those in use? may be not necessary, as we assume
+ * all streams have been closed and are free.
+ */
+ while (ctx->streams.nb_free--) {
+ s = STAILQ_FIRST(&ctx->streams.free);
+ STAILQ_FIRST(&ctx->streams.free) = STAILQ_NEXT(s, link);
+ fini_stream(TCP_STREAM(s));
+ }
/* free the timer wheel */
tle_timer_free(ts->tmr);
@@ -94,61 +99,100 @@ alloc_ring(uint32_t n, uint32_t flags, int32_t socket)
return r;
}
+/* stream memory layout:
+ * [tle_tcp_stream] [rx.q] [rx.ofo] [tx.q] [tx.drb.r]
+ */
static int
-init_stream(struct tle_ctx *ctx, struct tle_tcp_stream *s)
+add_stream(struct tle_ctx *ctx)
{
- size_t bsz, rsz, sz;
- uint32_t f, i, k, n, nb;
+ size_t sz_s, sz_rxq, sz_ofo, sz_txq, sz_drb_r, sz;
+ /* for rx.q */
+ uint32_t n_rxq;
+ /* for rx.ofo */
+ struct ofo *ofo;
+ struct rte_mbuf **obj;
+ uint32_t ndb, nobj;
+ size_t dsz, osz;
+ /* for tx.q */
+ uint32_t n_txq;
+ /* for tx.drb.r */
+ size_t bsz, rsz;
struct tle_drb *drb;
- char name[RTE_RING_NAMESIZE];
-
- f = ((ctx->prm.flags & TLE_CTX_FLAG_ST) == 0) ? 0 :
- (RING_F_SP_ENQ | RING_F_SC_DEQ);
-
- /* init RX part. */
-
- n = RTE_MAX(ctx->prm.max_stream_rbufs, 1U);
- s->rx.q = alloc_ring(n, f | RING_F_SP_ENQ, ctx->prm.socket_id);
- if (s->rx.q == NULL)
- return -ENOMEM;
-
- s->rx.ofo = tcp_ofo_alloc(n, ctx->prm.socket_id);
- if (s->rx.ofo == NULL)
- return -ENOMEM;
-
- /* init TX part. */
+ uint32_t k, nb, n_drb;
- n = RTE_MAX(ctx->prm.max_stream_sbufs, 1U);
- s->tx.q = alloc_ring(n, f | RING_F_SC_DEQ, ctx->prm.socket_id);
- if (s->tx.q == NULL)
- return -ENOMEM;
+ uint32_t f, i;
+ char name[RTE_RING_NAMESIZE];
+ struct tle_tcp_stream *s;
+ // stream
+ sz_s = RTE_ALIGN_CEIL(sizeof(*s), RTE_CACHE_LINE_SIZE);
+
+ // rx.q
+ n_rxq = RTE_MAX(ctx->prm.max_stream_rbufs, 1U);
+ n_rxq = rte_align32pow2(n_rxq);
+ sz_rxq = rte_ring_get_memsize(n_rxq);
+ sz_rxq = RTE_ALIGN_CEIL(sz_rxq, RTE_CACHE_LINE_SIZE);
+
+ // rx.ofo
+ calc_ofo_elems(n_rxq, &nobj, &ndb);
+ osz = sizeof(*ofo) + sizeof(ofo->db[0]) * ndb;
+ dsz = sizeof(ofo->db[0].obj[0]) * nobj * ndb;
+ sz_ofo = osz + dsz;
+ sz_ofo = RTE_ALIGN_CEIL(sz_ofo, RTE_CACHE_LINE_SIZE);
+
+ // tx.q
+ n_txq = RTE_MAX(ctx->prm.max_stream_sbufs, 1U);
+ n_txq = rte_align32pow2(n_txq);
+ sz_txq = rte_ring_get_memsize(n_txq);
+ sz_txq = RTE_ALIGN_CEIL(sz_txq, RTE_CACHE_LINE_SIZE);
+
+ // tx.drb.r
nb = drb_nb_elem(ctx);
k = calc_stream_drb_num(ctx, nb);
- n = rte_align32pow2(k);
-
- /* size of the drbs ring */
- rsz = rte_ring_get_memsize(n);
+ n_drb = rte_align32pow2(k);
+ rsz = rte_ring_get_memsize(n_drb); /* size of the drbs ring */
rsz = RTE_ALIGN_CEIL(rsz, RTE_CACHE_LINE_SIZE);
+ bsz = tle_drb_calc_size(nb); /* size of the drb. */
+ sz_drb_r = rsz + bsz * k; /* total stream drbs size. */
+ sz_drb_r = RTE_ALIGN_CEIL(sz_drb_r, RTE_CACHE_LINE_SIZE);
- /* size of the drb. */
- bsz = tle_drb_calc_size(nb);
-
- /* total stream drbs size. */
- sz = rsz + bsz * k;
-
- s->tx.drb.r = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE,
- ctx->prm.socket_id);
- if (s->tx.drb.r == NULL) {
- TCP_LOG(ERR, "%s(%p): allocation of %zu bytes on socket %d "
+ sz = sz_s + sz_rxq + sz_ofo + sz_txq + sz_drb_r;
+ s = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE,
+ ctx->prm.socket_id);
+ if (s == NULL) {
+ TCP_LOG(ERR, "%s: allocation of %zu bytes on socket %d "
"failed with error code: %d\n",
- __func__, s, sz, ctx->prm.socket_id, rte_errno);
+ __func__, sz, ctx->prm.socket_id, rte_errno);
return -ENOMEM;
}
- snprintf(name, sizeof(name), "%p@%zu", s, sz);
- rte_ring_init(s->tx.drb.r, name, n, f);
+ s->rx.q = (struct rte_ring *)((uintptr_t)s + sz_s);
+ s->rx.ofo = (struct ofo *)((uintptr_t)s->rx.q + sz_rxq);
+ ofo = s->rx.ofo;
+ s->tx.q = (struct rte_ring *)((uintptr_t)s->rx.ofo + sz_ofo);
+ s->tx.drb.r = (struct rte_ring *)((uintptr_t)s->tx.q + sz_txq);
+ // ring flags
+ f = ((ctx->prm.flags & TLE_CTX_FLAG_ST) == 0) ? 0 :
+ (RING_F_SP_ENQ | RING_F_SC_DEQ);
+
+ /* init RX part. */
+ snprintf(name, sizeof(name), "%p@%zu", s->rx.q, sz_rxq);
+ rte_ring_init(s->rx.q, name, n_rxq, f);
+
+ obj = (struct rte_mbuf **)&ofo->db[ndb];
+ for (i = 0; i != ndb; i++) {
+ ofo->db[i].nb_max = nobj;
+ ofo->db[i].obj = obj + i * nobj;
+ }
+ ofo->nb_max = ndb;
+
+ /* init TX part. */
+ snprintf(name, sizeof(name), "%p@%zu", s->tx.q, sz_txq);
+ rte_ring_init(s->tx.q, name, n_txq, f);
+
+ snprintf(name, sizeof(name), "%p@%zu", s->tx.drb.r, sz_drb_r);
+ rte_ring_init(s->tx.drb.r, name, n_drb, f);
for (i = 0; i != k; i++) {
drb = (struct tle_drb *)((uintptr_t)s->tx.drb.r +
rsz + bsz * i);
@@ -200,7 +244,7 @@ tcp_init_streams(struct tle_ctx *ctx)
f = ((ctx->prm.flags & TLE_CTX_FLAG_ST) == 0) ? 0 :
(RING_F_SP_ENQ | RING_F_SC_DEQ);
- sz = sizeof(*ts) + sizeof(ts->s[0]) * ctx->prm.max_streams;
+ sz = sizeof(*ts);
ts = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE,
ctx->prm.socket_id);
if (ts == NULL) {
@@ -210,6 +254,7 @@ tcp_init_streams(struct tle_ctx *ctx)
return -ENOMEM;
}
+ rte_spinlock_init(&ts->dr.lock);
STAILQ_INIT(&ts->dr.fe);
STAILQ_INIT(&ts->dr.be);
@@ -228,12 +273,11 @@ tcp_init_streams(struct tle_ctx *ctx)
if (ts->tsq == NULL)
rc = -ENOMEM;
else
- rc = stbl_init(&ts->st, ctx->prm.max_streams,
- ctx->prm.socket_id);
+ rc = stbl_init(&ts->st, (ctx->prm.flags & TLE_CTX_FLAG_ST) == 0);
}
- for (i = 0; rc == 0 && i != ctx->prm.max_streams; i++)
- rc = init_stream(ctx, &ts->s[i]);
+ for (i = 0; rc == 0 && i != ctx->prm.min_streams; i++)
+ rc = add_stream(ctx);
if (rc != 0) {
TCP_LOG(ERR, "initalisation of %u-th stream failed", i);
@@ -243,11 +287,30 @@ tcp_init_streams(struct tle_ctx *ctx)
return rc;
}
-static void __attribute__((constructor))
+/*
+ * Try to add up to prm.delta_streams more streams to *ctx*, requesting no
+ * more than the room left between streams.nb_cur and (prm.max_streams - 1).
+ * Returns the number of add_stream() calls that succeeded; 0 when there is
+ * no room left or the first allocation fails.
+ *
+ * Note this function is not thread-safe, and we did not lock here as we
+ * have the assumption that this ctx is dedicated to one thread.
+ */
+static uint32_t
+tcp_more_streams(struct tle_ctx *ctx)
+{
+	uint32_t i, nb;
+	uint32_t nb_max = ctx->prm.max_streams - 1;
+	uint32_t nb_cur = ctx->streams.nb_cur;
+
+	/*
+	 * All operands are unsigned: without this check (nb_max - nb_cur)
+	 * wraps to a huge value as soon as nb_cur exceeds nb_max (e.g. when
+	 * min_streams >= max_streams) and the limit would be silently ignored.
+	 */
+	if (nb_cur >= nb_max)
+		return 0;
+
+	nb = RTE_MIN(ctx->prm.delta_streams, nb_max - nb_cur);
+	for (i = 0; i < nb; i++)
+		if (add_stream(ctx) != 0)
+			break;
+	return i;
+}
+
+static void __attribute__((constructor(101)))
tcp_stream_setup(void)
{
static const struct stream_ops tcp_ops = {
.init_streams = tcp_init_streams,
+ .more_streams = tcp_more_streams,
.fini_streams = tcp_fini_streams,
.free_drbs = tcp_free_drbs,
};
@@ -305,16 +368,12 @@ tle_tcp_stream_open(struct tle_ctx *ctx,
s = (struct tle_tcp_stream *)get_stream(ctx);
if (s == NULL) {
- rte_errno = ENFILE;
- return NULL;
-
- /* some TX still pending for that stream. */
- } else if (TCP_STREAM_TX_PENDING(s)) {
- put_stream(ctx, &s->s, 0);
rte_errno = EAGAIN;
return NULL;
}
+ s->s.option.raw = prm->option;
+
/* setup L4 ports and L3 addresses fields. */
rc = stream_fill_ctx(ctx, &s->s,
(const struct sockaddr *)&prm->addr.local,
@@ -336,12 +395,14 @@ tle_tcp_stream_open(struct tle_ctx *ctx,
/* store other params */
s->flags = ctx->prm.flags;
+ s->tcb.err = 0;
s->tcb.snd.nb_retm = (prm->cfg.nb_retries != 0) ? prm->cfg.nb_retries :
TLE_TCP_DEFAULT_RETRIES;
s->tcb.snd.cwnd = (ctx->prm.icw == 0) ? TCP_INITIAL_CWND_MAX :
ctx->prm.icw;
s->tcb.snd.rto_tw = (ctx->prm.timewait == TLE_TCP_TIMEWAIT_DEFAULT) ?
TCP_RTO_2MSL : ctx->prm.timewait;
+ s->tcb.snd.rto_fw = TLE_TCP_FINWAIT_TIMEOUT;
tcp_stream_up(s);
return &s->s;
@@ -354,9 +415,16 @@ static inline int
stream_close(struct tle_ctx *ctx, struct tle_tcp_stream *s)
{
uint16_t uop;
- uint32_t state;
static const struct tle_stream_cb zcb;
+ /* Put uop operation into this wlock; or it may cause this stream
+ * to be put into death ring twice, for example:
+ * 1) FE sets OP_CLOSE;
+ * 2) BE stream_term sets state as TCP_ST_CLOSED, and put in queue;
+ * 3) FE down the stream, and calls stream_term again.
+ */
+ tcp_stream_down(s);
+
/* check was close() already invoked */
uop = s->tcb.uop;
if ((uop & TCP_OP_CLOSE) != 0)
@@ -366,47 +434,66 @@ stream_close(struct tle_ctx *ctx, struct tle_tcp_stream *s)
if (rte_atomic16_cmpset(&s->tcb.uop, uop, uop | TCP_OP_CLOSE) == 0)
return -EDEADLK;
- /* mark stream as unavaialbe for RX/TX. */
- tcp_stream_down(s);
-
/* reset events/callbacks */
- s->rx.ev = NULL;
s->tx.ev = NULL;
+ s->rx.ev = NULL;
s->err.ev = NULL;
s->rx.cb = zcb;
s->tx.cb = zcb;
s->err.cb = zcb;
- state = s->tcb.state;
-
- /* CLOSED, LISTEN, SYN_SENT - we can close the stream straighway */
- if (state <= TCP_ST_SYN_SENT) {
+ switch (s->tcb.state) {
+ case TCP_ST_LISTEN:
+ /* close the stream straightway */
tcp_stream_reset(ctx, s);
return 0;
- }
-
- /* generate FIN and proceed with normal connection termination */
- if (state == TCP_ST_ESTABLISHED || state == TCP_ST_CLOSE_WAIT) {
-
- /* change state */
- s->tcb.state = (state == TCP_ST_ESTABLISHED) ?
- TCP_ST_FIN_WAIT_1 : TCP_ST_LAST_ACK;
-
- /* mark stream as writable/readable again */
+ case TCP_ST_CLOSED:
+ /* it could be put into this state if a RST packet is
+ * received, but this stream could be still in tsq trying
+ * to send something.
+ */
+ /* fallthrough */
+ case TCP_ST_SYN_SENT:
+ /* timer on and could be in tsq (SYN retrans) */
+ stream_term(s);
+ /* fallthrough */
+ case TCP_ST_FIN_WAIT_1:
+ /* fallthrough */
+ case TCP_ST_CLOSING:
+ /* fallthrough */
+ case TCP_ST_TIME_WAIT:
+ /* fallthrough */
+ case TCP_ST_LAST_ACK:
tcp_stream_up(s);
-
- /* queue stream into to-send queue */
- txs_enqueue(ctx, s);
return 0;
+ case TCP_ST_ESTABLISHED:
+ /* fallthrough */
+ case TCP_ST_CLOSE_WAIT:
+ if (s->tcb.state == TCP_ST_ESTABLISHED) {
+ s->tcb.state = TCP_ST_FIN_WAIT_1;
+ TCP_DEC_STATS_ATOMIC(TCP_MIB_CURRESTAB);
+ } else
+ s->tcb.state = TCP_ST_LAST_ACK;
+
+ if (!rte_ring_empty(s->rx.q)) {
+ TCP_INC_STATS(TCP_MIB_ESTABRESETS);
+ s->tcb.uop |= TCP_OP_RESET;
+ stream_term(s);
+ }
+ break;
+ case TCP_ST_FIN_WAIT_2:
+ /* Can reach this state if shutdown was called, but the timer
+ * shall be set after this close.
+ */
+ break;
+ default:
+ rte_panic("Invalid state when close: %d\n", s->tcb.state);
}
- /*
- * accroding to the state, close() was already invoked,
- * should never that point.
- */
- RTE_ASSERT(0);
- return -EINVAL;
+ tcp_stream_up(s);
+ txs_enqueue(ctx, s);
+ return 0;
}
uint32_t
@@ -453,6 +540,64 @@ tle_tcp_stream_close(struct tle_stream *ts)
}
int
+tle_tcp_stream_shutdown(struct tle_stream *ts, int how)
+{
+ int ret;
+ bool wakeup;
+ uint32_t state;
+ struct tle_tcp_stream *s;
+
+ /*
+ * Shut down the sending direction of a TCP stream.
+ * Returns -EINVAL for a NULL stream or an invalid address family,
+ * 0 when the request was accepted (or how == SHUT_RD), and -1 with
+ * rte_errno = ENOTCONN when the current state cannot be shut down.
+ */
+ s = TCP_STREAM(ts);
+ if (ts == NULL || s->s.type >= TLE_VNUM)
+ return -EINVAL;
+
+ /* Refer to linux/net/ipv4/tcp.c:tcp_shutdown() */
+ /* SHUT_RD is accepted but does nothing; every other value of how
+ * (it is not range-checked) takes the write-shutdown path below.
+ */
+ if (how == SHUT_RD)
+ return 0;
+
+ /* the stream is taken down while its state is changed and is
+ * brought up again right before returning.
+ */
+ tcp_stream_down(s);
+
+ state = s->tcb.state;
+
+ switch (state) {
+ case TCP_ST_LISTEN:
+ /* fallthrough */
+ case TCP_ST_SYN_SENT:
+ /* only the state is changed here; nothing is queued for tx */
+ s->tcb.state = TCP_ST_CLOSED;
+ wakeup = true;
+ ret = 0;
+ break;
+ case TCP_ST_ESTABLISHED:
+ /* fallthrough */
+ case TCP_ST_CLOSE_WAIT:
+ /* ESTABLISHED -> FIN_WAIT_1, CLOSE_WAIT -> LAST_ACK, then hand
+ * the stream to txs_enqueue().
+ */
+ if (state == TCP_ST_ESTABLISHED) {
+ TCP_DEC_STATS_ATOMIC(TCP_MIB_CURRESTAB);
+ s->tcb.state = TCP_ST_FIN_WAIT_1;
+ } else
+ s->tcb.state = TCP_ST_LAST_ACK;
+ txs_enqueue(ts->ctx, s);
+ wakeup = true;
+ ret = 0;
+ break;
+ default:
+ /* any other state is reported as "not connected" */
+ wakeup = false;
+ rte_errno = ENOTCONN;
+ ret = -1;
+ }
+
+ if (wakeup) {
+ /* Notify other threads which may wait on the event */
+ if (s->tx.ev)
+ tle_event_raise(s->tx.ev);
+ /* the error event is raised only for a full SHUT_RDWR */
+ if (how == SHUT_RDWR && s->err.ev)
+ tle_event_raise(s->err.ev);
+ }
+
+ tcp_stream_up(s);
+ return ret;
+}
+
+int
tle_tcp_stream_get_addr(const struct tle_stream *ts,
struct tle_tcp_stream_addr *addr)
{
@@ -617,3 +762,73 @@ tle_tcp_stream_get_mss(const struct tle_stream * ts)
s = TCP_STREAM(ts);
return s->tcb.snd.mss;
}
+
+/*
+ * Fill a Linux-style struct tcp_info for stream *ts*.
+ * Only tcpi_state and tcpi_total_retrans are populated; every other field
+ * is zero.
+ * At most *optlen bytes are copied into *info*; *optlen is lowered to
+ * sizeof(struct tcp_info) when the caller's buffer is larger than that.
+ * Returns 0 on success, -EINVAL if ts, info or optlen is NULL.
+ */
+int
+tle_tcp_stream_get_info(const struct tle_stream * ts, void *info, socklen_t *optlen)
+{
+	struct tle_tcp_stream *s;
+	struct tcp_info i;
+
+	/* info and optlen are dereferenced below, so reject NULL up front */
+	if (ts == NULL || info == NULL || optlen == NULL)
+		return -EINVAL;
+
+	s = TCP_STREAM(ts);
+
+	memset(&i, 0, sizeof(i));
+
+	/* transform from tldk state into linux kernel state */
+	switch (s->tcb.state) {
+	case TCP_ST_CLOSED:
+		i.tcpi_state = TCP_CLOSE;
+		break;
+	case TCP_ST_LISTEN:
+		i.tcpi_state = TCP_LISTEN;
+		break;
+	case TCP_ST_SYN_SENT:
+		i.tcpi_state = TCP_SYN_SENT;
+		break;
+	case TCP_ST_SYN_RCVD:
+		i.tcpi_state = TCP_SYN_RECV;
+		break;
+	case TCP_ST_ESTABLISHED:
+		i.tcpi_state = TCP_ESTABLISHED;
+		break;
+	case TCP_ST_FIN_WAIT_1:
+		i.tcpi_state = TCP_FIN_WAIT1;
+		break;
+	case TCP_ST_FIN_WAIT_2:
+		i.tcpi_state = TCP_FIN_WAIT2;
+		break;
+	case TCP_ST_CLOSE_WAIT:
+		i.tcpi_state = TCP_CLOSE_WAIT;
+		break;
+	case TCP_ST_CLOSING:
+		i.tcpi_state = TCP_CLOSING;
+		break;
+	case TCP_ST_LAST_ACK:
+		i.tcpi_state = TCP_LAST_ACK;
+		break;
+	case TCP_ST_TIME_WAIT:
+		i.tcpi_state = TCP_TIME_WAIT;
+		break;
+	default:
+		/* unknown state value: tcpi_state stays zero */
+		break;
+	}
+
+	/* fix me, total retrans? */
+	i.tcpi_total_retrans = s->tcb.snd.nb_retx;
+
+	/* never copy more than the structure we actually filled */
+	if (*optlen > sizeof(i))
+		*optlen = sizeof(i);
+	rte_memcpy(info, &i, *optlen);
+	return 0;
+}
+
+/*
+ * Flag stream *ts* with TCP_OP_KEEPALIVE in tcb.uop and hand it to
+ * txs_enqueue() on its owning context. A NULL *ts* is ignored.
+ */
+void
+tle_tcp_stream_set_keepalive(struct tle_stream *ts)
+{
+	struct tle_tcp_stream *s;
+
+	/* the other public stream entry points reject NULL; do the same */
+	if (ts == NULL)
+		return;
+
+	s = TCP_STREAM(ts);
+
+	s->tcb.uop |= TCP_OP_KEEPALIVE;
+	txs_enqueue(ts->ctx, s);
+}
diff --git a/lib/libtle_l4p/tcp_stream.h b/lib/libtle_l4p/tcp_stream.h
index 4629fe6..1202574 100644
--- a/lib/libtle_l4p/tcp_stream.h
+++ b/lib/libtle_l4p/tcp_stream.h
@@ -17,6 +17,8 @@
#define _TCP_STREAM_H_
#include <rte_vect.h>
+#include <rte_mbuf.h>
+
#include <tle_dring.h>
#include <tle_tcp.h>
#include <tle_event.h>
@@ -45,23 +47,28 @@ enum {
};
+/* Bit flags stored in tcb.uop (operations performed by the user). */
enum {
- TCP_OP_LISTEN = 0x1,
- TCP_OP_ACCEPT = 0x2,
- TCP_OP_CONNECT = 0x4,
- TCP_OP_CLOSE = 0x8,
+ TCP_OP_LISTEN = 0x1,
+ TCP_OP_ACCEPT = 0x2,
+ TCP_OP_CONNECT = 0x4,
+ TCP_OP_CLOSE = 0x8,
+ /* set by stream_close() when unread data is still in rx.q */
+ TCP_OP_RESET = 0x10,
+ /* set by tle_tcp_stream_set_keepalive() */
+ TCP_OP_KEEPALIVE = 0x20
};
struct tcb {
+ int err;
volatile uint16_t state;
volatile uint16_t uop; /* operations by user performed */
struct {
uint32_t nxt;
+ uint32_t cpy; /* head of yet unread data */
uint32_t irs; /* initial received sequence */
uint32_t wnd;
uint32_t ts;
struct {
uint32_t seq;
- uint32_t on;
+ uint32_t on; /* on == 1: received an out-of-order fin
+ * on == 2: received an in order fin */
} frs;
uint32_t srtt; /* smoothed round trip time (scaled by >> 3) */
uint32_t rttvar; /* rtt variance */
@@ -83,15 +90,32 @@ struct tcb {
uint32_t ssthresh; /* slow start threshold */
uint32_t rto; /* retransmission timeout */
uint32_t rto_tw; /* TIME_WAIT retransmission timeout */
+ uint32_t rto_fw; /* FIN_WAIT_2 waiting timeout */
uint32_t iss; /* initial send sequence */
+ uint32_t waitlen; /* total length of unacknowledged pkt */
+ uint32_t cork_ts;
uint16_t mss;
uint8_t wscale;
uint8_t nb_retx; /* number of retransmission */
uint8_t nb_retm; /**< max number of retx attempts. */
+ uint8_t nb_keepalive;/* number of sended keepalive */
+ bool update_rcv; /* Flag for updating recv window */
+ uint16_t nxt_offset; /* Partial tx, next data of a segment to tx */
+ uint32_t una_offset; /* Partial ack, next data of a mbuf to ack */
+ struct rte_mbuf *nxt_pkt; /* Partial tx, next segment to send */
} snd;
struct syn_opts so; /* initial syn options. */
};
+/*
+ * Per-stream timer kinds; each value indexes tle_tcp_stream.timer.handle[].
+ * tcp_timer.h passes (stream pointer | kind) as the timer argument, so the
+ * kind must fit in the low bits selected by TIMER_MASK and stream pointers
+ * must be aligned to at least TIMER_MAX_NUM bytes.
+ */
+enum {
+ TIMER_RTO, /* retransmission timeout */
+ TIMER_DACK, /* presumably delayed ACK - TODO confirm */
+ TIMER_KEEPALIVE,
+ TIMER_NUM, /* number of timer kinds, size of timer.handle[] */
+ TIMER_MAX_NUM = 8, /* upper bound for TIMER_NUM (power of two) */
+ TIMER_MASK = TIMER_MAX_NUM - 1 /* low-bit mask carrying the kind */
+};
+
struct tle_tcp_stream {
struct tle_stream s;
@@ -103,7 +127,7 @@ struct tle_tcp_stream {
struct tcb tcb;
struct {
- void *handle;
+ void *handle[TIMER_NUM];
} timer;
struct {
@@ -155,7 +179,6 @@ struct tcp_streams {
struct tle_timer_wheel *tmr; /* timer wheel */
struct rte_ring *tsq; /* to-send streams queue */
struct sdr dr; /* death row for zombie streams */
- struct tle_tcp_stream s[]; /* array of allocated streams. */
};
#define CTX_TCP_STREAMS(ctx) ((struct tcp_streams *)(ctx)->streams.buf)
diff --git a/lib/libtle_l4p/tcp_timer.h b/lib/libtle_l4p/tcp_timer.h
index 8faefb3..d242556 100644
--- a/lib/libtle_l4p/tcp_timer.h
+++ b/lib/libtle_l4p/tcp_timer.h
@@ -27,43 +27,53 @@ extern "C" {
* all RTO values are in ms.
*/
#define TCP_RTO_MAX 60000U /* RFC 6298 (2.5) */
-#define TCP_RTO_MIN 1000U /* RFC 6298 (2.4) */
+#define TCP_RTO_MIN 200U /* Linux/include/net/tcp.h: TCP_RTO_MIN */
#define TCP_RTO_2MSL (2 * TCP_RTO_MAX)
-#define TCP_RTO_DEFAULT TCP_RTO_MIN /* RFC 6298 (2.1)*/
+#define TCP_RTO_DEFAULT 1000U /* RFC 6298 (2.1)*/
#define TCP_RTO_GRANULARITY 100U
+/*
+ * Recover the stream pointer from a timer argument whose low bits
+ * (TIMER_MASK) carry the timer kind.
+ */
+static inline struct tle_tcp_stream *
+timer_stream(struct tle_tcp_stream *s)
+{
+	/* uintptr_t is the integer type meant for pointer round trips */
+	return (struct tle_tcp_stream *)((uintptr_t)s & ~(uintptr_t)TIMER_MASK);
+}
+
+/*
+ * Extract the timer kind stored in the low bits (TIMER_MASK) of a timer
+ * argument built from a stream pointer.
+ */
+static inline uint8_t
+timer_type(struct tle_tcp_stream *s)
+{
+	/* uintptr_t is the integer type meant for pointer round trips */
+	return (uint8_t)((uintptr_t)s & (uintptr_t)TIMER_MASK);
+}
+/*
+ * Stop the timer of kind *type* for stream *s* if its handle is set, and
+ * clear the handle. *type* indexes s->timer.handle[].
+ */
static inline void
-timer_stop(struct tle_tcp_stream *s)
+timer_stop(struct tle_tcp_stream *s, uint8_t type)
{
struct tle_timer_wheel *tw;
- if (s->timer.handle != NULL) {
+ if (s->timer.handle[type] != NULL) {
tw = CTX_TCP_TMWHL(s->s.ctx);
- tle_timer_stop(tw, s->timer.handle);
- s->timer.handle = NULL;
+ tle_timer_stop(tw, s->timer.handle[type]);
+ s->timer.handle[type] = NULL;
}
}
+/*
+ * Start the timer of kind *type* for stream *s* with the given *timeout*,
+ * unless a handle for that kind is already set (then nothing happens).
+ * The timer argument is the stream pointer with *type* OR-ed into its low
+ * bits; timer_stream()/timer_type() split it again.
+ * Unlike the previous version this does not reset tcb.snd.nb_retx.
+ */
static inline void
-timer_start(struct tle_tcp_stream *s)
+timer_start(struct tle_tcp_stream *s, uint8_t type, uint32_t timeout)
{
struct tle_timer_wheel *tw;
- if (s->timer.handle == NULL) {
+ if (s->timer.handle[type] == NULL) {
tw = CTX_TCP_TMWHL(s->s.ctx);
- s->timer.handle = tle_timer_start(tw, s, s->tcb.snd.rto);
- s->tcb.snd.nb_retx = 0;
+ s->timer.handle[type] = tle_timer_start(tw, (void*)((unsigned long)s | type), timeout);
}
}
+/*
+ * Unconditionally start the timer of kind *type* for stream *s* and store
+ * the new handle, overwriting whatever handle was there.
+ * NOTE(review): the old handle is not stopped here - presumably callers
+ * only use this once the previous timer has expired; confirm.
+ */
static inline void
-timer_restart(struct tle_tcp_stream *s)
+timer_restart(struct tle_tcp_stream *s, uint8_t type, uint32_t timeout)
{
struct tle_timer_wheel *tw;
tw = CTX_TCP_TMWHL(s->s.ctx);
- s->timer.handle = tle_timer_start(tw, s, s->tcb.snd.rto);
+ s->timer.handle[type] = tle_timer_start(tw, (void*)((unsigned long)s | type), timeout);
}
@@ -71,10 +81,10 @@ timer_restart(struct tle_tcp_stream *s)
* reset number of retransmissions and restart RTO timer.
*/
static inline void
-timer_reset(struct tle_tcp_stream *s)
+timer_reset(struct tle_tcp_stream *s, uint8_t type, uint32_t timeout)
{
- timer_stop(s);
- timer_start(s);
+ /* stop the timer of this kind (if set) and start it again with the
+ * new timeout; timer_start() no longer touches tcb.snd.nb_retx, so
+ * the retransmission counter is left as it is.
+ */
+ timer_stop(s, type);
+ timer_start(s, type, timeout);
}
static inline uint32_t
diff --git a/lib/libtle_l4p/tcp_tx_seg.h b/lib/libtle_l4p/tcp_tx_seg.h
index ac2b13b..b64aa77 100644
--- a/lib/libtle_l4p/tcp_tx_seg.h
+++ b/lib/libtle_l4p/tcp_tx_seg.h
@@ -27,7 +27,7 @@ tcp_segmentation(struct rte_mbuf *mbin, struct rte_mbuf *mbout[], uint16_t num,
struct rte_mbuf *in_seg = NULL;
uint32_t nbseg, in_seg_data_pos;
uint32_t more_in_segs;
- uint16_t bytes_left;
+ uint16_t out_bytes_remain;
in_seg = mbin;
in_seg_data_pos = 0;
@@ -35,7 +35,7 @@ tcp_segmentation(struct rte_mbuf *mbin, struct rte_mbuf *mbout[], uint16_t num,
/* Check that pkts_out is big enough to hold all fragments */
if (mss * num < (uint16_t)mbin->pkt_len)
- return -ENOSPC;
+ return -EAGAIN;
more_in_segs = 1;
while (more_in_segs) {
@@ -49,7 +49,7 @@ tcp_segmentation(struct rte_mbuf *mbin, struct rte_mbuf *mbout[], uint16_t num,
return -ENOMEM;
}
- bytes_left = mss;
+ out_bytes_remain = mss;
out_seg_prev = out_pkt;
more_out_segs = 1;
while (more_out_segs && more_in_segs) {
@@ -68,7 +68,7 @@ tcp_segmentation(struct rte_mbuf *mbin, struct rte_mbuf *mbout[], uint16_t num,
/* Prepare indirect buffer */
rte_pktmbuf_attach(out_seg, in_seg);
- len = bytes_left;
+ len = out_bytes_remain;
if (len > (in_seg->data_len - in_seg_data_pos))
len = in_seg->data_len - in_seg_data_pos;
@@ -77,10 +77,10 @@ tcp_segmentation(struct rte_mbuf *mbin, struct rte_mbuf *mbout[], uint16_t num,
out_pkt->pkt_len = (uint16_t)(len + out_pkt->pkt_len);
out_pkt->nb_segs += 1;
in_seg_data_pos += len;
- bytes_left -= len;
+ out_bytes_remain -= len;
/* Current output packet (i.e. fragment) done ? */
- if (bytes_left == 0)
+ if (out_bytes_remain == 0)
more_out_segs = 0;
/* Current input segment done ? */
diff --git a/lib/libtle_l4p/tcp_txq.h b/lib/libtle_l4p/tcp_txq.h
index 78f1d56..303b8fd 100644
--- a/lib/libtle_l4p/tcp_txq.h
+++ b/lib/libtle_l4p/tcp_txq.h
@@ -68,9 +68,28 @@ tcp_txq_set_nxt_head(struct tle_tcp_stream *s, uint32_t num)
+/*
+ * Rewind the tx queue consumer head of stream *s* back to its tail and
+ * recompute the partial-transmit cursor (tcb.snd.nxt_pkt / nxt_offset)
+ * from the partial-ack offset tcb.snd.una_offset.
+ */
static inline void
tcp_txq_rst_nxt_head(struct tle_tcp_stream *s)
{
- struct rte_ring *r;
+ struct rte_ring *r = s->tx.q;
+ struct rte_mbuf *m;
+ uint32_t offset, data_len;
+
+ /* drop any partial-transmit position left from before */
+ if (s->tcb.snd.nxt_pkt != NULL) {
+ s->tcb.snd.nxt_offset = 0;
+ s->tcb.snd.nxt_pkt = NULL;
+ }
+
+ offset = s->tcb.snd.una_offset;
+ if (offset) {
+ /* mbuf chain sitting at the consumer tail of the tx ring */
+ m = (struct rte_mbuf *)(_rte_ring_get_data(r)[r->cons.tail & r->mask]);
+ /* only the first segment has PKT_L234_HLEN() subtracted */
+ data_len = m->data_len - PKT_L234_HLEN(m);
+ /* walk the chain to the segment that contains the offset.
+ * NOTE(review): assumes una_offset is smaller than the total
+ * payload of the chain, otherwise m->next would be NULL here -
+ * confirm against the code that sets una_offset.
+ */
+ while (offset >= data_len) {
+ offset -= data_len;
+ m = m->next;
+ data_len = m->data_len;
+ }
+ s->tcb.snd.nxt_pkt = m;
+ s->tcb.snd.nxt_offset = offset;
+ }
- r = s->tx.q;
r->cons.head = r->cons.tail;
}
@@ -110,9 +129,13 @@ static inline uint32_t
txs_dequeue_bulk(struct tle_ctx *ctx, struct tle_tcp_stream *s[], uint32_t num)
{
struct rte_ring *r;
+ uint32_t n, i;
+ /* dequeue up to num streams from the context's to-send queue into s[] */
r = CTX_TCP_TSQ(ctx);
- return _rte_ring_dequeue_burst(r, (void **)s, num);
+ n = _rte_ring_dequeue_burst(r, (void **)s, num);
+ /* clear the tx.arm flag of every stream that was dequeued */
+ for (i = 0; i < n; i++)
+ rte_atomic32_clear(&s[i]->tx.arm);
+ /* number of streams actually stored in s[] */
+ return n;
}
#ifdef __cplusplus
diff --git a/lib/libtle_l4p/tle_ctx.h b/lib/libtle_l4p/tle_ctx.h
index de78a6b..f0efd51 100644
--- a/lib/libtle_l4p/tle_ctx.h
+++ b/lib/libtle_l4p/tle_ctx.h
@@ -54,6 +54,43 @@ extern "C" {
struct tle_ctx;
struct tle_dev;
+/*
+ * Per-stream option set, accessible either field by field or as a single
+ * 64-bit value (raw). The raw form is what tle_tcp_stream_param.option and
+ * tle_udp_stream_param.option carry.
+ * The bit-fields add up to 32 bits and are followed by two 16-bit fields,
+ * which matches the 64 bits of raw provided the compiler packs the
+ * bit-fields into one 32-bit unit (bit-field layout is
+ * implementation-defined).
+ * NOTE(review): field names look like the corresponding socket options
+ * (SO_REUSEADDR, TCP_CORK, ...) - confirm semantics and units with the code
+ * that sets them.
+ */
+typedef union tle_stream_options {
+ struct {
+ uint32_t reuseaddr: 1;
+ uint32_t reuseport: 1;
+ uint32_t keepalive: 1;
+ uint32_t ipv6only: 1;
+ uint32_t oobinline: 1;
+ uint32_t tcpcork: 1;
+ uint32_t tcpnodelay: 1;
+ uint32_t mulloop: 1;
+ uint32_t timestamp: 1;
+ uint32_t reserve: 3;
+ uint32_t tcpquickack: 4;
+ uint32_t multtl: 8;
+ uint32_t keepcnt: 8;
+ uint16_t keepidle;
+ uint16_t keepintvl;
+ };
+ uint64_t raw;
+} tle_stream_options_t;
+
+/*
+ * Store the timestamp of mbuf *m* into the control buffer of *msg* as a
+ * single SOL_SOCKET / SO_TIMESTAMP ancillary message and set
+ * msg_controllen to its length.
+ * m->timestamp is split into seconds (timestamp >> 20) and microseconds
+ * (low 20 bits).
+ * If the control buffer is missing or too small to hold a struct timeval
+ * message, nothing is written: msg_controllen is set to 0 and MSG_CTRUNC
+ * is raised in msg_flags.
+ */
+static inline void
+tle_set_timestamp(struct msghdr *msg, struct rte_mbuf *m)
+{
+	struct timeval *tv;
+	struct cmsghdr *cmsg;
+
+	/*
+	 * CMSG_FIRSTHDR() yields NULL for an absent/too short control
+	 * buffer, and the message written below needs CMSG_LEN(timeval)
+	 * bytes; bail out instead of writing past the caller's buffer.
+	 */
+	cmsg = CMSG_FIRSTHDR(msg);
+	if (cmsg == NULL ||
+	    msg->msg_controllen < CMSG_LEN(sizeof(struct timeval))) {
+		msg->msg_controllen = 0;
+		msg->msg_flags |= MSG_CTRUNC;
+		return;
+	}
+
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SO_TIMESTAMP;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(struct timeval));
+	msg->msg_controllen = cmsg->cmsg_len;
+	tv = (struct timeval*)CMSG_DATA(cmsg);
+	tv->tv_sec = m->timestamp >> 20;
+	tv->tv_usec = m->timestamp & 0xFFFFFUL;
+}
+
/**
* Blocked L4 ports info.
*/
@@ -112,6 +149,8 @@ struct tle_ctx_param {
int32_t socket_id; /**< socket ID to allocate memory for. */
uint32_t proto; /**< L4 proto to handle. */
uint32_t max_streams; /**< max number of streams in context. */
+ uint32_t min_streams; /**< min number of streams at init. */
+ uint32_t delta_streams; /**< delta of streams of each allocation. */
uint32_t max_stream_rbufs; /**< max recv mbufs per stream. */
uint32_t max_stream_sbufs; /**< max send mbufs per stream. */
uint32_t send_bulk_size; /**< expected # of packets per send call. */
@@ -145,6 +184,8 @@ struct tle_ctx_param {
*/
#define TLE_TCP_TIMEWAIT_DEFAULT UINT32_MAX
+#define TLE_TCP_FINWAIT_TIMEOUT 60000
+
/**
* create L4 processing context.
* @param ctx_prm
diff --git a/lib/libtle_l4p/tle_event.h b/lib/libtle_l4p/tle_event.h
index d730345..dd7a997 100644
--- a/lib/libtle_l4p/tle_event.h
+++ b/lib/libtle_l4p/tle_event.h
@@ -43,7 +43,7 @@ struct tle_event {
struct tle_evq *head;
const void *data;
enum tle_ev_state state;
-} __rte_cache_aligned;
+};
struct tle_evq {
rte_spinlock_t lock;
diff --git a/lib/libtle_l4p/tle_stats.h b/lib/libtle_l4p/tle_stats.h
new file mode 100644
index 0000000..3588c6d
--- /dev/null
+++ b/lib/libtle_l4p/tle_stats.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TLE_STATS_H
+#define TLE_STATS_H
+
+#include <rte_per_lcore.h>
+#include <rte_memory.h>
+
+/* tcp mib definitions */
+/*
+ * RFC 1213: MIB-II TCP group
+ * RFC 2012 (updates 1213): SNMPv2-MIB-TCP
+ */
+/*
+ * Counter indexes into struct tcp_mib.mibs[]; TCP_MIB_MAX is not a counter
+ * but the number of entries.
+ */
+enum
+{
+ TCP_MIB_RTOALGORITHM, /* RtoAlgorithm */
+ TCP_MIB_RTOMIN, /* RtoMin */
+ TCP_MIB_RTOMAX, /* RtoMax */
+ TCP_MIB_MAXCONN, /* MaxConn */
+ TCP_MIB_ACTIVEOPENS, /* ActiveOpens */
+ TCP_MIB_PASSIVEOPENS, /* PassiveOpens */
+ TCP_MIB_ATTEMPTFAILS, /* AttemptFails */
+ TCP_MIB_ESTABRESETS, /* EstabResets */
+ TCP_MIB_CURRESTAB, /* CurrEstab */
+ TCP_MIB_INSEGS, /* InSegs */
+ TCP_MIB_OUTSEGS, /* OutSegs */
+ TCP_MIB_RETRANSSEGS, /* RetransSegs */
+ TCP_MIB_INERRS, /* InErrs */
+ TCP_MIB_OUTRSTS, /* OutRsts */
+ TCP_MIB_CSUMERRORS, /* InCsumErrors */
+ TCP_MIB_MAX
+};
+
+/* udp mib definitions */
+/*
+ * RFC 1213: MIB-II UDP group
+ * RFC 2013 (updates 1213): SNMPv2-MIB-UDP
+ */
+/*
+ * Counter indexes into struct udp_mib.mibs[]; UDP_MIB_MAX is not a counter
+ * but the number of entries.
+ */
+enum
+{
+ UDP_MIB_INDATAGRAMS, /* InDatagrams */
+ UDP_MIB_NOPORTS, /* NoPorts */
+ UDP_MIB_INERRORS, /* InErrors */
+ UDP_MIB_OUTDATAGRAMS, /* OutDatagrams */
+ UDP_MIB_RCVBUFERRORS, /* RcvbufErrors */
+ UDP_MIB_SNDBUFERRORS, /* SndbufErrors */
+ UDP_MIB_CSUMERRORS, /* InCsumErrors */
+ UDP_MIB_IGNOREDMULTI, /* IgnoredMulti */
+ UDP_MIB_MAX
+};
+
+/* TCP counters, one slot per TCP_MIB_* index. */
+struct tcp_mib {
+ unsigned long mibs[TCP_MIB_MAX];
+};
+
+/* UDP counters, one slot per UDP_MIB_* index. */
+struct udp_mib {
+ unsigned long mibs[UDP_MIB_MAX];
+};
+
+/*
+ * Combined TCP + UDP counter set, cache-line aligned.
+ * NOTE(review): SNMP_ADD_STATS_ATOMIC() accesses these unsigned long slots
+ * through a rte_atomic64_t pointer, which assumes unsigned long is 64 bits
+ * wide - confirm for every supported target.
+ */
+struct tle_mib {
+ struct tcp_mib tcp;
+ struct udp_mib udp;
+} __rte_cache_aligned;
+
+extern struct tle_mib default_mib;
+
+RTE_DECLARE_PER_LCORE(struct tle_mib *, mib);
+
+#define PERCPU_MIB RTE_PER_LCORE(mib)
+
+/*
+ * Counter update helpers. *mib* is a struct tcp_mib / struct udp_mib
+ * lvalue, *field* one of the TCP_MIB_* / UDP_MIB_* indexes and *n* the
+ * amount to add.
+ * Arguments and whole expansions are parenthesized so the macros keep
+ * their meaning inside larger expressions and with compound arguments.
+ * The _ATOMIC variant goes through rte_atomic64_add(), i.e. it treats the
+ * unsigned long slot as a 64-bit atomic.
+ */
+#define SNMP_INC_STATS(mib, field) ((mib).mibs[(field)]++)
+#define SNMP_DEC_STATS(mib, field) ((mib).mibs[(field)]--)
+#define SNMP_ADD_STATS(mib, field, n) ((mib).mibs[(field)] += (n))
+#define SNMP_ADD_STATS_ATOMIC(mib, field, n) \
+	rte_atomic64_add((rte_atomic64_t *)(&(mib).mibs[(field)]), (n))
+
+/* TCP counters of the per-lcore MIB (PERCPU_MIB->tcp). */
+#define TCP_INC_STATS(field) SNMP_INC_STATS(PERCPU_MIB->tcp, field)
+#define TCP_DEC_STATS(field) SNMP_DEC_STATS(PERCPU_MIB->tcp, field)
+#define TCP_ADD_STATS(field, n) SNMP_ADD_STATS(PERCPU_MIB->tcp, field, n)
+#define TCP_INC_STATS_ATOMIC(field) SNMP_ADD_STATS_ATOMIC(PERCPU_MIB->tcp, field, 1)
+#define TCP_DEC_STATS_ATOMIC(field) SNMP_ADD_STATS_ATOMIC(PERCPU_MIB->tcp, field, (-1))
+
+/* UDP counters of the per-lcore MIB (PERCPU_MIB->udp). */
+#define UDP_INC_STATS(field) SNMP_INC_STATS(PERCPU_MIB->udp, field)
+#define UDP_ADD_STATS(field, n) SNMP_ADD_STATS(PERCPU_MIB->udp, field, n)
+#define UDP_ADD_STATS_ATOMIC(field, n) \
+	SNMP_ADD_STATS_ATOMIC(PERCPU_MIB->udp, field, n)
+
+#endif /* TLE_STATS_H */
diff --git a/lib/libtle_l4p/tle_tcp.h b/lib/libtle_l4p/tle_tcp.h
index b0cbda6..93e853e 100644
--- a/lib/libtle_l4p/tle_tcp.h
+++ b/lib/libtle_l4p/tle_tcp.h
@@ -49,6 +49,7 @@ struct tle_tcp_stream_cfg {
struct tle_tcp_stream_param {
struct tle_tcp_stream_addr addr;
struct tle_tcp_stream_cfg cfg;
+ uint64_t option;
};
/**
@@ -86,6 +87,25 @@ tle_tcp_stream_open(struct tle_ctx *ctx,
int tle_tcp_stream_close(struct tle_stream *s);
/**
+ * shut down the sending side of an open stream (SHUT_WR semantics).
+ * similar to tle_tcp_stream_close(), except:
+ * - rx still works
+ * - er still works
+ * @param s
+ * Pointer to the stream to shut down.
+ * @param how
+ * SHUT_RD, SHUT_WR or SHUT_RDWR; SHUT_RD returns zero without doing
+ * anything.
+ * @return
+ * zero on successful completion.
+ * - -EINVAL - invalid parameter passed to function
+ * - -1, with rte_errno set to ENOTCONN - the stream is not in a state
+ * that can be shut down
+ */
+int tle_tcp_stream_shutdown(struct tle_stream *s, int how);
+
+/**
+ * Send rst on this stream.
+ */
+void tle_tcp_stream_kill(struct tle_stream *s);
+
+/**
* close a group of open streams.
* if the stream is in connected state, then:
* - connection termination would be performed.
@@ -268,6 +288,15 @@ uint16_t tle_tcp_stream_recv(struct tle_stream *s, struct rte_mbuf *pkt[],
uint16_t num);
/**
+ * Get how many bytes are received in recv window.
+ * @param s
+ * TCP stream to receive data from.
+ * @return
+ * bytes of data inside recv window.
+ */
+uint16_t tle_tcp_stream_inq(struct tle_stream *s);
+
+/**
* Reads iovcnt buffers from the for given TCP stream.
* Note that the stream has to be in connected state.
* Data ordering is preserved.
@@ -290,6 +319,19 @@ ssize_t tle_tcp_stream_readv(struct tle_stream *ts, const struct iovec *iov,
int iovcnt);
/**
+ * Like tle_tcp_stream_readv, but with more information returned in msghdr.
+ * Note that the stream has to be in connected state.
+ * @param ts
+ * TCP stream to receive data from.
+ * @param msg
+ * If not NULL, generate control message into msg_control field of msg.
+ * @return
+ * On success, number of bytes read in the stream receive buffer.
+ * In case of error, returns -1 and error code will be set in rte_errno.
+ */
+ssize_t tle_tcp_stream_recvmsg(struct tle_stream *ts, struct msghdr *msg);
+
+/**
* Consume and queue up to *num* packets, that will be sent eventually
* by tle_tcp_tx_bulk().
* Note that the stream has to be in connected state.
@@ -420,6 +462,24 @@ uint16_t tle_tcp_tx_bulk(struct tle_dev *dev, struct rte_mbuf *pkt[],
*/
int tle_tcp_process(struct tle_ctx *ctx, uint32_t num);
+/**
+ * Get tcp info of a tcp stream.
+ * This function is not multi-thread safe.
+ * @param ts
+ * TCP stream to get info from.
+ * @param info
+ * Pointer to store info.
+ * @param optlen
+ * Pointer to length of info.
+ * @return
+ * zero on successful completion.
+ * - -EINVAL - invalid parameter passed to function.
+ */
+int
+tle_tcp_stream_get_info(const struct tle_stream * ts, void *info, socklen_t *optlen);
+
+void tle_tcp_stream_set_keepalive(struct tle_stream *ts);
+
#ifdef __cplusplus
}
#endif
diff --git a/lib/libtle_l4p/tle_udp.h b/lib/libtle_l4p/tle_udp.h
index d3a8fe9..640ed64 100644
--- a/lib/libtle_l4p/tle_udp.h
+++ b/lib/libtle_l4p/tle_udp.h
@@ -35,6 +35,7 @@ struct tle_udp_stream_param {
struct tle_event *send_ev; /**< send event to use. */
struct tle_stream_cb send_cb; /**< send callback to use. */
+ uint64_t option;
};
/**
@@ -55,6 +56,36 @@ tle_udp_stream_open(struct tle_ctx *ctx,
const struct tle_udp_stream_param *prm);
/**
+ * reconfigure an existing stream within the given UDP context using new
+ * parameters.
+ * @param ts
+ * stream to set with new param
+ * @param ctx
+ * UDP context to set the stream within.
+ * @param prm
+ * Parameters used to set the stream.
+ * @return
+ * Pointer to UDP stream structure that can be used in future UDP API calls,
+ * or NULL on error, with error code set in rte_errno.
+ * Possible rte_errno errors include:
+ * - EINVAL - invalid parameter passed to function
+ * - ENFILE - max limit of open streams reached for that context
+ */
+struct tle_stream *
+tle_udp_stream_set(struct tle_stream *ts, struct tle_ctx *ctx,
+ const struct tle_udp_stream_param *prm);
+
+/**
+ * shutdown an open stream.
+ *
+ * @param s
+ * Pointer to the stream to shutdown.
+ * @return
+ * zero on successful completion.
+ * - -EINVAL - invalid parameter passed to function
+ */
+int tle_udp_stream_shutdown(struct tle_stream *s, int how);
+
+/**
* close an open stream.
* All packets still remaining in stream receive buffer will be freed.
* All packets still remaining in stream transmit buffer will be kept
@@ -180,6 +211,24 @@ uint16_t tle_udp_stream_recv(struct tle_stream *s, struct rte_mbuf *pkt[],
uint16_t tle_udp_stream_send(struct tle_stream *s, struct rte_mbuf *pkt[],
uint16_t num, const struct sockaddr *dst_addr);
+/**
+ * updates configuration (associated events, callbacks, stream parameters)
+ * for the given streams.
+ * @param ts
+ * An array of pointers to the streams to update.
+ * @param prm
+ * An array of parameters to update for the given streams.
+ * @param num
+ * Number of elements in the *ts* and *prm* arrays.
+ * @return
+ * number of streams successfully updated.
+ * In case of error, error code set in rte_errno.
+ * Possible rte_errno errors include:
+ * - EINVAL - invalid parameter passed to function
+ */
+uint32_t tle_udp_stream_update_cfg(struct tle_stream *ts[],
+ struct tle_udp_stream_param prm[], uint32_t num);
+
#ifdef __cplusplus
}
#endif
diff --git a/lib/libtle_l4p/udp_rxtx.c b/lib/libtle_l4p/udp_rxtx.c
index 84a13ea..e9539b9 100644
--- a/lib/libtle_l4p/udp_rxtx.c
+++ b/lib/libtle_l4p/udp_rxtx.c
@@ -13,7 +13,6 @@
* limitations under the License.
*/
-#include <rte_malloc.h>
#include <rte_errno.h>
#include <rte_ethdev.h>
#include <rte_ip.h>
@@ -24,14 +23,11 @@
#include "misc.h"
static inline struct tle_udp_stream *
-rx_stream_obtain(struct tle_dev *dev, uint32_t type, uint32_t port)
+rx_stream_obtain_by_tuples(struct stbl *st, const union pkt_info *pi)
{
struct tle_udp_stream *s;
- if (type >= TLE_VNUM || dev->dp[type] == NULL)
- return NULL;
-
- s = (struct tle_udp_stream *)dev->dp[type]->streams[port];
+ s = UDP_STREAM(stbl_find_stream(st, pi));
if (s == NULL)
return NULL;
@@ -41,6 +37,24 @@ rx_stream_obtain(struct tle_dev *dev, uint32_t type, uint32_t port)
return s;
}
+/*
+ * Find the UDP stream registered in the context's bhash table for the
+ * packet's destination address/port (IPv4 table when type is TLE_V4,
+ * IPv6 lookup otherwise) and acquire its rx.use.
+ * Returns the stream with rx.use acquired, or NULL when nothing matches
+ * or rwl_acquire() reports a negative result.
+ */
+static inline struct tle_udp_stream *
+rx_stream_obtain(struct tle_dev *dev, uint32_t type, const union pkt_info *pi)
+{
+	struct tle_udp_stream *strm;
+
+	if (type != TLE_V4)
+		strm = bhash_lookup6(dev->ctx->bhash[type],
+			pi->addr6->dst, pi->port.dst, 1);
+	else
+		strm = bhash_lookup4(dev->ctx->bhash[type],
+			pi->addr4.dst, pi->port.dst, 1);
+
+	/* nothing registered for this destination. */
+	if (strm == NULL)
+		return NULL;
+
+	/* a negative result means the RX side could not be acquired. */
+	return (rwl_acquire(&strm->rx.use) < 0) ? NULL : strm;
+}
+
static inline uint16_t
get_pkt_type(const struct rte_mbuf *m)
{
@@ -57,8 +71,9 @@ get_pkt_type(const struct rte_mbuf *m)
}
static inline union l4_ports
-pkt_info(struct rte_mbuf *m, union l4_ports *ports, union ipv4_addrs *addr4,
- union ipv6_addrs **addr6)
+pkt_info_udp(struct rte_mbuf *m, union l4_ports *ports,
+ union ipv4_addrs *addr4, union ipv6_addrs **addr6,
+ union pkt_info *pi)
{
uint32_t len;
union l4_ports ret, *up;
@@ -71,15 +86,20 @@ pkt_info(struct rte_mbuf *m, union l4_ports *ports, union ipv4_addrs *addr4,
pa4 = rte_pktmbuf_mtod_offset(m, union ipv4_addrs *,
len + offsetof(struct ipv4_hdr, src_addr));
addr4->raw = pa4->raw;
+ pi->addr4.raw = pa4->raw;
+ pi->tf.type = TLE_V4;
} else if (ret.src == TLE_V6) {
*addr6 = rte_pktmbuf_mtod_offset(m, union ipv6_addrs *,
len + offsetof(struct ipv6_hdr, src_addr));
+ pi->addr6 = *addr6;
+ pi->tf.type = TLE_V6;
}
len += m->l3_len;
up = rte_pktmbuf_mtod_offset(m, union l4_ports *,
len + offsetof(struct udp_hdr, src_port));
ports->raw = up->raw;
+ pi->port.raw = up->raw;
ret.dst = ports->dst;
return ret;
}
@@ -96,6 +116,11 @@ rx_stream(struct tle_udp_stream *s, void *mb[], struct rte_mbuf *rp[],
r = _rte_ring_enqueue_burst(s->rx.q, mb, num);
+ if (unlikely(r != num)) {
+ UDP_ADD_STATS(UDP_MIB_RCVBUFERRORS, num - r);
+ UDP_ADD_STATS(UDP_MIB_INERRORS, num - r);
+ }
+
/* if RX queue was empty invoke user RX notification callback. */
if (s->rx.cb.func != NULL && r != 0 && rte_ring_count(s->rx.q) == r)
s->rx.cb.func(s->rx.cb.data, &s->s);
@@ -164,28 +189,64 @@ rx_stream4(struct tle_udp_stream *s, struct rte_mbuf *pkt[],
return rx_stream(s, mb, rp + k, rc + k, n);
}
+/*
+ * Count how many consecutive entries at the start of pi[] describe the
+ * same flow as pi[0].
+ * Consider 2 UDP pkt_info *equal* if their:
+ * - types (IPv4/IPv6)
+ * - UDP src and dst ports
+ * - IP src and dst addresses
+ * are equal.
+ * Returns the length of that leading run, counting pi[0] itself, so the
+ * result is always at least 1 (it is exactly 1 when pi[0] is neither
+ * TLE_V4 nor TLE_V6).
+ * NOTE(review): the scan starts at index 1 and stops on i == num, so the
+ * caller is expected to pass num >= 1.
+ */
+static inline int
+udp_pkt_info_bulk_eq(const union pkt_info pi[], uint32_t num)
+{
+ uint32_t i;
+
+ i = 1;
+
+ if (pi[0].tf.type == TLE_V4) {
+ while (i != num && pi[i].tf.type == TLE_V4 &&
+ pi[0].port.raw == pi[i].port.raw &&
+ pi[0].addr4.raw == pi[i].addr4.raw)
+ i++;
+ } else if (pi[0].tf.type == TLE_V6) {
+ while (i != num && pi[i].tf.type == TLE_V6 &&
+ pi[0].port.raw == pi[i].port.raw &&
+ ymm_cmp(&pi[0].addr6->raw,
+ &pi[i].addr6->raw) == 0)
+ i++;
+ }
+
+ return i;
+}
+
uint16_t
tle_udp_rx_bulk(struct tle_dev *dev, struct rte_mbuf *pkt[],
struct rte_mbuf *rp[], int32_t rc[], uint16_t num)
{
+ struct stbl *st;
struct tle_udp_stream *s;
- uint32_t i, j, k, n, p, t;
+ uint32_t i, j, k, n, t;
union l4_ports tp[num], port[num];
union ipv4_addrs a4[num];
union ipv6_addrs *pa6[num];
+ union pkt_info pi[num];
+
+ st = CTX_UDP_STLB(dev->ctx);
for (i = 0; i != num; i++)
- tp[i] = pkt_info(pkt[i], &port[i], &a4[i], &pa6[i]);
+ tp[i] = pkt_info_udp(pkt[i], &port[i], &a4[i],
+ &pa6[i], &pi[i]);
k = 0;
for (i = 0; i != num; i = j) {
- for (j = i + 1; j != num && tp[j].raw == tp[i].raw; j++)
- ;
+ j = i + udp_pkt_info_bulk_eq(pi + i, num - i);
t = tp[i].src;
- p = tp[i].dst;
- s = rx_stream_obtain(dev, t, p);
+
+ s = rx_stream_obtain_by_tuples(st, &pi[i]);
+ if (s == NULL)
+ s = rx_stream_obtain(dev, t, &pi[i]);
if (s != NULL) {
if (t == TLE_V4)
@@ -202,6 +263,7 @@ tle_udp_rx_bulk(struct tle_dev *dev, struct rte_mbuf *pkt[],
rwl_release(&s->rx.use);
} else {
+ UDP_ADD_STATS(UDP_MIB_NOPORTS, j - i);
for (; i != j; i++) {
rc[k] = ENOENT;
rp[k] = pkt[i];
@@ -262,6 +324,8 @@ tle_udp_tx_bulk(struct tle_dev *dev, struct rte_mbuf *pkt[], uint16_t num)
stream_drb_release(s, drb + i, j - i);
}
+ UDP_ADD_STATS(UDP_MIB_OUTDATAGRAMS, n);
+
return n;
}
@@ -272,24 +336,18 @@ tle_udp_tx_bulk(struct tle_dev *dev, struct rte_mbuf *pkt[], uint16_t num)
static inline uint32_t
recv_pkt_process(struct rte_mbuf *m[], uint32_t num, uint32_t type)
{
- uint32_t i, k;
- uint64_t flg[num], ofl[num];
-
- for (i = 0; i != num; i++) {
- flg[i] = m[i]->ol_flags;
- ofl[i] = m[i]->tx_offload;
- }
+ uint32_t i, k, offset;
- k = 0;
- for (i = 0; i != num; i++) {
-
- /* drop packets with invalid cksum(s). */
- if (check_pkt_csum(m[i], flg[i], type, IPPROTO_UDP) != 0) {
+ for (i = 0, k = 0; i != num; i++) {
+ if (check_pkt_csum(m[i], type, IPPROTO_UDP) != 0) {
+ UDP_INC_STATS(UDP_MIB_CSUMERRORS);
rte_pktmbuf_free(m[i]);
m[i] = NULL;
k++;
- } else
- rte_pktmbuf_adj(m[i], _tx_offload_l4_offset(ofl[i]));
+ } else {
+ offset = _tx_offload_l4_offset(m[i]->tx_offload);
+ rte_pktmbuf_adj(m[i], offset);
+ }
}
return k;
@@ -302,9 +360,25 @@ tle_udp_stream_recv(struct tle_stream *us, struct rte_mbuf *pkt[], uint16_t num)
struct tle_udp_stream *s;
s = UDP_STREAM(us);
+ n = 0;
+
+again:
n = _rte_ring_mc_dequeue_burst(s->rx.q, (void **)pkt, num);
- if (n == 0)
+ if (n == 0) {
+ if (rwl_try_acquire(&s->rx.use) > 0)
+ rte_errno = EAGAIN;
+ else
+ rte_errno = ESHUTDOWN;
+ rwl_release(&s->rx.use);
return 0;
+ }
+
+ k = recv_pkt_process(pkt, n, s->s.type);
+ if (unlikely(k))
+ UDP_ADD_STATS_ATOMIC(UDP_MIB_CSUMERRORS, k);
+ n = compress_pkt_list(pkt, n, k);
+ if (n == 0)
+ goto again;
/*
* if we still have packets to read,
@@ -316,8 +390,8 @@ tle_udp_stream_recv(struct tle_stream *us, struct rte_mbuf *pkt[], uint16_t num)
rwl_release(&s->rx.use);
}
- k = recv_pkt_process(pkt, n, s->s.type);
- return compress_pkt_list(pkt, n, k);
+ UDP_ADD_STATS_ATOMIC(UDP_MIB_INDATAGRAMS, n);
+ return n;
}
static inline int
@@ -413,7 +487,7 @@ fragment(struct rte_mbuf *pkt, struct rte_mbuf *frag[], uint32_t num,
/* Remove the Ethernet header from the input packet */
rte_pktmbuf_adj(pkt, dst->l2_len);
- mtu = dst->mtu - dst->l2_len;
+ mtu = dst->mtu;
/* fragment packet */
if (type == TLE_V4)
@@ -475,13 +549,22 @@ queue_pkt_out(struct tle_udp_stream *s, struct tle_dev *dev,
nb += nbc;
/* no free drbs, can't send anything */
- if (nb == 0)
+ if (unlikely(nb == 0)) {
+ if (all_or_nothing)
+ UDP_ADD_STATS_ATOMIC(UDP_MIB_SNDBUFERRORS, 1);
+ else
+ UDP_ADD_STATS_ATOMIC(UDP_MIB_SNDBUFERRORS, nb_pkt);
return 0;
+ }
/* not enough free drbs, reduce number of packets to send. */
else if (nb != nbm) {
- if (all_or_nothing)
+ if (all_or_nothing) {
+ UDP_ADD_STATS_ATOMIC(UDP_MIB_SNDBUFERRORS, 1);
return 0;
+ }
+
+ UDP_ADD_STATS_ATOMIC(UDP_MIB_SNDBUFERRORS, nb_pkt - nb * bsz);
nb_pkt = nb * bsz;
}
@@ -509,12 +592,18 @@ tle_udp_stream_send(struct tle_stream *us, struct rte_mbuf *pkt[],
const struct sockaddr_in *d4;
const struct sockaddr_in6 *d6;
struct tle_udp_stream *s;
- const void *da;
+ const void *sa, *da;
union udph udph;
struct tle_dest dst;
struct tle_drb *drb[num];
+ uint8_t ufo;
s = UDP_STREAM(us);
+ if (rwl_acquire(&s->tx.use) < 0) {
+ rte_errno = EPIPE; /* tx is shutdown */
+ return 0;
+ }
+
type = s->s.type;
/* start filling UDP header. */
@@ -523,7 +612,10 @@ tle_udp_stream_send(struct tle_stream *us, struct rte_mbuf *pkt[],
/* figure out what destination addr/port to use. */
if (dst_addr != NULL) {
- if (dst_addr->sa_family != s->prm.remote_addr.ss_family) {
+ if (dst_addr->sa_family != s->prm.remote_addr.ss_family &&
+ (s->prm.remote_addr.ss_family == AF_INET ||
+ !IN6_IS_ADDR_UNSPECIFIED(&s->s.ipv6.addr.dst))) {
+ rwl_release(&s->tx.use);
rte_errno = EINVAL;
return 0;
}
@@ -531,21 +623,28 @@ tle_udp_stream_send(struct tle_stream *us, struct rte_mbuf *pkt[],
d4 = (const struct sockaddr_in *)dst_addr;
da = &d4->sin_addr;
udph.ports.dst = d4->sin_port;
+ sa = &s->s.ipv4.addr.dst;
} else {
d6 = (const struct sockaddr_in6 *)dst_addr;
da = &d6->sin6_addr;
udph.ports.dst = d6->sin6_port;
+ sa = &s->s.ipv6.addr.dst;
}
} else {
udph.ports.dst = s->s.port.src;
- if (type == TLE_V4)
+ if (type == TLE_V4) {
da = &s->s.ipv4.addr.src;
- else
+ sa = &s->s.ipv4.addr.dst;
+ }
+ else {
da = &s->s.ipv6.addr.src;
+ sa = &s->s.ipv6.addr.dst;
+ }
}
- di = stream_get_dest(&s->s, da, &dst);
+ di = stream_get_dest(type, &s->s, sa, da, &dst);
if (di < 0) {
+ rwl_release(&s->tx.use);
rte_errno = -di;
return 0;
}
@@ -553,12 +652,7 @@ tle_udp_stream_send(struct tle_stream *us, struct rte_mbuf *pkt[],
pid = rte_atomic32_add_return(&dst.dev->tx.packet_id[type], num) - num;
mtu = dst.mtu - dst.l2_len - dst.l3_len;
- /* mark stream as not closable. */
- if (rwl_acquire(&s->tx.use) < 0) {
- rte_errno = EAGAIN;
- return 0;
- }
-
+ ufo = dst.dev->prm.tx_offload & DEV_TX_OFFLOAD_UDP_TSO;
nb = 0;
for (i = 0, k = 0; k != num; k = i) {
@@ -568,7 +662,7 @@ tle_udp_stream_send(struct tle_stream *us, struct rte_mbuf *pkt[],
ol_flags = dst.dev->tx.ol_flags[type];
while (i != num && frg == 0) {
- frg = pkt[i]->pkt_len > mtu;
+ frg = (!ufo) && pkt[i]->pkt_len > mtu;
if (frg != 0)
ol_flags &= ~PKT_TX_UDP_CKSUM;
rc = udp_fill_mbuf(pkt[i], type, ol_flags, pid + i,
diff --git a/lib/libtle_l4p/udp_stream.c b/lib/libtle_l4p/udp_stream.c
index 29f5a40..0cd5c27 100644
--- a/lib/libtle_l4p/udp_stream.c
+++ b/lib/libtle_l4p/udp_stream.c
@@ -43,74 +43,87 @@ fini_stream(struct tle_udp_stream *s)
+/*
+ * Release the UDP stream resources attached to ctx: tear down the stream
+ * table, finalize every stream still on the free list, free the
+ * udp_streams container and leave the free list empty.
+ */
static void
udp_fini_streams(struct tle_ctx *ctx)
{
- uint32_t i;
- struct tle_udp_stream *s;
+	struct udp_streams *us;
+	struct tle_stream *s;
+
+	us = CTX_UDP_STREAMS(ctx);
+	if (us != NULL) {
+		stbl_fini(&us->st);
+
+		/*
+		 * Drain the list itself rather than counting nb_free down:
+		 * a post-decrement loop leaves the counter wrapped past zero
+		 * and dereferences NULL if the counter and list ever disagree.
+		 */
+		while ((s = STAILQ_FIRST(&ctx->streams.free)) != NULL) {
+			/* unlink before finalizing the element. */
+			STAILQ_REMOVE_HEAD(&ctx->streams.free, link);
+			fini_stream(UDP_STREAM(s));
+		}
+		ctx->streams.nb_free = 0;
- s = ctx->streams.buf;
- if (s != NULL) {
- for (i = 0; i != ctx->prm.max_streams; i++)
- fini_stream(s + i);
}
- rte_free(s);
+	rte_free(us);
ctx->streams.buf = NULL;
STAILQ_INIT(&ctx->streams.free);
}
+/* stream memory layout:
+ * [tle_udp_stream] [rx.q] [tx.drb.r]
+ */
static int
-init_stream(struct tle_ctx *ctx, struct tle_udp_stream *s)
+add_stream(struct tle_ctx *ctx)
{
- size_t bsz, rsz, sz;
- uint32_t i, k, n, nb;
+ size_t sz_s, sz_rxq, sz_drb_r, sz;
+ /* for rx.q */
+ uint32_t n_rxq;
+ /* for tx.drb.r */
+ size_t bsz, rsz;
struct tle_drb *drb;
- char name[RTE_RING_NAMESIZE];
+ uint32_t k, nb, n_drb;
- /* init RX part. */
-
- n = RTE_MAX(ctx->prm.max_stream_rbufs, 1U);
- n = rte_align32pow2(n);
- sz = rte_ring_get_memsize(n);
-
- s->rx.q = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE,
- ctx->prm.socket_id);
- if (s->rx.q == NULL) {
- UDP_LOG(ERR, "%s(%p): allocation of %zu bytes on socket %d "
- "failed with error code: %d\n",
- __func__, s, sz, ctx->prm.socket_id, rte_errno);
- return -ENOMEM;
- }
+ uint32_t i, f;
+ char name[RTE_RING_NAMESIZE];
+ struct tle_udp_stream *s;
- snprintf(name, sizeof(name), "%p@%zu", s, sz);
- rte_ring_init(s->rx.q, name, n, RING_F_SP_ENQ);
+ // stream
+ sz_s = RTE_ALIGN_CEIL(sizeof(*s), RTE_CACHE_LINE_SIZE);
- /* init TX part. */
+ // rx.q
+ n_rxq = RTE_MAX(ctx->prm.max_stream_rbufs, 1U);
+ n_rxq = rte_align32pow2(n_rxq);
+ sz_rxq = rte_ring_get_memsize(n_rxq);
+ sz_rxq = RTE_ALIGN_CEIL(sz_rxq, RTE_CACHE_LINE_SIZE);
+ // tx.drb.r
nb = drb_nb_elem(ctx);
k = calc_stream_drb_num(ctx, nb);
- n = rte_align32pow2(k);
-
- /* size of the drbs ring */
- rsz = rte_ring_get_memsize(n);
+ n_drb = rte_align32pow2(k);
+ rsz = rte_ring_get_memsize(n_drb); /* size of the drbs ring */
rsz = RTE_ALIGN_CEIL(rsz, RTE_CACHE_LINE_SIZE);
+ bsz = tle_drb_calc_size(nb); /* size of the drb. */
+ sz_drb_r = rsz + bsz * k; /* total stream drbs size. */
+ sz_drb_r = RTE_ALIGN_CEIL(sz_drb_r, RTE_CACHE_LINE_SIZE);
- /* size of the drb. */
- bsz = tle_drb_calc_size(nb);
-
- /* total stream drbs size. */
- sz = rsz + bsz * k;
-
- s->tx.drb.r = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE,
- ctx->prm.socket_id);
- if (s->tx.drb.r == NULL) {
- UDP_LOG(ERR, "%s(%p): allocation of %zu bytes on socket %d "
+ sz = sz_s + sz_rxq + sz_drb_r;
+ s = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE,
+ ctx->prm.socket_id);
+ if (s == NULL) {
+ UDP_LOG(ERR, "%s: allocation of %zu bytes on socket %d "
"failed with error code: %d\n",
- __func__, s, sz, ctx->prm.socket_id, rte_errno);
+ __func__, sz, ctx->prm.socket_id, rte_errno);
return -ENOMEM;
}
- snprintf(name, sizeof(name), "%p@%zu", s, sz);
- rte_ring_init(s->tx.drb.r, name, n, 0);
+ s->rx.q = (struct rte_ring *)((uintptr_t)s + sz_s);
+ s->tx.drb.r = (struct rte_ring *)((uintptr_t)s->rx.q + sz_rxq);
+
+ // ring flags
+ f = ((ctx->prm.flags & TLE_CTX_FLAG_ST) == 0) ? 0 :
+ (RING_F_SP_ENQ | RING_F_SC_DEQ);
+
+ /* init RX part. */
+ snprintf(name, sizeof(name), "%p@%zu", s->rx.q, sz_rxq);
+ rte_ring_init(s->rx.q, name, n_rxq, f);
+ /* init TX part. */
+ snprintf(name, sizeof(name), "%p@%zu", s->tx.drb.r, sz_drb_r);
+ rte_ring_init(s->tx.drb.r, name, n_drb, f);
for (i = 0; i != k; i++) {
drb = (struct tle_drb *)((uintptr_t)s->tx.drb.r +
rsz + bsz * i);
@@ -146,38 +159,59 @@ udp_init_streams(struct tle_ctx *ctx)
size_t sz;
uint32_t i;
int32_t rc;
- struct tle_udp_stream *s;
+ struct udp_streams *us;
- sz = sizeof(*s) * ctx->prm.max_streams;
- s = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE,
+ sz = sizeof(*us);
+ us = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE,
ctx->prm.socket_id);
- if (s == NULL) {
+ if (us == NULL) {
UDP_LOG(ERR, "allocation of %zu bytes on socket %d "
"for %u udp_streams failed\n",
sz, ctx->prm.socket_id, ctx->prm.max_streams);
return -ENOMEM;
}
- ctx->streams.buf = s;
+ ctx->streams.buf = us;
STAILQ_INIT(&ctx->streams.free);
- for (i = 0; i != ctx->prm.max_streams; i++) {
- rc = init_stream(ctx, s + i);
- if (rc != 0) {
- UDP_LOG(ERR, "initalisation of %u-th stream failed", i);
- udp_fini_streams(ctx);
- return rc;
- }
+ rc = stbl_init(&us->st, (ctx->prm.flags & TLE_CTX_FLAG_ST) == 0);
+ if (rc < 0) {
+ UDP_LOG(ERR, "failed to init UDP stbl: rc = %dl\n", rc);
+ return rc;
}
- return 0;
+ for (i = 0; rc == 0 && i != ctx->prm.min_streams; i++)
+ rc = add_stream(ctx);
+
+ if (rc != 0) {
+ UDP_LOG(ERR, "initalisation of %u-th stream failed", i);
+ udp_fini_streams(ctx);
+ }
+
+ return rc;
}
-static void __attribute__((constructor))
+/*
+ * Grow the stream pool of ctx by up to ctx->prm.delta_streams streams,
+ * never going beyond ctx->prm.max_streams in total.
+ * Stops at the first add_stream() failure.
+ * Returns the number of streams actually added.
+ */
+static uint32_t
+udp_more_streams(struct tle_ctx *ctx)
+{
+	uint32_t i, nb;
+	uint32_t nb_max = ctx->prm.max_streams;
+	uint32_t nb_cur = ctx->streams.nb_cur;
+
+	/*
+	 * Already at (or beyond) the limit: bail out before the unsigned
+	 * subtraction below can wrap and defeat the max_streams cap.
+	 */
+	if (nb_cur >= nb_max)
+		return 0;
+
+	nb = RTE_MIN(ctx->prm.delta_streams, nb_max - nb_cur);
+	for (i = 0; i < nb; i++)
+		if (add_stream(ctx) != 0)
+			break;
+
+	return i;
+}
+
+static void __attribute__((constructor(101)))
udp_stream_setup(void)
{
static const struct stream_ops udp_ops = {
.init_streams = udp_init_streams,
+ .more_streams = udp_more_streams,
.fini_streams = udp_fini_streams,
.free_drbs = udp_free_drbs,
};
@@ -188,8 +222,8 @@ udp_stream_setup(void)
+/*
+ * Applies rwl_try_down() to both s->rx.use and s->tx.use.
+ * NOTE(review): presumably this marks the stream as unavailable for RX/TX
+ * without waiting for current users - confirm against rwl_try_down().
+ */
static inline void
stream_down(struct tle_udp_stream *s)
{
- rwl_down(&s->rx.use);
- rwl_down(&s->tx.use);
+ rwl_try_down(&s->rx.use);
+ rwl_try_down(&s->tx.use);
}
static inline void
@@ -224,6 +258,59 @@ check_stream_prm(const struct tle_ctx *ctx,
}
+/*
+ * Re-configure an already allocated stream *ts* with new parameters:
+ * drops its current context binding and stream-table entry, copies *prm*
+ * into the stream, fills in ports/addresses again and, when a remote
+ * address is given, re-inserts the stream into the UDP stream table.
+ * Returns the stream on success.
+ * On any failure the stream is closed with tle_udp_stream_close(),
+ * rte_errno is set and NULL is returned.
+ * NOTE(review): ts is converted with UDP_STREAM() and dereferenced without
+ * a NULL check - confirm callers never pass NULL.
+ */
struct tle_stream *
+tle_udp_stream_set(struct tle_stream *ts, struct tle_ctx *ctx,
+ const struct tle_udp_stream_param *prm)
+{
+ struct tle_udp_stream *s;
+ int32_t rc;
+
+ /* invalid arguments: the stream is closed, not left as it was. */
+ if (ctx == NULL || prm == NULL || check_stream_prm(ctx, prm) != 0) {
+ tle_udp_stream_close(ts);
+ rte_errno = EINVAL;
+ return NULL;
+ }
+
+ s = UDP_STREAM(ts);
+
+ /* free stream's destination port */
+ /* NOTE(review): rc is overwritten below without being examined. */
+ rc = stream_clear_ctx(ctx, &s->s);
+
+ /* drop the old stream-table entry, if the stream had one. */
+ if (s->ste) {
+ stbl_del_stream(CTX_UDP_STLB(ctx), s->ste, ts);
+ s->ste = NULL;
+ }
+
+ /* copy input parameters. */
+ s->prm = *prm;
+ s->s.option.raw = prm->option;
+
+ /* setup L4 ports and L3 addresses fields. */
+ rc = stream_fill_ctx(ctx, &s->s,
+ (const struct sockaddr *)&prm->local_addr,
+ (const struct sockaddr *)&prm->remote_addr);
+
+ if (rc != 0)
+ goto error;
+
+ /* add stream to the table for non-listen type stream */
+ if (!is_empty_addr((const struct sockaddr *)&prm->remote_addr)) {
+ s->ste = stbl_add_stream(CTX_UDP_STLB(ctx), &s->s);
+ if (s->ste == NULL) {
+ rc = EEXIST;
+ goto error;
+ }
+ }
+
+ return &s->s;
+
+ /* common failure path: close the stream and report rc via rte_errno. */
+error:
+ tle_udp_stream_close(ts);
+ rte_errno = rc;
+ return NULL;
+
+}
+
+struct tle_stream *
tle_udp_stream_open(struct tle_ctx *ctx,
const struct tle_udp_stream_param *prm)
{
@@ -237,42 +324,80 @@ tle_udp_stream_open(struct tle_ctx *ctx,
s = (struct tle_udp_stream *)get_stream(ctx);
if (s == NULL) {
- rte_errno = ENFILE;
- return NULL;
-
- /* some TX still pending for that stream. */
- } else if (UDP_STREAM_TX_PENDING(s)) {
- put_stream(ctx, &s->s, 0);
rte_errno = EAGAIN;
return NULL;
}
/* copy input parameters. */
s->prm = *prm;
+ s->s.option.raw = prm->option;
/* setup L4 ports and L3 addresses fields. */
rc = stream_fill_ctx(ctx, &s->s,
(const struct sockaddr *)&prm->local_addr,
(const struct sockaddr *)&prm->remote_addr);
- if (rc != 0) {
- put_stream(ctx, &s->s, 1);
- s = NULL;
- rte_errno = rc;
- } else {
- /* setup stream notification menchanism */
- s->rx.ev = prm->recv_ev;
- s->rx.cb = prm->recv_cb;
- s->tx.ev = prm->send_ev;
- s->tx.cb = prm->send_cb;
-
- /* mark stream as avaialbe for RX/TX */
- if (s->tx.ev != NULL)
- tle_event_raise(s->tx.ev);
- stream_up(s);
+ if (rc != 0)
+ goto error;
+
+ /* add stream to the table for non-listen type stream */
+ if (!is_empty_addr((const struct sockaddr *)&prm->remote_addr)) {
+ s->ste = stbl_add_stream(CTX_UDP_STLB(ctx), &s->s);
+ if (s->ste == NULL) {
+ rc = EEXIST;
+ goto error;
+ }
}
+ /* setup stream notification menchanism */
+ s->rx.ev = prm->recv_ev;
+ s->rx.cb = prm->recv_cb;
+ s->tx.ev = prm->send_ev;
+ s->tx.cb = prm->send_cb;
+
+ /* mark stream as avaialbe for RX/TX */
+ if (s->tx.ev != NULL)
+ tle_event_raise(s->tx.ev);
+ stream_up(s);
+
return &s->s;
+
+error:
+ put_stream(ctx, &s->s, 1);
+ rte_errno = rc;
+ return NULL;
+}
+
+/*
+ * Shut down the receive side, the transmit side, or both sides of a UDP
+ * stream, depending on *how* (SHUT_RD, SHUT_WR, SHUT_RDWR), and raise the
+ * event attached to each side that was shut down.
+ * Returns 0 on success, -EINVAL for any other value of *how*.
+ */
+int
+tle_udp_stream_shutdown(struct tle_stream *us, int how)
+{
+	struct tle_udp_stream *strm;
+	bool rd_closed, wr_closed;
+
+	strm = UDP_STREAM(us);
+
+	if (how == SHUT_RD) {
+		rwl_down(&strm->rx.use);
+	} else if (how == SHUT_WR) {
+		rwl_down(&strm->tx.use);
+	} else if (how == SHUT_RDWR) {
+		stream_down(strm);
+	} else {
+		return -EINVAL;
+	}
+
+	/* only the three valid modes reach this point. */
+	rd_closed = (how != SHUT_WR);
+	wr_closed = (how != SHUT_RD);
+
+	/* notify whoever waits on the side(s) that just went down. */
+	if (rd_closed && strm->rx.ev != NULL)
+		tle_event_raise(strm->rx.ev);
+	if (wr_closed && strm->tx.ev != NULL)
+		tle_event_raise(strm->tx.ev);
+
+	return 0;
}
int
@@ -312,6 +437,11 @@ tle_udp_stream_close(struct tle_stream *us)
/* empty stream's RX queue */
empty_mbuf_ring(s->rx.q);
+ if (s->ste) {
+ stbl_del_stream(CTX_UDP_STLB(ctx), s->ste, us);
+ s->ste = NULL;
+ }
+
/*
* mark the stream as free again.
* if there still are pkts queued for TX,
@@ -344,3 +474,56 @@ tle_udp_stream_get_param(const struct tle_stream *us,
return 0;
}
+
+/*
+ * helper function, updates stream config:
+ * replaces the stream's RX/TX events and callbacks with the ones from
+ * *prm*, then fires the matching notifications right away.
+ * Always returns 0.
+ */
+static inline int
+stream_update_cfg(struct tle_stream *us, struct tle_udp_stream_param *prm)
+{
+ struct tle_udp_stream *s;
+
+ s = UDP_STREAM(us);
+
+ /* setup stream notification mechanism */
+ s->rx.ev = prm->recv_ev;
+ s->rx.cb = prm->recv_cb;
+ s->tx.ev = prm->send_ev;
+ s->tx.cb = prm->send_cb;
+
+ /* write barrier between storing the new ev/cb and notifying below. */
+ rte_smp_wmb();
+
+ /* invoke async notifications, if any */
+ /* RX: only when the RX queue already holds data; event wins over cb. */
+ if (rte_ring_count(s->rx.q) != 0) {
+ if (s->rx.ev != NULL)
+ tle_event_raise(s->rx.ev);
+ else if (s->rx.cb.func != NULL)
+ s->rx.cb.func(s->rx.cb.data, &s->s);
+ }
+
+ /* always ok to write */
+ if (s->tx.ev != NULL)
+ tle_event_raise(s->tx.ev);
+ else if (s->tx.cb.func != NULL)
+ s->tx.cb.func(s->tx.cb.data, &s->s);
+
+ return 0;
+}
+
+/*
+ * Apply prm[i] to us[i] for i in [0, num), in order.
+ * Stops at the first stream that cannot be updated and stores the error
+ * code in rte_errno; a NULL array (with num != 0) or a NULL stream
+ * pointer is reported as EINVAL.
+ * Returns the number of streams successfully updated.
+ */
+uint32_t
+tle_udp_stream_update_cfg(struct tle_stream *us[],
+	struct tle_udp_stream_param prm[], uint32_t num)
+{
+	int32_t rc;
+	uint32_t i;
+
+	/* nothing can be updated without both arrays. */
+	if (num != 0 && (us == NULL || prm == NULL)) {
+		rte_errno = EINVAL;
+		return 0;
+	}
+
+	for (i = 0; i != num; i++) {
+		/* reject a missing stream instead of dereferencing it. */
+		if (us[i] == NULL) {
+			rte_errno = EINVAL;
+			break;
+		}
+
+		rc = stream_update_cfg(us[i], &prm[i]);
+		if (rc != 0) {
+			rte_errno = -rc;
+			break;
+		}
+	}
+
+	return i;
+}
diff --git a/lib/libtle_l4p/udp_stream.h b/lib/libtle_l4p/udp_stream.h
index a950e56..55a66f8 100644
--- a/lib/libtle_l4p/udp_stream.h
+++ b/lib/libtle_l4p/udp_stream.h
@@ -24,6 +24,7 @@
#include "osdep.h"
#include "ctx.h"
#include "stream.h"
+#include "stream_table.h"
#ifdef __cplusplus
extern "C" {
@@ -41,6 +42,7 @@ union udph {
struct tle_udp_stream {
struct tle_stream s;
+ struct stbl_entry *ste; /* entry in streams table. */
struct {
struct rte_ring *q;
@@ -63,6 +65,13 @@ struct tle_udp_stream {
struct tle_udp_stream_param prm;
} __rte_cache_aligned;
+/* UDP data kept per context; reached through ctx->streams.buf. */
+struct udp_streams {
+ struct stbl st; /* streams table. */
+};
+
+/* ctx->streams.buf viewed as the UDP per-context data. */
+#define CTX_UDP_STREAMS(ctx) ((struct udp_streams *)(ctx)->streams.buf)
+/* pointer to the streams table inside the UDP per-context data. */
+#define CTX_UDP_STLB(ctx) (&CTX_UDP_STREAMS(ctx)->st)
+
#define UDP_STREAM(p) \
((struct tle_udp_stream *)((uintptr_t)(p) - offsetof(struct tle_udp_stream, s)))
diff --git a/lib/libtle_timer/timer.c b/lib/libtle_timer/timer.c
index 8b89fd6..a0169ef 100644
--- a/lib/libtle_timer/timer.c
+++ b/lib/libtle_timer/timer.c
@@ -134,6 +134,30 @@ put_timer(struct tle_timer_list *list, struct tle_timer_elmt *e)
list->num++;
}
+/*
+ * Take one timer element from the wheel's free list.
+ * When the free list is empty, a batch of up to 128 elements is first
+ * allocated with rte_zmalloc_socket() on tw->prm.socket_id and put on
+ * the free list; an allocation failure ends in rte_panic().
+ * Returns whatever get_timer() yields for the free list.
+ * NOTE(review): the batch is capped by max_timer - free.num, but this
+ * branch only runs when the free list has no first element, so if
+ * free.num tracks the list length the cap is effectively max_timer and
+ * does not account for timers currently in use - confirm.
+ * NOTE(review): if the batch size comes out as 0, get_timer() is called
+ * on an empty list - confirm it copes with that.
+ */
+static inline struct tle_timer_elmt *
+get_free_timer(struct tle_timer_wheel *tw)
+{
+ unsigned i, n;
+ struct tle_timer_elmt *e;
+
+ e = LIST_FIRST(&tw->free.head);
+ if (e == NULL) {
+ n = 128;
+ n = RTE_MIN(n, tw->prm.max_timer - tw->free.num);
+ for (i = 0; i < n; i++) {
+ /* alignment argument is sizeof(e), i.e. the size of a pointer. */
+ e = rte_zmalloc_socket(NULL, sizeof(*e),
+ sizeof(e), tw->prm.socket_id);
+ if (e != NULL)
+ put_timer(&tw->free, e);
+ else
+ rte_panic("Failed to allocate timer");
+ }
+ }
+
+ e = get_timer(&tw->free);
+ return e;
+}
+
static inline void
rem_timer(struct tle_timer_list *list, struct tle_timer_elmt *e)
{
@@ -149,8 +173,6 @@ tle_timer_create(struct tle_timer_wheel_args *prm, uint64_t now)
uint32_t i, j;
size_t sz;
struct tle_timer_wheel *tw;
- struct tle_timer_elmt *e;
- struct tle_timer_elmt *timers;
if (prm == NULL) {
rte_errno = -EINVAL;
@@ -169,7 +191,7 @@ tle_timer_create(struct tle_timer_wheel_args *prm, uint64_t now)
return NULL;
}
- sz = sizeof(*tw) + prm->max_timer * sizeof(struct tle_timer_elmt);
+ sz = sizeof(*tw);
/* allocate memory */
tw = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE,
@@ -182,17 +204,11 @@ tle_timer_create(struct tle_timer_wheel_args *prm, uint64_t now)
tw->last_run_time = now;
tw->prm = *prm;
- timers = (struct tle_timer_elmt *)(tw + 1);
/* initialize the lists */
LIST_INIT(&tw->free.head);
LIST_INIT(&tw->expired.head);
- for (i = 0; i < prm->max_timer; i++) {
- e = timers + i;
- put_timer(&tw->free, e);
- }
-
for (i = 0; i < TW_N_RINGS; i++)
for (j = 0; j < TW_SLOTS_PER_RING; j++)
LIST_INIT(&tw->w[i][j].head);
@@ -223,11 +239,6 @@ tle_timer_start(struct tle_timer_wheel *tw, void *obj, uint64_t interval)
return NULL;
}
- if (tw->free.num == 0) {
- rte_errno = ENOMEM;
- return NULL;
- }
-
nb_tick = interval / tw->prm.tick_size;
fast_ring_index = nb_tick & TW_RING_MASK;
@@ -248,7 +259,7 @@ tle_timer_start(struct tle_timer_wheel *tw, void *obj, uint64_t interval)
slow_ring_index %= TW_SLOTS_PER_RING;
ts = &tw->w[TW_RING_SLOW][slow_ring_index];
- e = get_timer(&tw->free);
+ e = get_free_timer(tw);
e->obj = obj;
e->fast_index = fast_ring_index;
put_timer(ts, e);
@@ -260,7 +271,7 @@ tle_timer_start(struct tle_timer_wheel *tw, void *obj, uint64_t interval)
/* Timer expires less than 51.2 seconds from now */
ts = &tw->w[TW_RING_FAST][fast_ring_index];
- e = get_timer(&tw->free);
+ e = get_free_timer(tw);
e->obj = obj;
put_timer(ts, e);
diff --git a/mk/tle.app.mk b/mk/tle.app.mk
index 602b870..14a5c23 100644
--- a/mk/tle.app.mk
+++ b/mk/tle.app.mk
@@ -13,6 +13,10 @@
EXTLIB_BUILD := y
+ifneq ($(HACK_CC),)
+CC = $(HACK_CC)
+endif
+
# we must create the output dir first and recall the same Makefile
# from this directory
ifeq ($(NOT_FIRST_CALL),)
diff --git a/mk/tle.lib.mk b/mk/tle.lib.mk
index 7455585..302cb60 100644
--- a/mk/tle.lib.mk
+++ b/mk/tle.lib.mk
@@ -13,6 +13,10 @@
EXTLIB_BUILD := y
+ifneq ($(HACK_CC),)
+CC = $(HACK_CC)
+endif
+
# we must create the output dir first and recall the same Makefile
# from this directory
ifeq ($(NOT_FIRST_CALL),)
diff --git a/test/Makefile b/test/Makefile
index c5cf270..46ac27d 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -25,4 +25,8 @@ DIRS-y += dring
DIRS-y += gtest
DIRS-y += timer
+ifeq ($(PACKETDRILL),y)
+DIRS-y += packetdrill
+endif
+
include $(TLDK_ROOT)/mk/tle.subdir.mk
diff --git a/test/gtest/Makefile b/test/gtest/Makefile
index e980c23..2f7b800 100644
--- a/test/gtest/Makefile
+++ b/test/gtest/Makefile
@@ -95,6 +95,7 @@ CXXFLAGS += $(EXTRA_CFLAGS)
LDFLAGS += -lstdc++
LDFLAGS += -L$(GMOCK_DIR) -L$(GMOCK_DIR)/../lib -lgmock
+LDLIBS += -L$(GMOCK_DIR)/gtest -L$(GMOCK_DIR)/../lib -lgtest
LDLIBS += -whole-archive -ltle_l4p -ltle_dring
include $(TLDK_ROOT)/mk/tle.cpp-obj.mk
diff --git a/test/gtest/test_tle_ctx.cpp b/test/gtest/test_tle_ctx.cpp
index b9808ee..bbb36e0 100644
--- a/test/gtest/test_tle_ctx.cpp
+++ b/test/gtest/test_tle_ctx.cpp
@@ -32,6 +32,7 @@ TEST(ctx_create, create_invalid_socket)
memset(&prm, 0, sizeof(prm));
prm.socket_id = SOCKET_ID_ANY;
prm.max_streams = 0x10;
+ prm.min_streams = 0x10;
prm.max_stream_rbufs = 0x100;
prm.max_stream_sbufs = 0x100;
diff --git a/test/gtest/test_tle_tcp_stream.cpp b/test/gtest/test_tle_tcp_stream.cpp
index b861049..1538f0b 100644
--- a/test/gtest/test_tle_tcp_stream.cpp
+++ b/test/gtest/test_tle_tcp_stream.cpp
@@ -86,7 +86,7 @@ TEST_F(test_tle_tcp_stream, tcp_stream_test_open_duplicate_ipv4)
stream_dup = tle_tcp_stream_open(ctx,
(const struct tle_tcp_stream_param*)&stream_prm);
ASSERT_EQ(stream_dup, nullptr);
- ASSERT_EQ(rte_errno, EEXIST);
+ ASSERT_EQ(rte_errno, EADDRINUSE);
ret = tle_tcp_stream_close(stream);
ASSERT_EQ(ret, 0);
@@ -103,7 +103,7 @@ TEST_F(test_tle_tcp_stream, tcp_stream_test_open_duplicate_ipv6)
stream_dup = tle_tcp_stream_open(ctx,
(const struct tle_tcp_stream_param*)&stream_prm6);
ASSERT_EQ(stream_dup, nullptr);
- ASSERT_EQ(rte_errno, EEXIST);
+ ASSERT_EQ(rte_errno, EADDRINUSE);
ret = tle_tcp_stream_close(stream6);
ASSERT_EQ(ret, 0);
diff --git a/test/gtest/test_tle_tcp_stream.h b/test/gtest/test_tle_tcp_stream.h
index 2caf2b5..80f0bea 100644
--- a/test/gtest/test_tle_tcp_stream.h
+++ b/test/gtest/test_tle_tcp_stream.h
@@ -32,6 +32,8 @@
#include "test_common.h"
#define MAX_STREAMS 0x100
+#define MIN_STREAMS 0x10
+#define DELTA_STREAMS 0x20
#define MAX_STREAM_RBUFS 0x100
#define MAX_STREAM_SBUFS 0x100
#define RX_NO_OFFLOAD 0x0
@@ -41,6 +43,8 @@ static struct tle_ctx_param ctx_prm_tmpl = {
.socket_id = SOCKET_ID_ANY,
.proto = TLE_PROTO_TCP,
.max_streams = MAX_STREAMS,
+ .min_streams = MIN_STREAMS,
+ .delta_streams = DELTA_STREAMS,
.max_stream_rbufs = MAX_STREAM_RBUFS,
.max_stream_sbufs = MAX_STREAM_SBUFS,
};
diff --git a/test/gtest/test_tle_udp_destroy.cpp b/test/gtest/test_tle_udp_destroy.cpp
index 2f26dd8..49306b5 100644
--- a/test/gtest/test_tle_udp_destroy.cpp
+++ b/test/gtest/test_tle_udp_destroy.cpp
@@ -24,6 +24,7 @@ TEST(udp_destroy_null, udp_destroy_null)
TEST_F(udp_destroy, udp_destroy_positive)
{
int rc;
+ /* start from a clean rte_errno so the check below reflects tle_ctx_destroy() only. */
+ rte_errno = 0;
tle_ctx_destroy(ctx);
ASSERT_EQ(rte_errno, 0);
}
diff --git a/test/gtest/test_tle_udp_stream_gen.cpp b/test/gtest/test_tle_udp_stream_gen.cpp
index 0f60b09..1007e4d 100644
--- a/test/gtest/test_tle_udp_stream_gen.cpp
+++ b/test/gtest/test_tle_udp_stream_gen.cpp
@@ -123,14 +123,13 @@ TEST_P(tle_rx_test, test)
/* Receive packets until we reach end on pcap file*/
do {
nb_rx = rte_eth_rx_burst(portid, 0, m, BURST_SIZE);
- for(auto &d: tp.devs) {
- memset(rc, 0, sizeof(int) * BURST_SIZE);
- nb_rx_bulk = tle_udp_rx_bulk(d.ptr, m, rp, rc, nb_rx);
- d.act_pkts_bulk_rx += nb_rx_bulk;
- for(j = 0; j < BURST_SIZE; j++) {
- if(rc[j] == ENOENT)
- d.act_pkts_enoent += 1;
- }
+ auto &d = tp.devs[0];
+ memset(rc, 0, sizeof(int) * BURST_SIZE);
+ nb_rx_bulk = tle_udp_rx_bulk(d.ptr, m, rp, rc, nb_rx);
+ d.act_pkts_bulk_rx += nb_rx_bulk;
+ for(j = 0; j < BURST_SIZE; j++) {
+ if(rc[j] == ENOENT)
+ d.act_pkts_enoent += 1;
}
for(auto &s: tp.streams) {
@@ -139,14 +138,12 @@ TEST_P(tle_rx_test, test)
}
} while (nb_rx > 0);
-
/*
* Verify results - number of rx packets per dev and stream.
*/
- for(auto &d: tp.devs) {
- EXPECT_EQ(d.act_pkts_bulk_rx, d.exp_pkts_bulk_rx);
- EXPECT_EQ(d.act_pkts_enoent, d.exp_pkts_enoent);
- }
+ auto &d = tp.devs[0];
+ EXPECT_EQ(d.act_pkts_bulk_rx, d.exp_pkts_bulk_rx);
+ EXPECT_EQ(d.act_pkts_enoent, d.exp_pkts_enoent);
for(auto &s: tp.streams) {
EXPECT_EQ(s.act_pkts_rx, s.exp_pkts_rx);
@@ -257,9 +254,9 @@ test_str
* 3 dev, 3 stream per dev, only correct pkts */
"Mixed IPv4+IPv6; Multiple devs with multiple correct streams",
{
- {"10.0.0.1", "2001::1000",RX_NO_OFFLOAD, TX_NO_OFFLOAD, 300, 0, 600},
- {"20.0.0.1", "2002::1000", RX_NO_OFFLOAD, TX_NO_OFFLOAD, 300, 0, 600},
- {"30.0.0.1", "2003::1000", RX_NO_OFFLOAD, TX_NO_OFFLOAD, 300, 0, 600},
+ {"10.0.0.1", "2001::1000",RX_NO_OFFLOAD, TX_NO_OFFLOAD, 900, 0, 0},
+ {"20.0.0.1", "2002::1000", RX_NO_OFFLOAD, TX_NO_OFFLOAD, 900, 0, 0},
+ {"30.0.0.1", "2003::1000", RX_NO_OFFLOAD, TX_NO_OFFLOAD, 900, 0, 0},
},
{
{AF_INET, 10001, 10011, "10.0.0.1", "10.0.0.2", 100, 0},
@@ -268,20 +265,20 @@ test_str
{AF_INET, 20001, 20011, "20.0.0.1", "20.0.0.2", 100, 0},
{AF_INET6, 20002, 20012, "2002::1000", "2002::3000", 100, 0},
{AF_INET6, 20003, 20013, "2002::1000", "2002::4000", 100, 0},
- {AF_INET, 20001, 20011, "30.0.0.1", "30.0.0.2", 100, 0},
- {AF_INET6, 20002, 20012, "2003::1000", "2003::3000", 100, 0},
- {AF_INET6, 20003, 20013, "2003::1000", "2003::4000", 100, 0}
+ {AF_INET, 30001, 30011, "30.0.0.1", "30.0.0.2", 100, 0},
+ {AF_INET6, 30002, 30012, "2003::1000", "2003::3000", 100, 0},
+ {AF_INET6, 30003, 30013, "2003::1000", "2003::4000", 100, 0}
},
{
{AF_INET, "10.0.0.2", "10.0.0.1", 10011, 10001, 100, 0, 0, 0},
{AF_INET, "10.0.0.3", "10.0.0.1", 10012, 10002, 100, 0, 0, 0},
{AF_INET, "20.0.0.2", "20.0.0.1", 20011, 20001, 100, 0, 0, 0},
- {AF_INET, "30.0.0.2", "30.0.0.1", 20011, 20001, 100, 0, 0, 0},
+ {AF_INET, "30.0.0.2", "30.0.0.1", 30011, 30001, 100, 0, 0, 0},
{AF_INET6, "2001::4000", "2001::1000", 10013, 10003, 100, 0, 0, 0},
{AF_INET6, "2002::3000", "2002::1000", 20012, 20002, 100, 0, 0, 0},
{AF_INET6, "2002::4000", "2002::1000", 20013, 20003, 100, 0, 0, 0},
- {AF_INET6, "2003::3000", "2003::1000", 20012, 20002, 100, 0, 0, 0},
- {AF_INET6, "2003::4000", "2003::1000", 20013, 20003, 100, 0, 0, 0},
+ {AF_INET6, "2003::3000", "2003::1000", 30012, 30002, 100, 0, 0, 0},
+ {AF_INET6, "2003::4000", "2003::1000", 30013, 30003, 100, 0, 0, 0},
}
}
));
@@ -425,20 +422,20 @@ test_str
{AF_INET, 20001, 20011, "20.0.0.1", "20.0.0.2", 0, 100},
{AF_INET6, 20002, 20012, "2002::1000", "2002::3000", 0, 100},
{AF_INET6, 20003, 20013, "2002::1000", "2002::4000", 0, 100},
- {AF_INET, 20001, 20011, "30.0.0.1", "30.0.0.2", 0, 100},
- {AF_INET6, 20002, 20012, "2003::1000", "2003::3000", 0, 100},
- {AF_INET6, 20003, 20013, "2003::1000", "2003::4000", 0, 100}
+ {AF_INET, 30001, 30011, "30.0.0.1", "30.0.0.2", 0, 100},
+ {AF_INET6, 30002, 30012, "2003::1000", "2003::3000", 0, 100},
+ {AF_INET6, 30003, 30013, "2003::1000", "2003::4000", 0, 100}
},
{
{AF_INET, "10.0.0.2", "10.0.0.1", 10011, 10001, 100, 0, 0, 0},
{AF_INET, "10.0.0.3", "10.0.0.1", 10012, 10002, 100, 0, 0, 0},
{AF_INET, "20.0.0.2", "20.0.0.1", 20011, 20001, 100, 0, 0, 0},
- {AF_INET, "30.0.0.2", "30.0.0.1", 20011, 20001, 100, 0, 0, 0},
+ {AF_INET, "30.0.0.2", "30.0.0.1", 30011, 30001, 100, 0, 0, 0},
{AF_INET6, "2001::4000", "2001::1000", 10013, 10003, 100, 0, 0, 0},
{AF_INET6, "2002::3000", "2002::1000", 20012, 20002, 100, 0, 0, 0},
{AF_INET6, "2002::4000", "2002::1000", 20013, 20003, 100, 0, 0, 0},
- {AF_INET6, "2003::3000", "2003::1000", 20012, 20002, 100, 0, 0, 0},
- {AF_INET6, "2003::4000", "2003::1000", 20013, 20003, 100, 0, 0, 0},
+ {AF_INET6, "2003::3000", "2003::1000", 30012, 30002, 100, 0, 0, 0},
+ {AF_INET6, "2003::4000", "2003::1000", 30013, 30003, 100, 0, 0, 0},
}
}
));
diff --git a/test/gtest/test_tle_udp_stream_gen.h b/test/gtest/test_tle_udp_stream_gen.h
index 1f3d210..eb92385 100644
--- a/test/gtest/test_tle_udp_stream_gen.h
+++ b/test/gtest/test_tle_udp_stream_gen.h
@@ -379,6 +379,8 @@ test_tle_udp_gen_base::setup_ctx(void)
memset(&ctx_prm, 0, sizeof(ctx_prm));
ctx_prm.socket_id = SOCKET_ID_ANY;
ctx_prm.max_streams = 0x10;
+ ctx_prm.min_streams = 0x8;
+ ctx_prm.delta_streams = 0x8;
ctx_prm.max_stream_rbufs = CTX_MAX_RBUFS;
ctx_prm.max_stream_sbufs = CTX_MAX_SBUFS;
ctx_prm.lookup4 = lookup4_function;
diff --git a/test/packetdrill/COPYING b/test/packetdrill/COPYING
new file mode 100644
index 0000000..d159169
--- /dev/null
+++ b/test/packetdrill/COPYING
@@ -0,0 +1,339 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/test/packetdrill/Makefile b/test/packetdrill/Makefile
new file mode 100644
index 0000000..1cceb47
--- /dev/null
+++ b/test/packetdrill/Makefile
@@ -0,0 +1,2 @@
+# TLDK build: link statically against libtldk from $(TLDK_ROOT).
+# --whole-archive makes the linker take every object of libtldk instead of
+# only the ones that resolve an undefined symbol.
+packetdrill-ext-libs := -lrt -ldl -static -L$(TLDK_ROOT) -Wl,--whole-archive -ltldk -Wl,--no-whole-archive -lnuma -lpthread
+include Makefile.common
diff --git a/test/packetdrill/Makefile.FreeBSD b/test/packetdrill/Makefile.FreeBSD
new file mode 100644
index 0000000..a32f827
--- /dev/null
+++ b/test/packetdrill/Makefile.FreeBSD
@@ -0,0 +1,2 @@
+# FreeBSD build: link with pthreads and libpcap (".include" is BSD make syntax).
+packetdrill-ext-libs := -lpthread -lpcap
+.include "Makefile.common"
diff --git a/test/packetdrill/Makefile.Linux b/test/packetdrill/Makefile.Linux
new file mode 100644
index 0000000..0c8b7ac
--- /dev/null
+++ b/test/packetdrill/Makefile.Linux
@@ -0,0 +1,2 @@
+# Linux build: static link with pthreads, librt and libdl.
+packetdrill-ext-libs := -lpthread -lrt -ldl -static
+include Makefile.common
diff --git a/test/packetdrill/Makefile.NetBSD b/test/packetdrill/Makefile.NetBSD
new file mode 100644
index 0000000..a32f827
--- /dev/null
+++ b/test/packetdrill/Makefile.NetBSD
@@ -0,0 +1,2 @@
+# NetBSD build: link with pthreads and libpcap (".include" is BSD make syntax).
+packetdrill-ext-libs := -lpthread -lpcap
+.include "Makefile.common"
diff --git a/test/packetdrill/Makefile.OpenBSD b/test/packetdrill/Makefile.OpenBSD
new file mode 100644
index 0000000..a32f827
--- /dev/null
+++ b/test/packetdrill/Makefile.OpenBSD
@@ -0,0 +1,2 @@
+# OpenBSD build: link with pthreads and libpcap (".include" is BSD make syntax).
+packetdrill-ext-libs := -lpthread -lpcap
+.include "Makefile.common"
diff --git a/test/packetdrill/Makefile.common b/test/packetdrill/Makefile.common
new file mode 100644
index 0000000..b614d08
--- /dev/null
+++ b/test/packetdrill/Makefile.common
@@ -0,0 +1,63 @@
+# Rules shared by all platform Makefiles.  Both the GNU make wrappers
+# ("include") and the BSD make wrappers (".include") pull this file in, so
+# keep it to syntax both understand.  The wrapper must set
+# packetdrill-ext-libs, which is appended to every link command below.
+
+all: binaries
+
+# These targets do not create a file with their own name.  Without this
+# declaration "make tests" can be skipped as up to date when a tests/
+# directory sits next to this Makefile (the README refers to one).
+.PHONY: all binaries tests clean
+
+CFLAGS = -g -Wall -Werror
+
+# bison writes parser.c, parser.h and parser.output in one run.
+parser.o: parser.y
+	bison --output=parser.c --defines=parser.h --report=state parser.y
+	$(CC) $(CFLAGS) -c parser.c
+
+# lexer.o waits for parser.o, presumably because lexer.c needs the
+# generated parser.h.  Note that it is compiled with its own flags
+# (no -Werror) rather than with $(CFLAGS).
+lexer.o: lexer.l parser.o
+	flex -olexer.c lexer.l
+	$(CC) -O2 -g -Wall -c lexer.c
+
+# Every object except the one holding main(); shared by the tool and the
+# unit-test binaries.
+packetdrill-lib := \
+	checksum.o code.o config.o hash.o hash_map.o ip_address.o ip_prefix.o \
+	netdev.o net_utils.o \
+	packet.o packet_socket_linux.o packet_socket_pcap.o \
+	packet_checksum.o packet_parser.o packet_to_string.o \
+	symbols_linux.o \
+	symbols_freebsd.o \
+	symbols_openbsd.o \
+	symbols_netbsd.o \
+	gre_packet.o icmp_packet.o ip_packet.o tcp_packet.o udp_packet.o \
+	mpls_packet.o \
+	run.o run_command.o run_packet.o run_system_call.o \
+	script.o socket.o system.o \
+	tcp_options.o tcp_options_iterator.o tcp_options_to_string.o \
+	logging.o types.o lexer.o parser.o \
+	fmemopen.o open_memstream.o \
+	link_layer.o wire_conn.o wire_protocol.o \
+	wire_client.o wire_client_netdev.o \
+	wire_server.o wire_server_netdev.o \
+	epoll.o pipe.o file.o so_testing.o wrap.o
+
+packetdrill-objs := packetdrill.o $(packetdrill-lib)
+
+packetdrill: $(packetdrill-objs)
+	$(CC) -o packetdrill -g $(packetdrill-objs) $(packetdrill-ext-libs)
+
+# "make tests" builds the unit-test binaries and then runs each of them.
+test-bins := checksum_test packet_parser_test packet_to_string_test
+tests: $(test-bins)
+	./checksum_test
+	./packet_parser_test
+	./packet_to_string_test
+
+binaries: packetdrill $(test-bins)
+
+checksum_test-objs := $(packetdrill-lib) checksum_test.o
+checksum_test: $(checksum_test-objs)
+	$(CC) -o checksum_test $(checksum_test-objs) $(packetdrill-ext-libs)
+
+packet_parser_test-objs := $(packetdrill-lib) packet_parser_test.o
+packet_parser_test: $(packet_parser_test-objs)
+	$(CC) -o packet_parser_test $(packet_parser_test-objs) \
+		$(packetdrill-ext-libs)
+
+packet_to_string_test-objs := $(packetdrill-lib) packet_to_string_test.o
+packet_to_string_test: $(packet_to_string_test-objs)
+	$(CC) -o packet_to_string_test $(packet_to_string_test-objs) \
+		$(packetdrill-ext-libs)
+
+# Removes objects, binaries and the files generated by flex and bison.
+clean:
+	/bin/rm -f *.o packetdrill lexer.c parser.c parser.h parser.output \
+		$(test-bins)
diff --git a/test/packetdrill/README b/test/packetdrill/README
new file mode 100644
index 0000000..bfa0a47
--- /dev/null
+++ b/test/packetdrill/README
@@ -0,0 +1,58 @@
+
+packetdrill
+===========
+
+This directory contains the source code for the packetdrill network
+stack testing tool.
+
+The web site for packetdrill is here:
+
+https://code.google.com/p/packetdrill/
+
+
+building
+========
+
+To build packetdrill, first install flex and bison.
+
+Then set up the Makefile for your platform:
+
+# ./configure
+
+Then build the tool:
+
+# make
+
+
+running
+=======
+
+Here's a quick example.
+
+On FreeBSD, OpenBSD, and NetBSD, try:
+
+# ./packetdrill tests/bsd/fast_retransmit/fr-4pkt-sack-bsd.pkt
+
+On Linux try:
+
+# ./packetdrill tests/linux/fast_retransmit/fr-4pkt-sack-linux.pkt
+
+
+license
+=======
+
+The packetdrill tool is released under version 2 of the GPL. See the
+COPYING file for full details.
+
+
+discussion and contributions
+==============================
+
+If you have any questions, or code or patches to offer, please join
+the packetdrill e-mail list at:
+
+http://groups.google.com/group/packetdrill
+
+Contributions of both code and tests are welcome!
+
+Enjoy!
diff --git a/test/packetdrill/assert.h b/test/packetdrill/assert.h
new file mode 100644
index 0000000..9d03822
--- /dev/null
+++ b/test/packetdrill/assert.h
@@ -0,0 +1,10 @@
+#include <stdio.h>
+
+/* Fatal-error reporter, declared noreturn and defined outside this header.
+ * Takes a format string plus variadic arguments (presumably printf-style).
+ */
+extern void __attribute__((noreturn)) die(char *format, ...);
+
+/* Project-local assert(): when expr evaluates to false, calls die() with
+ * the stringified expression, the file name and the line number.
+ * There is no NDEBUG check here, so the test is always compiled in.
+ * The do/while (0) wrapper makes the macro behave as a single statement.
+ */
+#define assert(expr) \
+ do { \
+ if (!(expr)) \
+ die("assertion %s failed at %s line %d",\
+ __STRING(expr), __FILE__, __LINE__);\
+ } while (0)
diff --git a/test/packetdrill/capability.h b/test/packetdrill/capability.h
new file mode 100644
index 0000000..7ec5ee2
--- /dev/null
+++ b/test/packetdrill/capability.h
@@ -0,0 +1,102 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+// Author: willemb@google.com (Will de Bruijn)
+//
+// POSIX capability support for Linux: simplified libcap
+// GPL applies, as this interface was inspired by sys/capability.h
+
+#ifndef _LINUX_GTESTS_NET_CAPABILITY_H
+#define _LINUX_GTESTS_NET_CAPABILITY_H
+
+#ifdef HAVE_SYS_CAPABILITY_H
+#include <sys/capability.h>
+#else
+#include <linux/capability.h>
+
+// A capability set: pointer to the kernel's raw capability data, as
+// allocated by cap_get_proc() below.
+typedef struct __user_cap_data_struct *cap_t;
+// A capability number; used as a bit index by cap_get_flag()/cap_set_flag().
+typedef int cap_value_t;
+
+// Selects one of the capability lists. The helpers in this header only
+// accept CAP_EFFECTIVE.
+typedef enum {
+ CAP_EFFECTIVE=0,
+ CAP_PERMITTED=1,
+ CAP_INHERITABLE=2
+} cap_flag_t;
+
+// State of a single capability within a list.
+typedef enum {
+ CAP_CLEAR=0,
+ CAP_SET=1
+} cap_flag_value_t;
+
+// Header handed to capget()/capset() by cap_get_proc()/cap_set_proc().
+// Because it is a static object in a header, every file that includes this
+// header gets its own copy.
+// NOTE(review): pid 0 presumably targets the calling thread -- confirm
+// against capget(2).
+static struct __user_cap_header_struct header = {
+ .version = _LINUX_CAPABILITY_VERSION_3,
+ .pid = 0,
+};
+
+// System calls: implemented in libc
+int capset(cap_user_header_t header, cap_user_data_t data);
+int capget(cap_user_header_t header, const cap_user_data_t data);
+
+// Extract the value of one capability from a capability set.
+// Only flag CAP_EFFECTIVE is supported, and only capability numbers that
+// fit in cap->effective of the first data element; anything else trips
+// an assertion.
+// The result (CAP_SET or CAP_CLEAR) is stored in *val; always returns 0.
+static inline int
+cap_get_flag(cap_t cap, cap_value_t name, cap_flag_t flag, cap_flag_value_t *val)
+{
+	assert(flag == CAP_EFFECTIVE);
+	assert(name >= 0 &&
+	       name < (cap_value_t)(sizeof(cap->effective) * 8));
+	// Unsigned constant: (1 << 31) would overflow a signed int.
+	*val = (cap->effective & (1U << name)) ? CAP_SET : CAP_CLEAR;
+	return 0;
+}
+
+// Set (val == CAP_SET) or clear (any other val) num_name capabilities,
+// listed in names[], in one of the capability lists.
+// Only flag CAP_EFFECTIVE is supported, and, as in cap_get_flag(), only
+// capability numbers that fit in cap->effective of the first data element;
+// anything else trips an assertion instead of shifting out of range.
+// Always returns 0.
+static inline int
+cap_set_flag(cap_t cap, cap_flag_t flag, int num_name,
+	     const cap_value_t *names, cap_flag_value_t val)
+{
+	int i;
+
+	assert(flag == CAP_EFFECTIVE);
+	for (i = 0; i < num_name; i++) {
+		assert(names[i] >= 0 &&
+		       names[i] < (cap_value_t)(sizeof(cap->effective) * 8));
+		// Unsigned constant: (1 << 31) would overflow a signed int.
+		if (val == CAP_SET)
+			cap->effective |= (1U << names[i]);
+		else
+			cap->effective &= ~(1U << names[i]);
+	}
+
+	return 0;
+}
+
+// Get the capability lists from the kernel.
+// Returns a zero-initialised, heap-allocated array of
+// _LINUX_CAPABILITY_U32S_3 elements filled in by capget(), or NULL (after
+// printing a diagnostic with perror) if the allocation or capget() fails.
+// The caller owns the result and releases it with cap_free().
+static inline cap_t
+cap_get_proc(void)
+{
+	cap_t capabilities = calloc(_LINUX_CAPABILITY_U32S_3,
+				    sizeof(*capabilities));
+
+	if (capabilities == NULL) {
+		perror("calloc");
+		return NULL;
+	}
+	if (capget(&header, capabilities)) {
+		// Report first: free() is allowed to disturb errno.
+		perror("capget");
+		free(capabilities);
+		return NULL;
+	}
+
+	return capabilities;
+}
+
+// Update the capability lists in the kernel
+// Passes capabilities to capset() together with the file-level header.
+// Returns 0 on success; on failure prints a diagnostic with perror and
+// returns -1.
+static inline int
+cap_set_proc(cap_t capabilities)
+{
+ if (capset(&header, capabilities)) {
+ perror("capset");
+ return -1;
+ }
+ return 0;
+}
+
+// Free a capability list
+// Hands the pointer to free(); always returns 0.
+static inline int
+cap_free(void *capabilities)
+{
+ free(capabilities);
+ return 0;
+}
+
+#endif /* !HAVE_SYS_CAPABILITY_H */
+#endif /* _LINUX_GTESTS_NET_CAPABILITY_H */
diff --git a/test/packetdrill/checksum.c b/test/packetdrill/checksum.c
new file mode 100644
index 0000000..3e549d3
--- /dev/null
+++ b/test/packetdrill/checksum.c
@@ -0,0 +1,239 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Helpers to calculate IP, TCP, and UDP checksums.
+ */
+
+#include "checksum.h"
+
+#include "assert.h"
+
+/* Add bytes in buffer to a running checksum. Returns the new
+ * intermediate checksum. Use ip_checksum_fold() to convert the
+ * intermediate checksum to final form.
+ *
+ * p:   start of the data; no alignment is required.
+ * len: number of bytes to add.
+ * sum: intermediate checksum so far (0 to start a new one).
+ *
+ * Words are summed in host byte order and are never byte-swapped here.
+ */
+static u64 ip_checksum_partial(const void *p, size_t len, u64 sum)
+{
+	const u8 *bytes = (const u8 *)p;
+	u32 word;
+	u16 half;
+
+	/* Main loop: 32 bits at a time.
+	 * The data is fetched with memcpy() rather than through a cast
+	 * pointer, so a buffer that is not 4-byte aligned, or that holds
+	 * objects of another type, is read without undefined behavior.
+	 */
+	for (; len >= sizeof(word); len -= sizeof(word)) {
+		memcpy(&word, bytes, sizeof(word));
+		sum += word;
+		bytes += sizeof(word);
+	}
+
+	/* Handle un-32bit-aligned trailing bytes */
+	if (len >= sizeof(half)) {
+		memcpy(&half, bytes, sizeof(half));
+		sum += half;
+		bytes += sizeof(half);
+		len -= sizeof(half);
+	}
+	if (len > 0)
+		sum += ntohs(*bytes << 8);	/* RFC says pad last byte */
+
+	return sum;
+}
+
+/* Convert an intermediate checksum from ip_checksum_partial() to final
+ * form: carries above bit 31 are added back into the low 32 bits, then
+ * carries above bit 15 into the low 16 bits, each step repeated until no
+ * carry remains. Returns the bitwise complement of the folded value,
+ * narrowed by the conversion to the __be16 return type.
+ */
+static __be16 ip_checksum_fold(u64 sum)
+{
+ while (sum & ~0xffffffffULL)
+ sum = (sum >> 32) + (sum & 0xffffffffULL);
+ while (sum & 0xffff0000ULL)
+ sum = (sum >> 16) + (sum & 0xffffULL);
+
+ return ~sum;
+}
+
+/* Builds the IPv4 pseudo-header for a TCP/UDP checksum and returns its
+ * intermediate (unfolded) checksum, started from 0.
+ * src_ip and dst_ip are copied into the pseudo-header unchanged; len is
+ * taken in host byte order and stored with htons().
+ */
+static u64 tcp_udp_v4_header_checksum_partial(
+ struct in_addr src_ip, struct in_addr dst_ip, u8 protocol, u16 len)
+{
+ /* The IPv4 pseudo-header is defined in RFC 793, Section 3.1. */
+ struct ipv4_pseudo_header_t {
+ /* We use a union here to avoid aliasing issues with gcc -O2 */
+ union {
+ struct header {
+ struct in_addr src_ip;
+ struct in_addr dst_ip;
+ __u8 mbz;
+ __u8 protocol;
+ __be16 length;
+ } __packed fields;
+ u32 words[3];
+ };
+ };
+ struct ipv4_pseudo_header_t pseudo_header;
+ /* The checksum below covers the whole object, so it must be exactly
+ * the 12 pseudo-header bytes.
+ */
+ assert(sizeof(pseudo_header) == 12);
+
+ /* Fill in the pseudo-header. */
+ pseudo_header.fields.src_ip = src_ip;
+ pseudo_header.fields.dst_ip = dst_ip;
+ pseudo_header.fields.mbz = 0;
+ pseudo_header.fields.protocol = protocol;
+ pseudo_header.fields.length = htons(len);
+ return ip_checksum_partial(&pseudo_header, sizeof(pseudo_header), 0);
+}
+
+/* Calculates the TCP or UDP checksum for IPv4: sums the pseudo-header
+ * built from src_ip, dst_ip, protocol and len, continues the same sum over
+ * len bytes at payload, then folds the result.
+ */
+__be16 tcp_udp_v4_checksum(struct in_addr src_ip, struct in_addr dst_ip,
+ u8 protocol, const void *payload, u16 len)
+{
+ u64 sum = tcp_udp_v4_header_checksum_partial(
+ src_ip, dst_ip, protocol, len);
+ sum = ip_checksum_partial(payload, len, sum);
+ return ip_checksum_fold(sum);
+}
+
+/* Calculates and returns IPv4 header checksum.
+ * Sums ip_header_bytes bytes starting at ip_header, beginning from a zero
+ * intermediate checksum, and folds the result.
+ */
+__be16 ipv4_checksum(void *ip_header, size_t ip_header_bytes)
+{
+ return ip_checksum_fold(
+ ip_checksum_partial(ip_header, ip_header_bytes, 0));
+}
+
+/* Builds the IPv6 pseudo-header for an upper-layer checksum and returns
+ * its intermediate (unfolded) checksum, started from 0.
+ * The addresses are copied in unchanged; len is taken in host byte order
+ * and stored with htonl(); protocol goes into the next_header field.
+ */
+static u64 tcp_udp_v6_header_checksum_partial(
+ const struct in6_addr *src_ip,
+ const struct in6_addr *dst_ip,
+ u8 protocol, u32 len)
+{
+ /* The IPv6 pseudo-header is defined in RFC 2460, Section 8.1. */
+ struct ipv6_pseudo_header_t {
+ /* We use a union here to avoid aliasing issues with gcc -O2 */
+ union {
+ struct header {
+ struct in6_addr src_ip;
+ struct in6_addr dst_ip;
+ __be32 length;
+ __u8 mbz[3];
+ __u8 next_header;
+ } __packed fields;
+ u32 words[10];
+ };
+ };
+ struct ipv6_pseudo_header_t pseudo_header;
+ /* The checksum below covers the whole object, so it must be exactly
+ * the 40 pseudo-header bytes.
+ */
+ assert(sizeof(pseudo_header) == 40);
+
+ /* Fill in the pseudo-header. */
+ pseudo_header.fields.src_ip = *src_ip;
+ pseudo_header.fields.dst_ip = *dst_ip;
+ pseudo_header.fields.length = htonl(len);
+ memset(pseudo_header.fields.mbz, 0, sizeof(pseudo_header.fields.mbz));
+ pseudo_header.fields.next_header = protocol;
+ return ip_checksum_partial(&pseudo_header, sizeof(pseudo_header), 0);
+}
+
+/* Calculates the upper-layer checksum for IPv6: sums the pseudo-header
+ * built from src_ip, dst_ip, protocol and len, continues the same sum over
+ * len bytes at payload, then folds the result.
+ */
+__be16 tcp_udp_v6_checksum(const struct in6_addr *src_ip,
+ const struct in6_addr *dst_ip,
+ u8 protocol, const void *payload, u32 len)
+{
+ u64 sum = tcp_udp_v6_header_checksum_partial(
+ src_ip, dst_ip, protocol, len);
+ sum = ip_checksum_partial(payload, len, sum);
+ return ip_checksum_fold(sum);
+}
+
+/* One table-driven CRC step: folds byte d into the running value c using
+ * the crc_c[] lookup table. The macro assigns to c, so c must be an
+ * lvalue, and c is evaluated more than once.
+ */
+#define CRC32C(c, d) (c = (c>>8) ^ crc_c[(c^(d))&0xFF])
+
+static u32 crc_c[256] = {
+ 0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4,
+ 0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB,
+ 0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B,
+ 0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24,
+ 0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B,
+ 0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384,
+ 0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54,
+ 0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B,
+ 0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A,
+ 0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35,
+ 0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5,
+ 0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA,
+ 0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45,
+ 0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A,
+ 0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A,
+ 0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595,
+ 0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48,
+ 0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957,
+ 0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687,
+ 0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198,
+ 0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927,
+ 0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38,
+ 0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8,
+ 0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7,
+ 0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096,
+ 0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789,
+ 0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859,
+ 0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46,
+ 0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9,
+ 0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6,
+ 0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36,
+ 0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829,
+ 0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C,
+ 0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93,
+ 0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043,
+ 0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C,
+ 0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3,
+ 0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC,
+ 0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 0x0B21572C,
+ 0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033,
+ 0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652,
+ 0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D,
+ 0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D,
+ 0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982,
+ 0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D,
+ 0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622,
+ 0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2,
+ 0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED,
+ 0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530,
+ 0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F,
+ 0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF,
+ 0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0,
+ 0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F,
+ 0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540,
+ 0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90,
+ 0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F,
+ 0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE,
+ 0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1,
+ 0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321,
+ 0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E,
+ 0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81,
+ 0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E,
+ 0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E,
+ 0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351
+};
+
+/* Calculates the CRC32C checksum used by SCTP over len bytes at packet.
+ * The CRC starts from all ones, is run byte by byte through the crc_c[]
+ * table, and is complemented at the end. Its four bytes are then reversed
+ * and the result is passed through htonl() before being returned.
+ */
+__be32 sctp_crc32c(const void *packet, u32 len)
+{
+	const u8 *buf = (const u8 *)packet;
+	u32 crc32c = ~0U;
+	u32 swapped;
+	u32 i;
+
+	for (i = 0; i < len; i++)
+		CRC32C(crc32c, buf[i]);
+	crc32c = ~crc32c;
+
+	/* Reverse the byte order using unsigned arithmetic only: shifting
+	 * a u8 left by 24 would first promote it to a signed int and could
+	 * shift a one into the sign bit.
+	 */
+	swapped = ((crc32c & 0x000000ffU) << 24) |
+		  ((crc32c & 0x0000ff00U) << 8) |
+		  ((crc32c & 0x00ff0000U) >> 8) |
+		  ((crc32c & 0xff000000U) >> 24);
+	return htonl(swapped);
+}
diff --git a/test/packetdrill/checksum.h b/test/packetdrill/checksum.h
new file mode 100644
index 0000000..43681d2
--- /dev/null
+++ b/test/packetdrill/checksum.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Helpers to calculate IP, TCP, UDP, and SCTP checksums.
+ */
+
+#ifndef __CHECKSUM_H__
+#define __CHECKSUM_H__
+
+#include "types.h"
+
+#include <netinet/in.h>
+#include <sys/types.h>
+
+/* IPv4 ... */
+
+/* Calculates and returns IPv4 header checksum (in network byte order). */
+extern __be16 ipv4_checksum(void *ip_header, size_t ip_header_bytes);
+
+/* Calculates TCP or UDP checksum for IPv4 (in network byte order). */
+extern __be16 tcp_udp_v4_checksum(struct in_addr src_ip, struct in_addr dst_ip,
+ u8 protocol, const void *payload, u16 len);
+
+/* IPv6 ... */
+
+/* Calculates TCP, UDP, or ICMP checksum for IPv6 (in network byte order). */
+extern __be16 tcp_udp_v6_checksum(const struct in6_addr *src_ip,
+ const struct in6_addr *dst_ip,
+ u8 protocol, const void *payload, u32 len);
+
+/* SCTP ... */
+
+/* Calculates the CRC32C checksum used by SCTP (in network byte order). */
+extern __be32 sctp_crc32c(const void *packet, u32 len);
+
+#endif /* __CHECKSUM_H__ */
diff --git a/test/packetdrill/checksum_test.c b/test/packetdrill/checksum_test.c
new file mode 100644
index 0000000..08ef2e1
--- /dev/null
+++ b/test/packetdrill/checksum_test.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Unit test for checksum.c.
+ */
+
+#include "checksum.h"
+
+#include <arpa/inet.h>
+#include "assert.h"
+#include "ip.h"
+#include "ipv6.h"
+#include "sctp.h"
+#include "tcp.h"
+
+/* Checks tcp_udp_v4_checksum() against a sample IPv4/TCP packet.
+ * With the packet's own checksum bytes (0x54, 0x12) in place the
+ * computed checksum must be 0; after zeroing the checksum field the
+ * computed checksum must be 0x5412.
+ */
+static void test_tcp_udp_v4_checksum(void)
+{
+	u8 data[] = {
+		0x45, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x00,
+		0xff, 0x06, 0xf9, 0x10, 0x01, 0x01, 0x01, 0x01,
+		0xc0, 0xa8, 0x00, 0x01, 0x04, 0xd2, 0xeb, 0x35,
+		0x00, 0x00, 0x00, 0x00, 0xc6, 0xf0, 0x56, 0x00,
+		0xa0, 0x12, 0x16, 0xa0, 0x54, 0x12, 0x00, 0x00,
+		0x02, 0x04, 0x05, 0xb4, 0x04, 0x02, 0x08, 0x0a,
+		0x00, 0x00, 0x02, 0xbc, 0x00, 0x06, 0x0a, 0xd8,
+		0x01, 0x03, 0x03, 0x07,
+	};
+
+	struct in_addr src_ip, dst_ip;
+	struct tcp *tcp = (struct tcp *) (data + sizeof(struct ipv4));
+	int len = sizeof(data) - sizeof(struct ipv4);
+	u16 checksum = 0;
+	int rc;
+
+	/* Call inet_pton() outside of assert() so that the addresses are
+	 * still initialized if assertions are ever compiled out.
+	 */
+	rc = inet_pton(AF_INET, "1.1.1.1", &src_ip);
+	assert(rc == 1);
+	rc = inet_pton(AF_INET, "192.168.0.1", &dst_ip);
+	assert(rc == 1);
+
+	/* Checksumming a segment that carries a valid checksum yields 0. */
+	checksum =
+	    ntohs(tcp_udp_v4_checksum(src_ip, dst_ip, IPPROTO_TCP, tcp, len));
+	assert(checksum == 0);
+
+	/* With the field cleared we must recompute the original value. */
+	tcp->check = 0;
+	checksum =
+	    ntohs(tcp_udp_v4_checksum(src_ip, dst_ip, IPPROTO_TCP, tcp, len));
+	assert(checksum == 0x5412);
+}
+
+/* Checks tcp_udp_v6_checksum() against a sample IPv6/TCP packet: the
+ * result must be 0 while the packet's checksum bytes (0x06, 0x60) are
+ * present, and 0x0660 once the checksum field has been zeroed.
+ */
+static void test_tcp_udp_v6_checksum(void)
+{
+	u8 data[] = {
+		0x60, 0x00, 0x00, 0x00, 0x00, 0x20, 0x06, 0xff,
+		0x20, 0x01, 0x0d, 0xb8, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+		0xfd, 0x3d, 0xfa, 0x7b, 0xd1, 0x7d, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+		0xd3, 0xe2, 0x1f, 0x90, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x80, 0x02, 0x80, 0x18,
+		0x06, 0x60, 0x00, 0x00, 0x02, 0x04, 0x03, 0xe8,
+		0x04, 0x02, 0x01, 0x01, 0x01, 0x03, 0x03, 0x07,
+	};
+	struct ipv6 *ip6 = (struct ipv6 *) data;
+	struct tcp *segment = (struct tcp *) (data + sizeof(struct ipv6));
+	int segment_len = sizeof(data) - sizeof(struct ipv6);
+	u16 result;
+
+	/* Pass 1: checksum field still holds the valid checksum. */
+	result = ntohs(tcp_udp_v6_checksum(&ip6->src_ip, &ip6->dst_ip,
+					   IPPROTO_TCP, segment,
+					   segment_len));
+	assert(result == 0);
+
+	/* Pass 2: checksum field cleared, expect the original value. */
+	segment->check = 0;
+	result = ntohs(tcp_udp_v6_checksum(&ip6->src_ip, &ip6->dst_ip,
+					   IPPROTO_TCP, segment,
+					   segment_len));
+	assert(result == 0x0660);
+}
+
+/* Checks ipv4_checksum() against a sample 20-byte IPv4 header: the
+ * result must be 0 while the header's checksum bytes (0xf9, 0x10) are
+ * present, and 0xf910 once the checksum field has been zeroed.
+ */
+static void test_ipv4_checksum(void)
+{
+	u8 data[] = {
+		0x45, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x00,
+		0xff, 0x06, 0xf9, 0x10, 0x01, 0x01, 0x01, 0x01,
+		0xc0, 0xa8, 0x00, 0x01,
+	};
+	struct ipv4 *header = (struct ipv4 *) data;
+	u16 result;
+
+	/* Pass 1: header still carries its valid checksum. */
+	result = ntohs(ipv4_checksum(data, sizeof(data)));
+	assert(result == 0);
+
+	/* Pass 2: checksum field cleared, expect the original value. */
+	header->check = 0;
+	result = ntohs(ipv4_checksum(data, sizeof(data)));
+	assert(result == 0xf910);
+}
+
+/* Checks sctp_crc32c() against a sample SCTP packet: after zeroing the
+ * crc32c field of the common header, the recomputed checksum must be
+ * 0xdad73774 (the same bytes the sample data carries at offsets 8-11).
+ */
+static void test_sctp_crc32c(void)
+{
+	u8 data[] = {
+		0x07, 0xd0, 0xd6, 0x61, 0x11, 0x0c, 0xc5, 0x6c,
+		0xda, 0xd7, 0x37, 0x74, 0x06, 0x00, 0x00, 0x0f,
+		0x00, 0x0c, 0x00, 0x0b, 0x47, 0x6f, 0x6f, 0x64,
+		0x62, 0x79, 0x65, 0x00,
+	};
+	struct sctp_common_header *header =
+		(struct sctp_common_header *)data;
+	u32 computed;
+
+	header->crc32c = 0;
+	computed = ntohl(sctp_crc32c(data, sizeof(data)));
+	assert(computed == 0xdad73774);
+}
+
+/* Runs each checksum unit test in turn. The tests check their results
+ * with assert(); returns 0 once all of them have run.
+ */
+int main(void)
+{
+ test_tcp_udp_v4_checksum();
+ test_tcp_udp_v6_checksum();
+ test_ipv4_checksum();
+ test_sctp_crc32c();
+ return 0;
+}
diff --git a/test/packetdrill/code.c b/test/packetdrill/code.c
new file mode 100644
index 0000000..0c38e40
--- /dev/null
+++ b/test/packetdrill/code.c
@@ -0,0 +1,777 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Implementation for a module to write out post-processing code that
+ * can run custom programmatic analyses and constraint verification.
+ */
+
+#include "code.h"
+
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "assert.h"
+#include "run.h"
+#include "tcp.h"
+
+/* We emit the following Python preamble at the top of the output
+ * Python code. It defines a custom exception hook so that when an
+ * exception is raised (such as a failed assertion) we print the file
+ * name and line number of the code snippet in the original test
+ * script that caused the error, not just the file name and line
+ * number in the generated Python file (which will be meaningless or
+ * confusing to the user).
+ *
+ * The hook reads the Python globals _file and _line, which
+ * write_text() assigns immediately before each emitted code snippet.
+ */
+const char python_preamble[] =
+"import sys\n"
+"import traceback\n"
+"def excepthook(etype, value, tb):\n"
+" sys.stderr.write(\"%s:%d: error in Python code\\n\" %\n"
+" (_file, _line))\n"
+" traceback.print_exception(etype, value, tb)\n"
+"\n"
+"sys.excepthook = excepthook\n"
+"\n";
+
+/* Emit the language-specific boilerplate that precedes all code
+ * fragments in the output file, followed by one extra newline.
+ */
+static void write_preamble(struct code_state *code)
+{
+	assert(code->format > FORMAT_NONE);
+	assert(code->format < FORMAT_NUM_TYPES);
+	switch (code->format) {
+	case FORMAT_NONE:
+	case FORMAT_NUM_TYPES:
+		assert(!"bad code format type");
+		/* no break here: control continues into the next case */
+	case FORMAT_PYTHON:
+		fputs(python_preamble, code->file);
+		fputc('\n', code->file);
+		break;
+	/* no default label, so the compiler can flag unhandled formats */
+	}
+}
+
+#if HAVE_TCP_INFO
+
+/* Emit an assignment of the given unsigned value to the variable with
+ * the given name, in the syntax of the configured output format.
+ */
+static void emit_var(struct code_state *code, const char *name, u64 value)
+{
+	FILE *out = code->file;
+
+	assert(code->format > FORMAT_NONE);
+	assert(code->format < FORMAT_NUM_TYPES);
+	switch (code->format) {
+	case FORMAT_NONE:
+	case FORMAT_NUM_TYPES:
+		assert(!"bad code format type");
+		/* no break here: control continues into the next case */
+	case FORMAT_PYTHON:
+		/* Python syntax: one "name = value" line per variable. */
+		fprintf(out, "%s = %llu\n", name, value);
+		break;
+	/* no default label, so the compiler can flag unhandled formats */
+	}
+}
+
+/* Terminate a run of emit_var() assignments with a blank line. */
+static void emit_var_end(struct code_state *code)
+{
+	fputc('\n', code->file);
+}
+
+/* Emit the symbolic constants that scripts can compare tcp_info fields
+ * against. Constants are only emitted when building for linux;
+ * otherwise this function writes nothing.
+ */
+static void write_symbols(struct code_state *code)
+{
+#ifdef linux
+	static const struct {
+		const char *name;
+		u64 value;
+	} symbols[] = {
+		/* Symbolic names for tcpi_state values. */
+		{ "TCP_ESTABLISHED",	TCP_ESTABLISHED },
+		{ "TCP_SYN_SENT",	TCP_SYN_SENT },
+		{ "TCP_SYN_RECV",	TCP_SYN_RECV },
+		{ "TCP_FIN_WAIT1",	TCP_FIN_WAIT1 },
+		{ "TCP_FIN_WAIT2",	TCP_FIN_WAIT2 },
+		{ "TCP_TIME_WAIT",	TCP_TIME_WAIT },
+		{ "TCP_CLOSE",		TCP_CLOSE },
+		{ "TCP_CLOSE_WAIT",	TCP_CLOSE_WAIT },
+		{ "TCP_LAST_ACK",	TCP_LAST_ACK },
+		{ "TCP_LISTEN",		TCP_LISTEN },
+		{ "TCP_CLOSING",	TCP_CLOSING },
+		/* Symbolic names for tcpi_ca_state values. */
+		{ "TCP_CA_Open",	TCP_CA_Open },
+		{ "TCP_CA_Disorder",	TCP_CA_Disorder },
+		{ "TCP_CA_CWR",		TCP_CA_CWR },
+		{ "TCP_CA_Recovery",	TCP_CA_Recovery },
+		{ "TCP_CA_Loss",	TCP_CA_Loss },
+		/* tcpi_options flags. */
+		{ "TCPI_OPT_TIMESTAMPS",	TCPI_OPT_TIMESTAMPS },
+		{ "TCPI_OPT_WSCALE",		TCPI_OPT_WSCALE },
+		{ "TCPI_OPT_ECN",		TCPI_OPT_ECN },
+		{ "TCPI_OPT_SYN_DATA",		TCPI_OPT_SYN_DATA },
+	};
+	size_t i;
+
+	/* Emit in table order, one assignment per symbol. */
+	for (i = 0; i < sizeof(symbols) / sizeof(symbols[0]); i++)
+		emit_var(code, symbols[i].name, symbols[i].value);
+#endif /* linux */
+}
+
+#endif /* HAVE_TCP_INFO */
+
+#ifdef linux
+
+/* Emit one field of the local "info" struct as a variable that has
+ * exactly the same name as the field.
+ */
+#define EMIT_TCPI(field) emit_var(code, #field, info->field)
+
+/* Write out the symbolic constants followed by every recorded field of
+ * the given tcp_info buffer, one variable per field, and a terminating
+ * blank line. len is the number of valid bytes in the buffer and must
+ * cover the whole struct _tcp_info.
+ */
+static void write_tcp_info(struct code_state *code,
+			   const struct _tcp_info *info,
+			   int len)
+{
+	assert(len >= sizeof(struct _tcp_info));
+
+	write_symbols(code);
+
+	/* Emit the recorded values of tcpi_foo values. */
+	EMIT_TCPI(tcpi_state);
+	EMIT_TCPI(tcpi_ca_state);
+	EMIT_TCPI(tcpi_retransmits);
+	EMIT_TCPI(tcpi_probes);
+	EMIT_TCPI(tcpi_backoff);
+	EMIT_TCPI(tcpi_options);
+	EMIT_TCPI(tcpi_snd_wscale);
+	EMIT_TCPI(tcpi_rcv_wscale);
+	EMIT_TCPI(tcpi_delivery_rate_app_limited);
+	EMIT_TCPI(tcpi_rto);
+	EMIT_TCPI(tcpi_ato);
+	EMIT_TCPI(tcpi_snd_mss);
+	EMIT_TCPI(tcpi_rcv_mss);
+	EMIT_TCPI(tcpi_unacked);
+	EMIT_TCPI(tcpi_sacked);
+	EMIT_TCPI(tcpi_lost);
+	EMIT_TCPI(tcpi_retrans);
+	EMIT_TCPI(tcpi_fackets);
+	EMIT_TCPI(tcpi_last_data_sent);
+	EMIT_TCPI(tcpi_last_ack_sent);
+	EMIT_TCPI(tcpi_last_data_recv);
+	EMIT_TCPI(tcpi_last_ack_recv);
+	EMIT_TCPI(tcpi_pmtu);
+	EMIT_TCPI(tcpi_rcv_ssthresh);
+	EMIT_TCPI(tcpi_rtt);
+	EMIT_TCPI(tcpi_rttvar);
+	EMIT_TCPI(tcpi_snd_ssthresh);
+	EMIT_TCPI(tcpi_snd_cwnd);
+	EMIT_TCPI(tcpi_advmss);
+	EMIT_TCPI(tcpi_reordering);
+	EMIT_TCPI(tcpi_total_retrans);
+	EMIT_TCPI(tcpi_pacing_rate);
+	EMIT_TCPI(tcpi_max_pacing_rate);
+	EMIT_TCPI(tcpi_rcv_rtt);
+	EMIT_TCPI(tcpi_rcv_space);
+	EMIT_TCPI(tcpi_bytes_acked);
+	EMIT_TCPI(tcpi_bytes_received);
+	EMIT_TCPI(tcpi_segs_out);
+	EMIT_TCPI(tcpi_segs_in);
+	EMIT_TCPI(tcpi_notsent_bytes);
+	EMIT_TCPI(tcpi_min_rtt);
+	EMIT_TCPI(tcpi_data_segs_in);
+	EMIT_TCPI(tcpi_data_segs_out);
+	EMIT_TCPI(tcpi_delivery_rate);
+	EMIT_TCPI(tcpi_busy_time);
+	EMIT_TCPI(tcpi_rwnd_limited);
+	EMIT_TCPI(tcpi_sndbuf_limited);
+
+	emit_var_end(code);
+}
+
+#undef EMIT_TCPI
+
+/* Interpret the given congestion-control info buffer as a
+ * struct _tcp_bbr_info and emit its fields. Emits nothing when len is
+ * too short to reach the end of bbr_cwnd_gain.
+ */
+static void write_tcp_bbr_cc_info(struct code_state *code,
+				  const union _tcp_cc_info *info,
+				  int len)
+{
+	const struct _tcp_bbr_info *bbr = (const struct _tcp_bbr_info *)info;
+	/* Bytes needed to cover the fields in the initial BBR release. */
+	const size_t needed = offsetof(struct _tcp_bbr_info, bbr_cwnd_gain) +
+			      sizeof(bbr->bbr_cwnd_gain);
+	u64 bandwidth;
+
+	if (len < needed)
+		return;
+
+	emit_var(code, "bbr_bw_lo", bbr->bbr_bw_lo);
+	emit_var(code, "bbr_bw_hi", bbr->bbr_bw_hi);
+
+	/* "bbr_bw" is not a real field: it is the hi/lo pair combined
+	 * into a single 64-bit value for convenience.
+	 */
+	bandwidth = ((u64)bbr->bbr_bw_hi << 32) + bbr->bbr_bw_lo;
+	emit_var(code, "bbr_bw", bandwidth);
+
+	emit_var(code, "bbr_min_rtt", bbr->bbr_min_rtt);
+	emit_var(code, "bbr_pacing_gain", bbr->bbr_pacing_gain);
+	emit_var(code, "bbr_cwnd_gain", bbr->bbr_cwnd_gain);
+}
+
+/* Interpret the given congestion-control info buffer as a
+ * struct _tcp_dctcp_info and emit its fields. Emits nothing when len
+ * is too short to reach the end of dctcp_ab_tot.
+ */
+static void write_tcp_dctcp_cc_info(struct code_state *code,
+				    const union _tcp_cc_info *info,
+				    int len)
+{
+	const struct _tcp_dctcp_info *dctcp =
+		(const struct _tcp_dctcp_info *)info;
+	const size_t needed = offsetof(struct _tcp_dctcp_info, dctcp_ab_tot) +
+			      sizeof(dctcp->dctcp_ab_tot);
+
+	if (len < needed)
+		return;
+
+	emit_var(code, "dctcp_enabled", dctcp->dctcp_enabled);
+	emit_var(code, "dctcp_ce_state", dctcp->dctcp_ce_state);
+	emit_var(code, "dctcp_alpha", dctcp->dctcp_alpha);
+	emit_var(code, "dctcp_ab_ecn", dctcp->dctcp_ab_ecn);
+	emit_var(code, "dctcp_ab_tot", dctcp->dctcp_ab_tot);
+}
+
+/* Write out a formatted representation of the given _tcpvegas_info buffer.
+ * Emits nothing when len is too short to reach the end of tcpv_minrtt.
+ *
+ * Each field is emitted under its own field name. tcpv_rtt and
+ * tcpv_minrtt are additionally emitted under the names "tcp_rtt" and
+ * "tcp_minrtt", which were the only names this function used to emit
+ * for them, so scripts written against the old names keep working.
+ */
+static void write_tcp_vegas_cc_info(struct code_state *code,
+				    const union _tcp_cc_info *info,
+				    int len)
+{
+	const struct _tcpvegas_info *v = (const struct _tcpvegas_info *)info;
+
+	if (len < (offsetof(struct _tcpvegas_info, tcpv_minrtt) +
+		   sizeof(v->tcpv_minrtt)))
+		return;
+	emit_var(code, "tcpv_enabled", v->tcpv_enabled);
+	emit_var(code, "tcpv_rttcnt", v->tcpv_rttcnt);
+	emit_var(code, "tcpv_rtt", v->tcpv_rtt);
+	emit_var(code, "tcpv_minrtt", v->tcpv_minrtt);
+	/* Legacy variable names, kept for backward compatibility. */
+	emit_var(code, "tcp_rtt", v->tcpv_rtt);
+	emit_var(code, "tcp_minrtt", v->tcpv_minrtt);
+}
+
+/* Write out a formatted representation of the given tcp_cc_info buffer.
+ * The same buffer and len are handed to the BBR, DCTCP and Vegas
+ * writers in turn; each one checks len itself and emits nothing if the
+ * buffer is too short for its struct. A blank line is always written
+ * at the end.
+ */
+static void write_tcp_cc_info(struct code_state *code,
+ const union _tcp_cc_info *info,
+ int len)
+{
+ /* getsockopt returns 0 len info if C.C. does not support the opt */
+ write_tcp_bbr_cc_info(code, info, len);
+ write_tcp_dctcp_cc_info(code, info, len);
+ write_tcp_vegas_cc_info(code, info, len);
+ emit_var_end(code);
+}
+
+/* Emit each entry of the given socket mem_info array as a variable
+ * named SK_MEMINFO_*, followed by a terminating blank line. len is the
+ * number of valid bytes in the array and must cover _SK_MEMINFO_VARS
+ * u32 entries.
+ */
+static void write_so_meminfo(struct code_state *code,
+			     const u32 *mem_info,
+			     int len)
+{
+	static const struct {
+		const char *name;
+		int index;
+	} vars[] = {
+		{ "SK_MEMINFO_RMEM_ALLOC",	_SK_MEMINFO_RMEM_ALLOC },
+		{ "SK_MEMINFO_RCVBUF",		_SK_MEMINFO_RCVBUF },
+		{ "SK_MEMINFO_WMEM_ALLOC",	_SK_MEMINFO_WMEM_ALLOC },
+		{ "SK_MEMINFO_SNDBUF",		_SK_MEMINFO_SNDBUF },
+		{ "SK_MEMINFO_FWD_ALLOC",	_SK_MEMINFO_FWD_ALLOC },
+		{ "SK_MEMINFO_WMEM_QUEUED",	_SK_MEMINFO_WMEM_QUEUED },
+		{ "SK_MEMINFO_OPTMEM",		_SK_MEMINFO_OPTMEM },
+		{ "SK_MEMINFO_BACKLOG",		_SK_MEMINFO_BACKLOG },
+		{ "SK_MEMINFO_DROPS",		_SK_MEMINFO_DROPS },
+	};
+	size_t i;
+
+	assert(len >= sizeof(u32) * _SK_MEMINFO_VARS);
+
+	/* Emit in table order, one assignment per entry. */
+	for (i = 0; i < sizeof(vars) / sizeof(vars[0]); i++)
+		emit_var(code, vars[i].name, mem_info[vars[i].index]);
+
+	emit_var_end(code);
+}
+#endif /* linux */
+
+#if defined(__FreeBSD__)
+
+/* Emit one field of the local "info" struct as a variable that has
+ * exactly the same name as the field.
+ */
+#define EMIT_TCPI(field) emit_var(code, #field, info->field)
+
+/* Write out the symbolic constants followed by every recorded field of
+ * the given tcp_info buffer, one variable per field, and a terminating
+ * blank line. len is the number of valid bytes in the buffer and must
+ * cover the whole struct _tcp_info.
+ */
+static void write_tcp_info(struct code_state *code,
+			   const struct _tcp_info *info,
+			   int len)
+{
+	assert(len >= sizeof(struct _tcp_info));
+
+	write_symbols(code);
+
+	/* Emit the recorded values of tcpi_foo values. */
+	EMIT_TCPI(tcpi_state);
+	EMIT_TCPI(tcpi_options);
+	EMIT_TCPI(tcpi_snd_wscale);
+	EMIT_TCPI(tcpi_rcv_wscale);
+	EMIT_TCPI(tcpi_rto);
+	EMIT_TCPI(tcpi_snd_mss);
+	EMIT_TCPI(tcpi_rcv_mss);
+	EMIT_TCPI(tcpi_last_data_recv);
+	EMIT_TCPI(tcpi_rtt);
+	EMIT_TCPI(tcpi_rttvar);
+	EMIT_TCPI(tcpi_snd_ssthresh);
+	EMIT_TCPI(tcpi_snd_cwnd);
+	EMIT_TCPI(tcpi_rcv_space);
+
+	/* FreeBSD extensions to tcp_info. */
+	EMIT_TCPI(tcpi_snd_wnd);
+	EMIT_TCPI(tcpi_snd_bwnd);
+	EMIT_TCPI(tcpi_snd_nxt);
+	EMIT_TCPI(tcpi_rcv_nxt);
+	EMIT_TCPI(tcpi_toe_tid);
+	EMIT_TCPI(tcpi_snd_rexmitpack);
+	EMIT_TCPI(tcpi_rcv_ooopack);
+	EMIT_TCPI(tcpi_snd_zerowin);
+
+	emit_var_end(code);
+}
+
+#undef EMIT_TCPI
+
+#endif /* __FreeBSD__ */
+
+/* Allocate a new zero-filled struct code_text. An allocation failure
+ * is reported through die_perror() instead of being handed back to
+ * callers, which dereference the result immediately.
+ */
+static struct code_text *text_new(void)
+{
+	struct code_text *text = calloc(1, sizeof(*text));
+
+	if (text == NULL)
+		die_perror("calloc");
+	return text;
+}
+
+/* Free the given text struct and all storage to which it points:
+ * the snippet text, the file name string, and the struct itself.
+ * The text pointer must not be NULL, since its members are read
+ * before the struct is freed.
+ */
+static void text_free(struct code_text *text)
+{
+ free(text->text);
+ free(text->file_name);
+ free(text);
+}
+
+/* Allocate a new zero-filled struct code_data. An allocation failure
+ * is reported through die_perror() instead of being handed back to
+ * callers, which dereference the result immediately.
+ */
+static struct code_data *data_new(void)
+{
+	struct code_data *data = calloc(1, sizeof(*data));
+
+	if (data == NULL)
+		die_perror("calloc");
+	return data;
+}
+
+/* Free the given data and all storage to which it points: the binary
+ * buffer and the struct itself. The data pointer must not be NULL,
+ * since its buffer member is read before the struct is freed.
+ */
+static void data_free(struct code_data *data)
+{
+ free(data->buffer);
+ free(data);
+}
+
+/* Allocate a new zero-filled fragment, so its type starts out as
+ * FRAGMENT_NONE and its next pointer as NULL. An allocation failure is
+ * reported through die_perror() instead of being handed back to
+ * callers, which dereference the result immediately.
+ */
+static struct code_fragment *fragment_new(void)
+{
+	struct code_fragment *fragment = calloc(1, sizeof(*fragment));
+
+	if (fragment == NULL)
+		die_perror("calloc");
+	return fragment;
+}
+
+/* Free the given fragment and all storage to which it points.
+ * The fragment's type selects which member of the contents union is
+ * released (via text_free() or data_free()) before the fragment
+ * itself is freed. The type must be a real fragment type, not one of
+ * the FRAGMENT_NONE / FRAGMENT_NUM_TYPES sentinels.
+ */
+static void fragment_free(struct code_fragment *fragment)
+{
+ assert(fragment->type > FRAGMENT_NONE);
+ assert(fragment->type < FRAGMENT_NUM_TYPES);
+ switch (fragment->type) {
+ case FRAGMENT_NONE:
+ case FRAGMENT_NUM_TYPES:
+ assert(!"bad code fragment type");
+ break;
+ case FRAGMENT_TEXT:
+ text_free(fragment->contents.text);
+ break;
+ case FRAGMENT_DATA:
+ data_free(fragment->contents.data);
+ break;
+ /* omitting default so compiler catches missing cases */
+ }
+ free(fragment);
+}
+
+/* Emit a literal code snippet to the output file. The snippet is
+ * preceded by assignments recording the script file name and line
+ * number it came from, and followed by a blank line.
+ */
+static void write_text(struct code_state *code, struct code_text *text)
+{
+	FILE *out = code->file;
+
+	assert(code->format > FORMAT_NONE);
+	assert(code->format < FORMAT_NUM_TYPES);
+	switch (code->format) {
+	case FORMAT_NONE:
+	case FORMAT_NUM_TYPES:
+		assert(!"bad code format type");
+		/* no break here: control continues into the next case */
+	case FORMAT_PYTHON:
+		/* Origin of the snippet, for the Python exception hook. */
+		fprintf(out, "_file = '%s'\n", text->file_name);
+		fprintf(out, "_line = %d\n", text->line_number);
+		/* The snippet itself. */
+		fprintf(out, "%s\n\n", text->text);
+		break;
+	/* no default label, so the compiler can flag unhandled formats */
+	}
+}
+
+/* Write out a textual representation of the data to the given file.
+ * Dispatches on data->type to the writer for that kind of buffer,
+ * passing it the buffer and its length. Each case exists only when
+ * the matching HAVE_* feature macro is enabled at build time. The
+ * type must be a real data type, not one of the DATA_NONE /
+ * DATA_NUM_TYPES sentinels.
+ */
+static void write_data(struct code_state *code, struct code_data *data)
+{
+ assert(data->type > DATA_NONE);
+ assert(data->type < DATA_NUM_TYPES);
+ switch (data->type) {
+ case DATA_NONE:
+ case DATA_NUM_TYPES:
+ assert(!"bad data type");
+ break;
+#if HAVE_TCP_INFO
+ case DATA_TCP_INFO:
+ write_tcp_info(code, data->buffer, data->len);
+ break;
+#endif /* HAVE_TCP_INFO */
+#if HAVE_TCP_CC_INFO
+ case DATA_TCP_CC_INFO:
+ write_tcp_cc_info(code, data->buffer, data->len);
+ break;
+#endif /* HAVE_TCP_CC_INFO */
+#if HAVE_SO_MEMINFO
+ case DATA_SO_MEMINFO:
+ write_so_meminfo(code, data->buffer, data->len);
+ break;
+#endif /* HAVE_SO_MEMINFO */
+ /* omitting default so compiler catches missing cases */
+ }
+}
+
+/* Write out a textual representation of the fragment to the given file.
+ * Text fragments go through write_text() and data fragments through
+ * write_data(). The type must be a real fragment type, not one of the
+ * FRAGMENT_NONE / FRAGMENT_NUM_TYPES sentinels.
+ */
+static void write_fragment(struct code_state *code,
+ struct code_fragment *fragment)
+{
+ assert(fragment->type > FRAGMENT_NONE);
+ assert(fragment->type < FRAGMENT_NUM_TYPES);
+ switch (fragment->type) {
+ case FRAGMENT_NONE:
+ case FRAGMENT_NUM_TYPES:
+ assert(!"bad code fragment type");
+ break;
+ case FRAGMENT_TEXT:
+ write_text(code, fragment->contents.text);
+ break;
+ case FRAGMENT_DATA:
+ write_data(code, fragment->contents.data);
+ break;
+ /* omitting default so compiler catches missing cases */
+ }
+}
+
+/* Walk the fragment list from head to tail, writing each fragment
+ * out in the order in which it was appended.
+ */
+static void write_all_fragments(struct code_state *code)
+{
+	struct code_fragment *cur = code->list_head;
+
+	while (cur != NULL) {
+		write_fragment(code, cur);
+		cur = cur->next;
+	}
+}
+
+/* Append the code fragment to the end of the list of code fragments.
+ * code->list_tail always points at the pointer slot that should
+ * receive the next fragment: initially &code->list_head (set up in
+ * code_new()), afterwards the "next" member of the last fragment.
+ * The fragment's own next pointer is not modified here.
+ */
+static void append_fragment(struct code_state *code,
+ struct code_fragment *fragment)
+{
+ *(code->list_tail) = fragment;
+ code->list_tail = &(fragment->next);
+}
+
+/* Queue a literal ASCII code snippet for later emission, tagged with
+ * the script file name and line number it came from.
+ * Takes ownership of the malloc-allocated text_buffer (it is released
+ * when the fragment is freed); file_name is copied with strdup().
+ */
+static void append_text(struct code_state *code,
+			const char *file_name, int line_number,
+			char *text_buffer)
+{
+	struct code_text *snippet = text_new();
+	struct code_fragment *fragment = fragment_new();
+
+	snippet->text = text_buffer;
+	snippet->file_name = strdup(file_name);
+	snippet->line_number = line_number;
+
+	fragment->type = FRAGMENT_TEXT;
+	fragment->contents.text = snippet;
+
+	append_fragment(code, fragment);
+}
+
+/* Queue a binary buffer of the given type and length for later
+ * translation into the output format configured for this script.
+ * Takes ownership of the malloc-allocated data_buffer (it is released
+ * when the fragment is freed).
+ */
+static void append_data(struct code_state *code, enum code_data_t data_type,
+			void *data_buffer, int data_len)
+{
+	struct code_data *record = data_new();
+	struct code_fragment *fragment = fragment_new();
+
+	record->buffer = data_buffer;
+	record->type = data_type;
+	record->len = data_len;
+
+	fragment->type = FRAGMENT_DATA;
+	fragment->contents.data = record;
+
+	append_fragment(code, fragment);
+}
+
+/* Allocate and initialize a code_state from the given config.
+ * Selects the output format from config->code_format (only "python"
+ * is accepted) and the data type from config->code_sockopt (an empty
+ * string leaves it as DATA_NONE; the named options are accepted only
+ * when the matching HAVE_* feature macro is enabled). Unsupported
+ * values are reported through die(). The command line string is
+ * copied; the caller releases the result with code_free().
+ * Allocation failures are reported through die_perror().
+ */
+struct code_state *code_new(struct config *config)
+{
+	struct code_state *code = calloc(1, sizeof(struct code_state));
+
+	if (code == NULL)
+		die_perror("calloc");
+
+	/* Set up the pointer to the tail of the empty linked list. */
+	code->list_tail = &(code->list_head);
+
+	if (strcmp(config->code_format, "python") == 0)
+		code->format = FORMAT_PYTHON;
+	else
+		die("unsupported --code_format '%s'\n", config->code_format);
+
+	/* See which getsockopt we should use to get data for our code. */
+	if (strcmp(config->code_sockopt, "") == 0) {
+		code->data_type = DATA_NONE;	/* auto-detect */
+#if HAVE_TCP_INFO
+	} else if (strcmp(config->code_sockopt, "TCP_INFO") == 0) {
+		code->data_type = DATA_TCP_INFO;
+#endif /* HAVE_TCP_INFO */
+#if HAVE_TCP_CC_INFO
+	} else if (strcmp(config->code_sockopt, "TCP_CC_INFO") == 0) {
+		code->data_type = DATA_TCP_CC_INFO;
+#endif /* HAVE_TCP_CC_INFO */
+#if HAVE_SO_MEMINFO
+	} else if (strcmp(config->code_sockopt, "SO_MEMINFO") == 0) {
+		code->data_type = DATA_SO_MEMINFO;
+#endif /* HAVE_SO_MEMINFO */
+	} else {
+		die("unsupported --code_sockopt '%s'\n", config->code_sockopt);
+	}
+
+	code->command_line = strdup(config->code_command_line);
+	if (code->command_line == NULL)
+		die_perror("strdup");
+	code->verbose = config->verbose;
+
+	return code;
+}
+
+/* Release the code state: its command line and path strings, every
+ * queued fragment, and finally the state struct itself.
+ */
+void code_free(struct code_state *code)
+{
+	struct code_fragment *cur, *next;
+
+	/* free(NULL) is a no-op, so unset strings need no special case. */
+	free(code->command_line);
+	free(code->path);
+
+	/* Fetch each successor before the current fragment is freed. */
+	for (cur = code->list_head; cur != NULL; cur = next) {
+		next = cur->next;
+		fragment_free(cur);
+	}
+
+	/* Zero the struct first so stale users are easier to spot. */
+	memset(code, 0, sizeof(*code));
+	free(code);
+}
+
+/* Create a fresh temporary file, write the preamble and every queued
+ * fragment into it, and close it again. The file's name is stored in
+ * code->path, which must still be NULL on entry. Any failure to
+ * create, open or close the file is reported through die_perror().
+ */
+static void write_code_file(struct code_state *code)
+{
+	/* mkstemp replaces the trailing Xs with a unique suffix. */
+	char tmpl[] = "/tmp/code_XXXXXX";
+	int fd;
+
+	fd = mkstemp(tmpl);
+	if (fd < 0)
+		die_perror("error making temp output file for code: mkstemp");
+
+	assert(code->path == NULL);
+	code->path = strdup(tmpl);
+
+	/* Wrap the descriptor in a stdio stream for the writers. */
+	code->file = fdopen(fd, "w");
+	if (code->file == NULL)
+		die_perror("error opening temp output file for code: fdopen");
+
+	write_preamble(code);
+	write_all_fragments(code);
+
+	if (fclose(code->file) != 0)
+		die_perror("error closing temp output file for code: fclose");
+	code->file = NULL;
+}
+
+/* Execute the code in the file at code->path by running the
+ * configured command line with the path appended, via system(3).
+ * On success (the command exited with status 0), returns STATUS_OK.
+ * On error returns STATUS_ERR and fills in *error with a
+ * malloc-allocated message. Errors are: system(3) itself failing, the
+ * command being terminated by a signal, and the command exiting with
+ * a non-zero status.
+ *
+ * In verbose mode the generated file is first dumped with "cat" and
+ * the full command line is printed.
+ */
+static int execute_code_command_line(struct code_state *code, char **error)
+{
+	int result = STATUS_ERR;	/* return value */
+	char *full_command_line = NULL;
+	int status;
+
+	/* The buffer contents are undefined if asprintf fails, so we must
+	 * not go on to hand it to system(3).
+	 */
+	if (asprintf(&full_command_line, "%s %s",
+		     code->command_line, code->path) < 0)
+		die_perror("asprintf");
+
+	/* For verbose debugging we dump the full output file. */
+	if (code->verbose) {
+		char *verbose_command_line = NULL;
+
+		/* Best effort only: a failed dump is not an error. */
+		if (asprintf(&verbose_command_line, "cat %s",
+			     code->path) >= 0) {
+			system(verbose_command_line);
+			free(verbose_command_line);
+		}
+		printf("running: '%s'\n", full_command_line);
+	}
+
+	status = system(full_command_line);
+	if (status == -1) {
+		asprintf(error, "error running '%s' with system(3): %s",
+			 code->command_line, strerror(errno));
+		goto out;
+	}
+	/* Any terminating signal is a failure. WEXITSTATUS() is only
+	 * meaningful after a normal exit, so testing it alone would
+	 * misreport e.g. a SIGSEGV or SIGKILL death as success.
+	 */
+	if (WIFSIGNALED(status)) {
+		asprintf(error, "'%s' got signal %d (%s)",
+			 code->command_line,
+			 WTERMSIG(status), strsignal(WTERMSIG(status)));
+		goto out;
+	}
+	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
+		asprintf(error, "'%s' returned non-zero status %d",
+			 code->command_line, WEXITSTATUS(status));
+		goto out;
+	}
+	result = STATUS_OK;
+
+out:
+	free(full_command_line);
+	return result;
+}
+
+/* Delete the temporary file at code->path, if a path has been set.
+ * A failing unlink is reported through die_perror(). The message ends
+ * in the bare function name, like the other die_perror() messages in
+ * this file, without a trailing colon.
+ */
+static void delete_code_file(struct code_state *code)
+{
+	if ((code->path != NULL) && (unlink(code->path) != 0))
+		die_perror("error deleting code file: unlink");
+}
+
+/* Write out the code to a file, execute the code, and delete the file.
+ * Returns STATUS_OK right away if no fragments have been queued.
+ * Otherwise returns the result of executing the command line, with
+ * *error filled in by execute_code_command_line() on failure; the
+ * temporary file is deleted whether or not execution succeeded.
+ */
+int code_execute(struct code_state *code, char **error)
+{
+ if (code->list_head == NULL)
+ return STATUS_OK; /* no code to execute */
+
+ write_code_file(code);
+ int result = execute_code_command_line(code, error);
+ delete_code_file(code);
+ return result;
+}
+
+/* Run a getsockopt for the given fd to grab data of the given type.
+ * On success, return a pointer to the filled-in, heap-allocated buffer
+ * (which the caller must free) and store in *len the option length
+ * reported by getsockopt. On failure, whether of the allocation or of
+ * getsockopt itself, return NULL and leave *len untouched.
+ * The state and event arguments are not used here.
+ */
+static void *get_data(struct state *state, struct event *event,
+		      int fd, enum code_data_t data_type, int *len)
+{
+	int opt_name = 0;
+	int data_len = 0;
+	/* Initialized so that no path can read an indeterminate value,
+	 * even in a build where the assertions below are compiled out.
+	 */
+	int level = 0;
+
+	assert(data_type > DATA_NONE);
+	assert(data_type < DATA_NUM_TYPES);
+	switch (data_type) {
+	case DATA_NONE:
+	case DATA_NUM_TYPES:
+		assert(!"bad data type");
+		break;
+#if HAVE_TCP_INFO
+	case DATA_TCP_INFO:
+		opt_name = TCP_INFO;
+		data_len = sizeof(struct _tcp_info);
+		level = SOL_TCP;
+		break;
+#endif /* HAVE_TCP_INFO */
+#if HAVE_TCP_CC_INFO
+	case DATA_TCP_CC_INFO:
+		opt_name = TCP_CC_INFO;
+		data_len = sizeof(union _tcp_cc_info);
+		level = SOL_TCP;
+		break;
+#endif /* HAVE_TCP_CC_INFO */
+#if HAVE_SO_MEMINFO
+	case DATA_SO_MEMINFO:
+		opt_name = SO_MEMINFO;
+		data_len = sizeof(u32) * _SK_MEMINFO_VARS;
+		level = SOL_SOCKET;
+		break;
+#endif /* HAVE_SO_MEMINFO */
+	/* omitting default so compiler catches missing cases */
+	}
+	assert(opt_name != 0);
+	assert(data_len > 0);
+	socklen_t opt_len = data_len;
+	void *data = calloc(1, data_len);
+
+	/* Treat an allocation failure like any other failure to get data. */
+	if (data == NULL)
+		return NULL;
+
+	int result = getsockopt(fd, level, opt_name, data, &opt_len);
+	if (result < 0) {
+		free(data);
+		return NULL;
+	}
+	/* getsockopt updates opt_len to the size it actually returned. */
+	*len = opt_len;
+	return data;
+}
+
+/* Handle a code event from the script. After waiting for the event's
+ * scheduled time, take a snapshot of the socket under test with each
+ * getsockopt that is enabled at build time (TCP_INFO, TCP_CC_INFO,
+ * SO_MEMINFO), queue each snapshot as a data fragment, and then queue
+ * a copy of the given code text as a text fragment. Nothing is
+ * executed here; the fragments are only appended to state->code.
+ *
+ * If there is no socket under test, or any of the enabled snapshots
+ * cannot be fetched, the failure is reported through die() together
+ * with the script path and line number.
+ *
+ * NOTE(review): code->data_type is overwritten before each snapshot,
+ * so the value chosen in code_new() from --code_sockopt does not
+ * select which snapshots are taken here; confirm this is intended.
+ */
+void run_code_event(struct state *state, struct event *event,
+		    const char *text)
+{
+	DEBUGP("%d: run code event\n", event->line_number);
+
+	char *error = NULL;
+
+	/* Wait for the right time before firing off this event. */
+	wait_for_event(state);
+
+	if (state->socket_under_test == NULL) {
+		asprintf(&error, "no socket to use for code");
+		goto error_out;
+	}
+	int fd = state->socket_under_test->fd.live_fd;
+	struct code_state *code = state->code;
+
+	void *data = NULL;
+	void *data_meminfo = NULL;
+	int data_len = 0;
+#if HAVE_TCP_INFO
+	code->data_type = DATA_TCP_INFO;
+	data = get_data(state, event, fd, code->data_type, &data_len);
+	if (data)
+		append_data(code, code->data_type, data, data_len);
+#endif /* HAVE_TCP_INFO */
+	/* Also taken when HAVE_TCP_INFO is off, since data is then NULL. */
+	if (data == NULL) {
+		asprintf(&error,
+			 "can't find getsockopt to get TCP info");
+		goto error_out;
+	}
+#if HAVE_TCP_CC_INFO
+	code->data_type = DATA_TCP_CC_INFO;
+	data = get_data(state, event, fd, code->data_type, &data_len);
+	if (data) {
+		append_data(code, code->data_type, data, data_len);
+	} else {
+		asprintf(&error,
+			 "can't find getsockopt to get TCP_CC_INFO");
+		goto error_out;
+	}
+#endif /* HAVE_TCP_CC_INFO */
+#if HAVE_SO_MEMINFO
+	code->data_type = DATA_SO_MEMINFO;
+	data_meminfo = get_data(state, event, fd, code->data_type, &data_len);
+	if (data_meminfo)
+		append_data(code, code->data_type, data_meminfo, data_len);
+	if (data_meminfo == NULL) {
+		asprintf(&error,
+			 "can't find getsockopt to get sk_meminfo");
+		goto error_out;
+	}
+#endif /* HAVE_SO_MEMINFO */
+
+	/* append_text() takes ownership of the strdup()ed copy. */
+	append_text(code, state->config->script_path, event->line_number,
+		    strdup(text));
+
+	return;
+
+error_out:
+	die("%s:%d: runtime error in code: %s\n",
+	    state->config->script_path, event->line_number, error);
+	/* Only reached if die() returns. */
+	free(error);
+}
diff --git a/test/packetdrill/code.h b/test/packetdrill/code.h
new file mode 100644
index 0000000..ef626bb
--- /dev/null
+++ b/test/packetdrill/code.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Interface for a module to write out post-processing code that
+ * can run custom programmatic analyses and constraint verification.
+ */
+
+#ifndef __CODE_H__
+#define __CODE_H__
+
+#include "types.h"
+
+#include "config.h"
+#include "script.h"
+
+/* Post-processing format syntax variants we support.
+ * FORMAT_NUM_TYPES is the last enumerator, so its value equals the
+ * number of enumerators declared before it.
+ */
+enum code_format_t {
+ FORMAT_NONE, /* uninitialized or no code so far */
+ FORMAT_PYTHON, /* Python syntax: var_name = 123 */
+ FORMAT_NUM_TYPES, /* number of types of format */
+};
+
+/* The type of a particular fragment of code.
+ * FRAGMENT_NUM_TYPES is the last enumerator, so its value equals the
+ * number of enumerators declared before it.
+ */
+enum code_fragment_t {
+ FRAGMENT_NONE, /* uninitialized or none so far */
+ FRAGMENT_TEXT, /* literal code text to emit */
+ FRAGMENT_DATA, /* binary buffer to dump as text */
+ FRAGMENT_NUM_TYPES, /* number of types of fragments */
+};
+
+/* The type of a particular binary data buffer.
+ * Enumerators are compiled in only when the matching HAVE_* macro is
+ * set, so the numeric values (and DATA_NUM_TYPES) depend on the build
+ * configuration.
+ */
+enum code_data_t {
+ DATA_NONE, /* uninitialized or none so far */
+#if HAVE_TCP_INFO
+ DATA_TCP_INFO, /* binary tcp_info */
+#endif /* HAVE_TCP_INFO */
+#if HAVE_TCP_CC_INFO
+ DATA_TCP_CC_INFO, /* binary tcp_cc_info */
+#endif /* HAVE_TCP_CC_INFO */
+#if HAVE_SO_MEMINFO
+ DATA_SO_MEMINFO, /* binary SO_MEMINFO data */
+#endif /* HAVE_SO_MEMINFO */
+ DATA_NUM_TYPES, /* number of types of data */
+};
+
+/* Info about a textual code snippet to encode in the post-processing code.
+ * Records the snippet together with the script location it came from.
+ */
+struct code_text {
+ char *text; /* the code snippet string */
+ char *file_name; /* name of script text was read from */
+ int line_number; /* line on which text started */
+};
+
+/* Info about a data buffer to encode in the post-processing code.
+ * 'type' tells how the 'len' bytes in 'buffer' are to be interpreted.
+ */
+struct code_data {
+ void *buffer; /* malloc-allocated buffer */
+ enum code_data_t type; /* type of data in the buffer */
+ int len; /* length of data in buffer */
+};
+
+/* Info about a fragment to insert in the post-processing code.
+ * Fragments form a singly linked list through 'next'.
+ * NOTE(review): presumably 'type' selects which member of 'contents'
+ * is valid (FRAGMENT_TEXT -> text, FRAGMENT_DATA -> data) -- confirm
+ * against the code that builds and walks the list.
+ */
+struct code_fragment {
+ enum code_fragment_t type; /* what's in this fragment? */
+ union {
+ struct code_text *text; /* ASCII text code snippet */
+ struct code_data *data; /* typed binary data buffer */
+ } contents;
+ struct code_fragment *next; /* next in linked list */
+};
+
+/* Internal state for the code execution module.
+ * Holds the output settings plus the linked list of fragments
+ * collected so far (list_head / list_tail).
+ */
+struct code_state {
+ bool verbose; /* print debug info? */
+ enum code_format_t format; /* language syntax to emit */
+ enum code_data_t data_type; /* data to get for snippets */
+ char *command_line; /* system(3) command to run */
+ char *path; /* path where we write code */
+ FILE *file; /* output file we're writing */
+ struct code_fragment *list_head; /* linked list head */
+ struct code_fragment **list_tail; /* pointer to tail */
+};
+
+/* Allocate and return a new code executor using the given config. */
+extern struct code_state *code_new(struct config *config);
+
+/* Tear down a code executor and free up the resources it has allocated. */
+extern void code_free(struct code_state *code);
+
+/* Run the TCP_INFO getsockopt on the current socket under test to
+ * get a snapshot of socket state, and stash the resulting data and
+ * code snippet so that at the end of the test we can emit the data
+ * and the code snippet, and then execute both.
+ */
+struct state;
+extern void run_code_event(struct state *state,
+ struct event *event, const char *text);
+
+/* Call this at the end of test execution to run the code by writing
+ * out the text of the code and invoking the command line supplied by
+ * the user. On success, returns STATUS_OK. On error returns
+ * STATUS_ERR and fills in *error.
+ */
+extern int code_execute(struct code_state *code, char **error);
+
+#endif /* __CODE_H__ */
diff --git a/test/packetdrill/config.c b/test/packetdrill/config.c
new file mode 100644
index 0000000..37e2eb0
--- /dev/null
+++ b/test/packetdrill/config.c
@@ -0,0 +1,605 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Helper functions for configuration information for a test run.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "config.h"
+#include "logging.h"
+#include "ip_prefix.h"
+
+/* For the sake of clarity, we require long option names, e.g. --foo,
+ * for all options except -v and -D.
+ *
+ * Codes for long-only options start at 256, above any single-character
+ * option code; the two short options use their own letter as the code.
+ */
+enum option_codes {
+ OPT_IP_VERSION = 256,
+ OPT_BIND_PORT,
+ OPT_CODE_COMMAND,
+ OPT_CODE_FORMAT,
+ OPT_CODE_SOCKOPT,
+ OPT_CONNECT_PORT,
+ OPT_REMOTE_IP,
+ OPT_LOCAL_IP,
+ OPT_GATEWAY_IP,
+ OPT_NETMASK_IP,
+ OPT_SPEED,
+ OPT_MSS,
+ OPT_MTU,
+ OPT_INIT_SCRIPTS,
+ OPT_TOLERANCE_USECS,
+ OPT_WIRE_CLIENT,
+ OPT_WIRE_SERVER,
+ OPT_WIRE_SERVER_IP,
+ OPT_WIRE_SERVER_PORT,
+ OPT_WIRE_CLIENT_DEV,
+ OPT_WIRE_SERVER_DEV,
+ OPT_SO_FILENAME,
+ OPT_SO_FLAGS,
+ OPT_TCP_TS_ECR_SCALED,
+ OPT_TCP_TS_TICK_USECS,
+ OPT_STRICT_SEGMENTS,
+ OPT_NON_FATAL,
+ OPT_DRY_RUN,
+ OPT_IS_ANYIP,
+ OPT_SEND_OMIT_FREE,
+ OPT_DEFINE = 'D', /* a '-D' single-letter option */
+ OPT_VERBOSE = 'v', /* a '-v' single-letter option */
+};
+
+/* Long-option table handed to getopt_long() and also searched by name
+ * when options appear inside a script. Each row is
+ * { name, has_arg, flag, val }; 'flag' is always NULL so getopt_long()
+ * returns 'val'. The all-zero row terminates the table.
+ */
+struct option options[] = {
+	{ "ip_version",		required_argument,	NULL, OPT_IP_VERSION },
+	{ "bind_port",		required_argument,	NULL, OPT_BIND_PORT },
+	{ "code_command",	required_argument,	NULL, OPT_CODE_COMMAND },
+	{ "code_format",	required_argument,	NULL, OPT_CODE_FORMAT },
+	{ "code_sockopt",	required_argument,	NULL, OPT_CODE_SOCKOPT },
+	{ "connect_port",	required_argument,	NULL, OPT_CONNECT_PORT },
+	{ "remote_ip",		required_argument,	NULL, OPT_REMOTE_IP },
+	{ "local_ip",		required_argument,	NULL, OPT_LOCAL_IP },
+	{ "gateway_ip",		required_argument,	NULL, OPT_GATEWAY_IP },
+	{ "netmask_ip",		required_argument,	NULL, OPT_NETMASK_IP },
+	{ "speed",		required_argument,	NULL, OPT_SPEED },
+	{ "mss",		required_argument,	NULL, OPT_MSS },
+	{ "mtu",		required_argument,	NULL, OPT_MTU },
+	{ "init_scripts",	required_argument,	NULL, OPT_INIT_SCRIPTS },
+	{ "tolerance_usecs",	required_argument,	NULL, OPT_TOLERANCE_USECS },
+	{ "wire_client",	no_argument,		NULL, OPT_WIRE_CLIENT },
+	{ "wire_server",	no_argument,		NULL, OPT_WIRE_SERVER },
+	{ "wire_server_ip",	required_argument,	NULL, OPT_WIRE_SERVER_IP },
+	{ "wire_server_port",	required_argument,	NULL, OPT_WIRE_SERVER_PORT },
+	{ "wire_client_dev",	required_argument,	NULL, OPT_WIRE_CLIENT_DEV },
+	{ "wire_server_dev",	required_argument,	NULL, OPT_WIRE_SERVER_DEV },
+	{ "so_filename",	required_argument,	NULL, OPT_SO_FILENAME },
+	{ "so_flags",		required_argument,	NULL, OPT_SO_FLAGS },
+	{ "tcp_ts_ecr_scaled",	no_argument,		NULL, OPT_TCP_TS_ECR_SCALED },
+	{ "tcp_ts_tick_usecs",	required_argument,	NULL, OPT_TCP_TS_TICK_USECS },
+	{ "strict_segments",	no_argument,		NULL, OPT_STRICT_SEGMENTS },
+	{ "non_fatal",		required_argument,	NULL, OPT_NON_FATAL },
+	{ "dry_run",		no_argument,		NULL, OPT_DRY_RUN },
+	{ "is_anyip",		no_argument,		NULL, OPT_IS_ANYIP },
+	{ "send_omit_free",	no_argument,		NULL, OPT_SEND_OMIT_FREE },
+	{ "define",		required_argument,	NULL, OPT_DEFINE },
+	{ "verbose",		no_argument,		NULL, OPT_VERBOSE },
+	{ NULL, 0, NULL, 0 },
+};
+
+/* Print the command line synopsis to stderr.
+ * Every optional argument is shown in a balanced [...] pair.
+ */
+void show_usage(void)
+{
+	fputs("Usage: packetdrill\n"
+	      "\t[--ip_version=[ipv4,ipv4-mapped-ipv6,ipv6]]\n"
+	      "\t[--bind_port=bind_port]\n"
+	      "\t[--code_command=code_command]\n"
+	      "\t[--code_format=code_format]\n"
+	      "\t[--code_sockopt=TCP_INFO]\n"
+	      "\t[--connect_port=connect_port]\n"
+	      "\t[--remote_ip=remote_ip]\n"
+	      "\t[--local_ip=local_ip]\n"
+	      "\t[--gateway_ip=gateway_ip]\n"
+	      "\t[--netmask_ip=netmask_ip]\n"
+	      "\t[--init_scripts=<comma separated filenames>]\n"
+	      "\t[--speed=<speed in Mbps>]\n"
+	      "\t[--mss=<MSS in bytes>]\n"
+	      "\t[--mtu=<MTU in bytes>]\n"
+	      "\t[--tolerance_usecs=tolerance_usecs]\n"
+	      "\t[--tcp_ts_ecr_scaled]\n"
+	      "\t[--tcp_ts_tick_usecs=<microseconds per TCP TS val tick>]\n"
+	      "\t[--strict_segments]\n"
+	      "\t[--non_fatal=<comma separated types: packet,syscall>]\n"
+	      "\t[--wire_client]\n"
+	      "\t[--wire_server]\n"
+	      "\t[--wire_server_ip=<server_ipv4_address>]\n"
+	      "\t[--wire_server_port=<server_port>]\n"
+	      "\t[--wire_client_dev=<eth_dev_name>]\n"
+	      "\t[--wire_server_dev=<eth_dev_name>]\n"
+	      "\t[--so_filename=<filename>]\n"
+	      "\t[--so_flags=<flags passed to SO init function>]\n"
+	      "\t[--dry_run]\n"
+	      "\t[--is_anyip]\n"
+	      "\t[--send_omit_free]\n"
+	      "\t[--define symbol1=val1 --define symbol2=val2 ...]\n"
+	      "\t[--verbose|-v]\n"
+	      "\tscript_path ...\n", stderr);
+}
+
+/* Address Configuration for IPv4
+ *
+ * For IPv4, we use the 192.168.0.0/16 RFC 1918 private IP space for
+ * our tun interface. To avoid accidents and confusion we want remote
+ * addresses to be permanently unallocated addresses outside of the
+ * private/unroutable RFC 1918 ranges (kernel code can behave
+ * differently for private addresses). So for remote addresses we use
+ * the 192.0.2.0/24 TEST-NET-1 range (see RFC 5737).
+ *
+ * Summary for IPv4:
+ * - local address: 192.168.0.0/16 private IP space (RFC 1918)
+ * - remote address: 192.0.2.0/24 TEST-NET-1 range (RFC 5737)
+ */
+
+#define DEFAULT_V4_LIVE_REMOTE_IP_STRING "192.0.2.1/24"
+#define DEFAULT_V4_LIVE_LOCAL_IP_STRING "192.168.0.0"
+/* Note : generate_random_ipv4_addr() assumes the gateway is .1
+ */
+#define DEFAULT_V4_LIVE_GATEWAY_IP_STRING "192.168.0.1"
+#define DEFAULT_V4_LIVE_NETMASK_IP_STRING "255.255.0.0"
+
+/* Address Configuration for IPv6
+ *
+ * For IPv6 we use a ULA (unique local address) for our local (tun)
+ * interface, and the RFC 3849 documentation space for our remote
+ * address.
+ *
+ * Summary for IPv6:
+ * - local address: fd3d:fa7b:d17d::/48 in unique local address space (RFC 4193)
+ * - remote address: 2001:DB8::/32 documentation prefix (RFC 3849)
+ */
+
+#define DEFAULT_V6_LIVE_REMOTE_IP_STRING "2001:DB8::1/32"
+#define DEFAULT_V6_LIVE_LOCAL_IP_STRING "fd3d:fa7b:d17d::0"
+#define DEFAULT_V6_LIVE_GATEWAY_IP_STRING "fd3d:fa7b:d17d:8888::0"
+#define DEFAULT_V6_LIVE_PREFIX_LEN 48
+
+/* Supply the IPv4 default for every address string that is still empty.
+ * Strings that already hold a value are left untouched.
+ */
+static void set_ipv4_defaults(struct config *config)
+{
+	char *remote = config->live_remote_ip_string;
+	char *netmask = config->live_netmask_ip_string;
+	char *local = config->live_local_ip_string;
+	char *gateway = config->live_gateway_ip_string;
+
+	if (remote[0] == '\0')
+		strcpy(remote, DEFAULT_V4_LIVE_REMOTE_IP_STRING);
+
+	/* The netmask is settled first because it is an input to the
+	 * random local address generated below.
+	 */
+	if (netmask[0] == '\0')
+		strcpy(netmask, DEFAULT_V4_LIVE_NETMASK_IP_STRING);
+
+	if (local[0] == '\0')
+		generate_random_ipv4_addr(local,
+					  DEFAULT_V4_LIVE_LOCAL_IP_STRING,
+					  netmask);
+
+	if (gateway[0] == '\0')
+		strcpy(gateway, DEFAULT_V4_LIVE_GATEWAY_IP_STRING);
+}
+
+/* Supply the IPv6 default for every address string that is still empty.
+ * Strings that already hold a value are left untouched.
+ */
+static void set_ipv6_defaults(struct config *config)
+{
+	char *remote = config->live_remote_ip_string;
+	char *local = config->live_local_ip_string;
+	char *gateway = config->live_gateway_ip_string;
+
+	if (remote[0] == '\0')
+		strcpy(remote, DEFAULT_V6_LIVE_REMOTE_IP_STRING);
+
+	if (local[0] == '\0')
+		generate_random_ipv6_addr(local,
+					  DEFAULT_V6_LIVE_LOCAL_IP_STRING,
+					  DEFAULT_V6_LIVE_PREFIX_LEN);
+
+	if (gateway[0] == '\0')
+		strcpy(gateway, DEFAULT_V6_LIVE_GATEWAY_IP_STRING);
+}
+
+/* Reset *config to the built-in defaults; call before any option parsing.
+ * The whole structure is zeroed first, so every field not listed below
+ * starts out as zero.
+ */
+void set_default_config(struct config *config)
+{
+	memset(config, 0, sizeof(*config));
+
+	/* Post-processing code. */
+	config->code_command_line = "/usr/bin/python";
+	config->code_format = "python";
+	config->code_sockopt = "";		/* auto-detect */
+
+	/* Addressing: the IP strings stay empty so that the per-version
+	 * defaults can be filled in once the IP version is known.
+	 */
+	config->ip_version = IP_VERSION_4;
+	config->live_bind_port = 8080;
+	config->live_connect_port = 8080;
+	config->live_remote_ip_string[0] = '\0';
+	config->live_local_ip_string[0] = '\0';
+	config->live_gateway_ip_string[0] = '\0';
+	config->live_netmask_ip_string[0] = '\0';
+
+	/* Timing and device parameters. */
+	config->tolerance_usecs = 4000;
+	config->speed = TUN_DRIVER_SPEED_CUR;
+	config->mtu = TUN_DRIVER_DEFAULT_MTU;
+
+	config->tcp_ts_ecr_scaled = false;
+
+	/* For now, by default we disable checks of outbound TS val
+	 * values, since there are timestamp val bugs in the tests and
+	 * kernel. TODO(ncardwell): Switch default tcp_ts_tick_usecs
+	 * to 1000 when TCP timestamp val bugs have been eradicated
+	 * from kernel and tests.
+	 */
+	config->tcp_ts_tick_usecs = 0;	/* disable checks of TS val */
+
+	config->init_scripts = NULL;
+
+	/* On-the-wire testing. */
+	config->wire_server_port = 8081;
+	config->wire_client_device = "eth0";
+	config->wire_server_device = "eth0";
+}
+
+/* Derive the remote IP and the printable remote IP/prefix strings from
+ * config->live_remote_prefix, which the caller must have filled in.
+ */
+static void set_remote_ip_and_prefix(struct config *config)
+{
+	struct ip_prefix *prefix = &config->live_remote_prefix;
+
+	/* Copy the address out before the prefix is normalized.
+	 * NOTE(review): presumably normalization rewrites prefix->ip;
+	 * ip_prefix_normalize() is not visible here -- confirm.
+	 */
+	config->live_remote_ip = prefix->ip;
+	ip_to_string(&config->live_remote_ip, config->live_remote_ip_string);
+
+	ip_prefix_normalize(prefix);
+	ip_prefix_to_string(prefix, config->live_remote_prefix_string);
+}
+
+/* Here's a table summarizing the types of various entities in the
+ * different flavors of IP that we support:
+ *
+ * flavor socket_domain bind/connect/accept IP local/remote IP
+ * -------- ------------- ------------------------- ---------------
+ * 4 AF_INET AF_INET AF_INET
+ * 4-mapped-6 AF_INET6 AF_INET6 mapped from IPv4 AF_INET
+ * 6 AF_INET6 AF_INET6 AF_INET6
+ */
+
+/* Compute the derived configuration for plain IPv4: AF_INET sockets
+ * carrying AF_INET packets, binding and connecting to the IPv4
+ * addresses themselves.
+ */
+static void finalize_ipv4_config(struct config *config)
+{
+	config->socket_domain = AF_INET;
+	config->wire_protocol = AF_INET;
+
+	set_ipv4_defaults(config);
+
+	/* Turn the textual addresses into binary form. */
+	config->live_local_ip = ipv4_parse(config->live_local_ip_string);
+	config->live_remote_prefix =
+		ipv4_prefix_parse(config->live_remote_ip_string);
+	set_remote_ip_and_prefix(config);
+	config->live_prefix_len =
+		netmask_to_prefix(config->live_netmask_ip_string);
+	config->live_gateway_ip = ipv4_parse(config->live_gateway_ip_string);
+
+	config->live_bind_ip = config->live_local_ip;
+	config->live_connect_ip = config->live_remote_ip;
+}
+
+/* Compute the derived configuration for ipv4-mapped-ipv6: AF_INET6
+ * sockets whose bind/connect addresses are IPv4 addresses mapped into
+ * IPv6, while the packets on the wire remain AF_INET.
+ */
+static void finalize_ipv4_mapped_ipv6_config(struct config *config)
+{
+	config->socket_domain = AF_INET6;
+	config->wire_protocol = AF_INET;
+
+	set_ipv4_defaults(config);
+
+	/* Turn the textual addresses into binary form. */
+	config->live_local_ip = ipv4_parse(config->live_local_ip_string);
+	config->live_remote_prefix =
+		ipv4_prefix_parse(config->live_remote_ip_string);
+	set_remote_ip_and_prefix(config);
+	config->live_prefix_len =
+		netmask_to_prefix(config->live_netmask_ip_string);
+	config->live_gateway_ip = ipv4_parse(config->live_gateway_ip_string);
+
+	/* The socket API sees the mapped form of each IPv4 address. */
+	config->live_bind_ip = ipv6_map_from_ipv4(config->live_local_ip);
+	config->live_connect_ip = ipv6_map_from_ipv4(config->live_remote_ip);
+}
+
+/* Compute the derived configuration for plain IPv6: AF_INET6 sockets
+ * carrying AF_INET6 packets. The interface prefix length is always
+ * DEFAULT_V6_LIVE_PREFIX_LEN.
+ */
+static void finalize_ipv6_config(struct config *config)
+{
+	config->socket_domain = AF_INET6;
+	config->wire_protocol = AF_INET6;
+
+	set_ipv6_defaults(config);
+
+	/* Turn the textual addresses into binary form. */
+	config->live_local_ip = ipv6_parse(config->live_local_ip_string);
+	config->live_remote_prefix =
+		ipv6_prefix_parse(config->live_remote_ip_string);
+	set_remote_ip_and_prefix(config);
+	config->live_prefix_len = DEFAULT_V6_LIVE_PREFIX_LEN;
+	config->live_gateway_ip = ipv6_parse(config->live_gateway_ip_string);
+
+	config->live_bind_ip = config->live_local_ip;
+	config->live_connect_ip = config->live_remote_ip;
+}
+
+/* Run the finalizer that matches config->ip_version. Must be called
+ * only after all options have been parsed.
+ */
+void finalize_config(struct config *config)
+{
+	assert(config->ip_version >= IP_VERSION_4);
+	assert(config->ip_version <= IP_VERSION_6);
+
+	/* No default label, so the compiler can flag an unhandled
+	 * ip_version_t value.
+	 */
+	switch (config->ip_version) {
+	case IP_VERSION_4:
+		finalize_ipv4_config(config);
+		return;
+	case IP_VERSION_4_MAPPED_6:
+		finalize_ipv4_mapped_ipv6_config(config);
+		return;
+	case IP_VERSION_6:
+		finalize_ipv6_config(config);
+		return;
+	}
+}
+
+/* Parse the --non_fatal argument: a list of type names separated by
+ * commas and/or spaces. "packet" sets config->non_fatal_packet and
+ * "syscall" sets config->non_fatal_syscall; any other token is ignored.
+ * A NULL or empty 'arg' changes nothing.
+ *
+ * The string is scanned in place and never modified, so no copy is
+ * needed and the function cannot fail for lack of memory.
+ */
+void parse_non_fatal_arg(char *arg, struct config *config)
+{
+	static const char delims[] = ", ";
+	const char *token;
+
+	if (arg == NULL)
+		return;
+
+	/* Skip leading delimiters, then take one token per iteration. */
+	token = arg + strspn(arg, delims);
+	while (*token != '\0') {
+		size_t len = strcspn(token, delims);
+
+		if (len == sizeof("packet") - 1 &&
+		    memcmp(token, "packet", len) == 0)
+			config->non_fatal_packet = true;
+		else if (len == sizeof("syscall") - 1 &&
+			 memcmp(token, "syscall", len) == 0)
+			config->non_fatal_syscall = true;
+
+		token += len;
+		token += strspn(token, delims);
+	}
+}
+
+
+/* Parse the port number given to option --<name>. Dies with a message
+ * prefixed by 'where' unless the value lies in 1..65535.
+ */
+static int parse_port_option(const char *arg, const char *name,
+			     const char *where)
+{
+	int port = atoi(arg);
+
+	if ((port <= 0) || (port > 0xffff))
+		die("%s: bad --%s: %s\n", where, name, arg);
+	return port;
+}
+
+/* Apply one option to *config.
+ *
+ * opt:    option code (one of enum option_codes)
+ * optarg: the option's argument, or NULL for options that take none
+ * config: configuration to update
+ * where:  origin of the option ("Command Line" or a script path); used
+ *         as the prefix of error messages
+ *
+ * Invalid values are reported with die(); an unrecognized option code
+ * prints the usage text and exits with EXIT_FAILURE.
+ */
+static void process_option(int opt, char *optarg, struct config *config,
+			   char *where)
+{
+	char *end = NULL, *equals = NULL, *symbol = NULL, *value = NULL;
+	unsigned long speed = 0;
+
+	/* Flag-style options carry no argument; never hand NULL to %s. */
+	DEBUGP("process_option %d ('%c') = %s\n",
+	       opt, (char)opt, optarg ? optarg : "(null)");
+
+	switch (opt) {
+	case OPT_IP_VERSION:
+		if (strcmp(optarg, "ipv4") == 0)
+			config->ip_version = IP_VERSION_4;
+		else if (strcmp(optarg, "ipv4-mapped-ipv6") == 0)
+			config->ip_version = IP_VERSION_4_MAPPED_6;
+		else if (strcmp(optarg, "ipv6") == 0)
+			config->ip_version = IP_VERSION_6;
+		else
+			die("%s: bad --ip_version: %s\n", where, optarg);
+		break;
+	case OPT_BIND_PORT:
+		config->live_bind_port =
+			parse_port_option(optarg, "bind_port", where);
+		break;
+	case OPT_CODE_COMMAND:
+		config->code_command_line = optarg;
+		break;
+	case OPT_CODE_FORMAT:
+		config->code_format = optarg;
+		break;
+	case OPT_CODE_SOCKOPT:
+		config->code_sockopt = optarg;
+		break;
+	case OPT_CONNECT_PORT:
+		config->live_connect_port =
+			parse_port_option(optarg, "connect_port", where);
+		break;
+	/* The address strings are copied with room left for the
+	 * terminating NUL; longer input is silently truncated.
+	 */
+	case OPT_REMOTE_IP:
+		strncpy(config->live_remote_ip_string, optarg, ADDR_STR_LEN-1);
+		break;
+	case OPT_LOCAL_IP:
+		strncpy(config->live_local_ip_string, optarg, ADDR_STR_LEN-1);
+		break;
+	case OPT_GATEWAY_IP:
+		strncpy(config->live_gateway_ip_string, optarg, ADDR_STR_LEN-1);
+		break;
+	case OPT_MSS:
+		config->mss = atoi(optarg);
+		if (config->mss <= 0)
+			die("%s: bad --mss: %s\n", where, optarg);
+		break;
+	case OPT_MTU:
+		config->mtu = atoi(optarg);
+		if (config->mtu < 0)
+			die("%s: bad --mtu: %s\n", where, optarg);
+		break;
+	case OPT_NETMASK_IP:
+		strncpy(config->live_netmask_ip_string, optarg, ADDR_STR_LEN-1);
+		break;
+	case OPT_INIT_SCRIPTS:
+		config->init_scripts = optarg;
+		break;
+	case OPT_NON_FATAL:
+		parse_non_fatal_arg(optarg, config);
+		break;
+	case OPT_SPEED:
+		/* The whole argument must be a decimal u32. */
+		speed = strtoul(optarg, &end, 10);
+		if (end == optarg || *end || !is_valid_u32(speed))
+			die("%s: bad --speed: %s\n", where, optarg);
+		config->speed = speed;
+		break;
+	case OPT_TOLERANCE_USECS:
+		config->tolerance_usecs = atoi(optarg);
+		if (config->tolerance_usecs <= 0)
+			die("%s: bad --tolerance_usecs: %s\n", where, optarg);
+		break;
+	case OPT_TCP_TS_ECR_SCALED:
+		config->tcp_ts_ecr_scaled = true;
+		break;
+	case OPT_TCP_TS_TICK_USECS:
+		config->tcp_ts_tick_usecs = atoi(optarg);
+		if (config->tcp_ts_tick_usecs < 0 ||
+		    config->tcp_ts_tick_usecs > 1000000)
+			die("%s: bad --tcp_ts_tick_usecs: %s\n", where, optarg);
+		break;
+	case OPT_STRICT_SEGMENTS:
+		config->strict_segments = true;
+		break;
+	case OPT_WIRE_CLIENT:
+		config->is_wire_client = true;
+		break;
+	case OPT_WIRE_SERVER:
+		config->is_wire_server = true;
+		break;
+	case OPT_WIRE_SERVER_IP:
+		config->wire_server_ip_string = strdup(optarg);
+		config->wire_server_ip =
+			ipv4_parse(config->wire_server_ip_string);
+		break;
+	case OPT_WIRE_SERVER_PORT:
+		config->wire_server_port =
+			parse_port_option(optarg, "wire_server_port", where);
+		break;
+	case OPT_WIRE_CLIENT_DEV:
+		config->wire_client_device = strdup(optarg);
+		break;
+	case OPT_WIRE_SERVER_DEV:
+		config->wire_server_device = strdup(optarg);
+		break;
+	case OPT_SO_FILENAME:
+		config->so_filename = strdup(optarg);
+		break;
+	case OPT_SO_FLAGS:
+		config->so_flags = strdup(optarg);
+		break;
+	case OPT_DRY_RUN:
+		config->dry_run = true;
+		break;
+	case OPT_IS_ANYIP:
+		config->is_anyip = true;
+		break;
+	case OPT_SEND_OMIT_FREE:
+		config->send_omit_free = true;
+		break;
+	case OPT_DEFINE:
+		/* Expect symbol=value with a non-empty symbol; both
+		 * halves are duplicated and handed to the define list.
+		 */
+		equals = strstr(optarg, "=");
+		if (equals == optarg || equals == NULL)
+			die("%s: bad definition: %s\n", where, optarg);
+		symbol = strndup(optarg, equals - optarg);
+		value = strdup(equals + 1);
+		definition_set(&config->defines, symbol, value);
+		break;
+	case OPT_VERBOSE:
+		config->verbose = true;
+		break;
+	default:
+		show_usage();
+		exit(EXIT_FAILURE);
+	}
+}
+
+
+/* Parse command line options. Returns a pointer to the first argument
+ * beyond the options.
+ *
+ * A private, NULL-terminated copy of the arguments is stored in
+ * config->argv before parsing. Running out of memory while making
+ * that copy is fatal.
+ *
+ * NOTE(review): any earlier copy held in config->argv is overwritten
+ * without being released; parse_and_finalize_config() also calls this
+ * function -- confirm whether that copy should be freed first.
+ */
+char **parse_command_line_options(int argc, char *argv[],
+				  struct config *config)
+{
+	int c = 0;
+	int i = 0;
+
+	DEBUGP("parse_command_line_options argc=%d\n", argc);
+	for (i = 0; i < argc; ++i)
+		DEBUGP("argv[%d] = '%s'\n", i, argv[i]);
+
+	/* Make a copy of our arguments for later, in case we need to
+	 * pass our options to a server. We use argc+1 here because,
+	 * following main() calling conventions, we make the array
+	 * element at argv[argc] a NULL pointer.
+	 */
+	config->argv = calloc(argc + 1, sizeof(*config->argv));
+	if (config->argv == NULL)
+		die("%s: out of memory\n", "Command Line");
+
+	/* Never copy past argc entries, even if argv lacks its NULL. */
+	for (i = 0; i < argc && argv[i] != NULL; ++i) {
+		config->argv[i] = strdup(argv[i]);
+		if (config->argv[i] == NULL)
+			die("%s: out of memory\n", "Command Line");
+	}
+
+	/* Parse the arguments. */
+	optind = 0;
+	while ((c = getopt_long(argc, argv, "vD:", options, NULL)) > 0)
+		process_option(c, optarg, config, "Command Line");
+	return argv + optind;
+}
+
+/* Apply every option found inside the script to *config.
+ * Each entry is looked up by name in the global options[] table; an
+ * unknown name, or an argument that is missing or not allowed, is
+ * reported with die() using the script path as the prefix.
+ */
+static void parse_script_options(struct config *config,
+				 struct option_list *option_list)
+{
+	struct option_list *entry;
+
+	for (entry = option_list; entry != NULL; entry = entry->next) {
+		int idx = 0;
+
+		/* Stop on the matching row or on the NULL-name sentinel. */
+		while (options[idx].name != NULL &&
+		       strcmp(options[idx].name, entry->name) != 0)
+			idx++;
+
+		if (options[idx].name == NULL)
+			die("%s: option '%s' unknown\n",
+			    config->script_path, entry->name);
+		if (entry->value && !options[idx].has_arg)
+			die("%s: option '%s' forbids an argument\n",
+			    config->script_path, entry->name);
+		if (!entry->value && options[idx].has_arg)
+			die("%s: option '%s' requires an argument\n",
+			    config->script_path, entry->name);
+
+		process_option(options[idx].val, entry->value, config,
+			       config->script_path);
+	}
+}
+
+/* Callback run by the parser once it has read all --foo=bar options
+ * inside the script. From here on every command line and in-script
+ * option is known, so the configuration can be finalized; in
+ * particular this settles whether TCP packet lines in the script
+ * produce IPv4 or IPv6 packets.
+ */
+void parse_and_finalize_config(struct invocation *invocation)
+{
+	struct config *config = invocation->config;
+
+	DEBUGP("parse_and_finalize_config\n");
+
+	/* Script options are applied first ... */
+	parse_script_options(config, invocation->script->option_list);
+
+	/* ... so that command line options applied afterwards win. */
+	parse_command_line_options(invocation->argc, invocation->argv,
+				   config);
+
+	/* Derive everything that depends on the complete option set. */
+	finalize_config(config);
+}
diff --git a/test/packetdrill/config.h b/test/packetdrill/config.h
new file mode 100644
index 0000000..649a8c4
--- /dev/null
+++ b/test/packetdrill/config.h
@@ -0,0 +1,204 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Configuration information for a test run, and helper functions.
+ */
+
+#ifndef __CONFIG_H__
+#define __CONFIG_H__
+
+#include "types.h"
+
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#include <getopt.h>
+#include "ip_address.h"
+#include "ip_prefix.h"
+#include "script.h"
+
+#define TUN_DRIVER_SPEED_CUR 0 /* don't change current speed */
+#define TUN_DRIVER_DEFAULT_MTU 1500 /* default MTU for tun device */
+
+extern struct option options[];
+
+/* A linked list of symbol->value (FOO=bar) definitions from command line.
+ * Each node owns both of its strings; definition_set() releases the
+ * old value with free() when it replaces one.
+ */
+struct definition {
+ char *symbol; /* name of the symbol; owns the string */
+ char *value; /* value of the symbol; owns the string */
+ struct definition *next; /* link for linked list */
+};
+
+/* Walk the list and return the first node whose symbol equals 'symbol'
+ * (exact strcmp match), or NULL when the list has no such node.
+ */
+static inline struct definition *definition_find(struct definition *defs,
+						 char *symbol)
+{
+	struct definition *node = defs;
+
+	while (node != NULL && strcmp(node->symbol, symbol) != 0)
+		node = node->next;
+	return node;
+}
+
+/* Set the value of the given symbol to the given value.
+ *
+ * Takes ownership of both 'symbol' and 'value', which must be
+ * heap-allocated. If the symbol is already defined, its old value and
+ * the redundant 'symbol' string are freed and the existing node is
+ * updated; otherwise a new node is inserted at the head of *defs.
+ */
+static inline void definition_set(struct definition **defs,
+				  char *symbol, char *value)
+{
+	struct definition *def = definition_find(*defs, symbol);
+
+	if (def) {
+		free(def->value);
+		def->value = value;
+		/* The node keeps its own name; drop the duplicate so
+		 * redefining a symbol does not leak it.
+		 */
+		if (symbol != def->symbol)
+			free(symbol);
+	} else {
+		def = calloc(1, sizeof(struct definition));
+		def->symbol = symbol;
+		def->value = value;
+		def->next = *defs;	/* link to existing entries */
+		*defs = def;		/* insert at head of linked list */
+	}
+}
+
+/* Look up 'symbol' and return its value string, which stays owned by
+ * the list. Returns NULL when the symbol is not defined.
+ */
+static inline char *definition_get(struct definition *defs, char *symbol)
+{
+	struct definition *match = definition_find(defs, symbol);
+
+	if (match == NULL)
+		return NULL;
+	return match->value;
+}
+
+/* Run-time configuration for one test run. Filled with defaults by
+ * set_default_config(), updated from command line and in-script
+ * options, and completed by finalize_config().
+ */
+struct config {
+ const char **argv; /* a copy of process argv */
+
+ enum ip_version_t ip_version; /* v4, v4-mapped-v6, v6 */
+ int socket_domain; /* AF_INET or AF_INET6 */
+ int wire_protocol; /* AF_INET or AF_INET6 */
+
+ u16 live_bind_port; /* local port for bind() */
+ u16 live_connect_port; /* remote port for connect() */
+
+ struct ip_address live_bind_ip; /* address for bind() */
+ struct ip_address live_connect_ip; /* address for connect() */
+
+ struct ip_address live_local_ip; /* local interface IP */
+ struct ip_address live_remote_ip; /* remote interface IP */
+ struct ip_prefix live_remote_prefix; /* remote prefix under test */
+ struct ip_address live_gateway_ip; /* gateway interface IP */
+
+ char live_local_ip_string[ADDR_STR_LEN]; /* human-readable IP */
+ char live_remote_ip_string[ADDR_STR_LEN]; /* human-readable IP */
+ char live_remote_prefix_string[ADDR_STR_LEN]; /* <addr>/<prefixlen> */
+
+ char live_gateway_ip_string[ADDR_STR_LEN]; /* local gateway IP */
+ char live_netmask_ip_string[ADDR_STR_LEN]; /* local netmask */
+
+ int live_prefix_len; /* IPv4/IPv6 interface prefix len */
+
+ int tolerance_usecs; /* tolerance for time divergence */
+ bool tcp_ts_ecr_scaled; /* scale arbitrary inbound TS ECR? */
+ int tcp_ts_tick_usecs; /* microseconds per TS val tick */
+
+ u32 speed; /* speed reported by tun driver;
+ * may require special tun driver
+ */
+ int mss; /* gso_size for GRO packets to tun device */
+ int mtu; /* MTU of tun device */
+
+ bool strict_segments; /* check exact segmentation? */
+
+ bool non_fatal_packet; /* treat packet asserts as non-fatal */
+ bool non_fatal_syscall; /* treat syscall asserts as non-fatal */
+ bool send_omit_free; /* do not call free() */
+
+ bool dry_run; /* parse script but don't execute? */
+
+ bool verbose; /* print detailed debug info? */
+ char *script_path; /* pathname of script file */
+
+ /* Shell command to invoke via system(3) to run post-processing code */
+ char *code_command_line;
+
+ /* Language to emit when generating post-processing code */
+ char *code_format;
+
+ /* Name of the socket option (TCP_INFO, ...) to read for code;
+ * the default "" means auto-detect
+ */
+ char *code_sockopt;
+
+ /* File scripts to run at beginning of test (using system) */
+ char *init_scripts;
+
+ /* For remote on-the-wire testing using a real NIC. */
+ bool is_wire_client; /* use a real NIC and be client? */
+ bool is_wire_server; /* use a real NIC and be server? */
+ char *wire_client_device; /* iface name for send/receive */
+ char *wire_server_device; /* iface name for send/receive */
+ struct ip_address wire_server_ip; /* IP of on-the-wire server */
+ char *wire_server_ip_string; /* malloc-ed server IP string */
+ u16 wire_server_port; /* the port the server listens on */
+
+ /* For testing against a shared object (*.so) file. */
+ char *so_filename;
+ char *so_flags;
+
+ /* For anyip testing */
+ bool is_anyip;
+
+ /* List of FOO=bar definitions from command line. */
+ struct definition *defines;
+};
+
+/* Top-level info about the invocation of a test script.
+ * Bundles the process arguments with the configuration and the parsed
+ * script; parse_and_finalize_config() takes one of these.
+ */
+struct invocation {
+ int argc; /* count of process command line args */
+ char **argv; /* process command line args */
+ struct config *config; /* run-time configuration */
+ struct script *script; /* parse tree of the script to run */
+};
+
+/* Set default configuration */
+extern void set_default_config(struct config *config);
+
+/* Parse the "non-fatal" command line options given the (comma-delimited) string
+ * from the command line. Modifies the associated booleans in the given
+ * config.
+ */
+extern void parse_non_fatal_arg(char *arg, struct config *config);
+
+/* Perform configuration processing that can only be done after we've
+ * seen the full config. For example, we only know how to use IP
+ * addresses after we know if we're doing ipv4, ipv4-mapped-ipv6, or
+ * ipv6. Call this after all options have been parsed.
+ */
+extern void finalize_config(struct config *config);
+
+extern void show_usage(void);
+
+/* Parse command line options. Returns a pointer to the first argument
+ * beyond the options.
+ */
+extern char **parse_command_line_options(int argc, char *argv[],
+ struct config *config);
+
+/* The parser calls this function to finalize processing of config info. */
+extern void parse_and_finalize_config(struct invocation *invocation);
+
+#endif /* __CONFIG_H__ */
diff --git a/test/packetdrill/configure b/test/packetdrill/configure
new file mode 100755
index 0000000..e32ffc6
--- /dev/null
+++ b/test/packetdrill/configure
@@ -0,0 +1,3 @@
+#!/bin/sh
+
+ln -sf Makefile.`uname` Makefile
diff --git a/test/packetdrill/contrib/packetdrill.el b/test/packetdrill/contrib/packetdrill.el
new file mode 100644
index 0000000..3b71058
--- /dev/null
+++ b/test/packetdrill/contrib/packetdrill.el
@@ -0,0 +1,45 @@
+(setq packetdrill-keywords '("sa_family" "sin_port" "sin_addr" "msg_name" "msg_iov" "msg_flags" "fd" "events" "revents" "htons" "icmp" "udp" "inet_addr" "inet6_addr" "ack" "eol" "ecr" "mss" "mtu" "nop" "sack" "sackOK" "TS" "FO" "FOEXP" "val" "win" "wscale" "ect01" "ect0" "ect1" "noecn" "ce"))
+
+(setq packetdrill-constants '("AF_INET" "AF_INET6" "PF_INET" "PF_INET6" "SOCK_STREAM" "SOCK_DGRAM" "IPPROTO_IP" "IPPROTO_IPV6" "IPPROTO_ICMP" "IPPROTO_TCP" "IPPROTO_UDP" "SOL_SOCKET" "SOL_IP" "SOL_IPV6" "SOL_TCP" "SOL_UDP" "SO_ACCEPTCONN" "SO_ATTACH_FILTER" "SO_BINDTODEVICE" "SO_BROADCAST" "SO_BSDCOMPAT" "SO_DEBUG" "SO_DETACH_FILTER" "SO_DONTROUTE" "SO_ERROR" "SO_KEEPALIVE" "SO_LINGER" "SO_NO_CHECK" "SO_OOBINLINE" "SO_PASSCRED" "SO_PEERCRED" "SO_PEERNAME" "SO_PEERSEC" "SO_PRIORITY" "SO_RCVBUF" "SO_RCVLOWAT" "SO_RCVTIMEO" "SO_REUSEADDR" "SO_REUSEPORT" "SO_SECURITY_AUTHENTICATION" "SO_SECURITY_ENCRYPTION_NETWORK" "SO_SECURITY_ENCRYPTION_TRANSPORT" "SO_SNDBUF" "SO_SNDLOWAT" "SO_SNDTIMEO" "SO_TIMESTAMP" "SO_TYPE" "SO_MAX_PACING_RATE" "SO_ZEROCOPY" "IP_TOS" "IP_MTU_DISCOVER" "IP_PMTUDISC_WANT" "IP_PMTUDISC_DONT" "IP_PMTUDISC_DO" "IP_PMTUDISC_PROBE" "IP_MTU" "IPV6_MTU" "TCP_NODELAY" "TCP_MAXSEG" "TCP_CORK" "TCP_KEEPIDLE" "TCP_KEEPINTVL" "TCP_KEEPCNT" "TCP_SYNCNT" "TCP_LINGER2" "TCP_DEFER_ACCEPT" "TCP_INFO" "TCP_QUICKACK" "TCP_CONGESTION" "TCP_MD5SIG" "TCP_COOKIE_TRANSACTIONS" "TCP_THIN_LINEAR_TIMEOUTS" "TCP_THIN_DUPACK" "TCP_USER_TIMEOUT" "TCP_CWND" "TCP_SAVE_SYN" "TCP_SAVED_SYN" "TCP_FASTOPEN" "TCP_FASTOPEN_CONNECT" "TCP_MULTIPLE_CONNECTIONS" "O_RDONLY" "O_WRONLY" "O_RDWR" "O_ACCMODE" "O_CREAT" "O_EXCL" "O_NOCTTY" "O_TRUNC" "O_APPEND" "O_NONBLOCK" "F_DUPFD" "F_GETFD" "F_SETFD" "F_GETFL" "F_SETFL" "F_GETLK" "F_SETLK" "F_SETLKW" "F_GETOWN" "F_SETOWN" "F_SETSIG" "F_GETSIG" "F_GETOWN" "F_SETOWN" "F_SETLK" "F_SETLKW" "F_GETLK" "F_SETLK64" "F_SETLKW64" "F_GETLK64" "F_SETLEASE" "F_GETLEASE" "F_NOTIFY" "F_DUPFD_CLOEXEC" "FD_CLOEXEC" "LOCK_SH" "LOCK_EX" "LOCK_NB" "LOCK_UN" "F_RDLCK" "F_WRLCK" "F_UNLCK" "F_EXLCK" "F_SHLCK" "SEEK_SET" "SEEK_CUR" "SEEK_END" "MSG_OOB" "MSG_DONTROUTE" "MSG_PEEK" "MSG_CTRUNC" "MSG_PROXY" "MSG_EOR" "MSG_WAITALL" "MSG_TRUNC" "MSG_CTRUNC" "MSG_ERRQUEUE" "MSG_DONTWAIT" "MSG_CONFIRM" "MSG_FIN" "MSG_SYN" "MSG_RST" "MSG_NOSIGNAL" "MSG_MORE" 
"MSG_CMSG_CLOEXEC" "MSG_FASTOPEN" "MSG_ZEROCOPY" "SIOCINQ" "FIONREAD" "POLLIN" "POLLPRI" "POLLOUT" "POLLRDNORM" "POLLRDBAND" "POLLWRNORM" "POLLWRBAND" "POLLMSG" "POLLREMOVE" "POLLRDHUP" "POLLERR" "POLLHUP" "POLLNVAL" "EPERM" "ENOENT" "ESRCH" "EINTR" "EIO" "ENXIO" "E2BIG" "ENOEXEC" "EBADF" "ECHILD" "EAGAIN" "ENOMEM" "EACCES" "EFAULT" "ENOTBLK" "EBUSY" "EEXIST" "EXDEV" "ENODEV" "ENOTDIR" "EISDIR" "EINVAL" "ENFILE" "EMFILE" "ENOTTY" "ETXTBSY" "EFBIG" "ENOSPC" "ESPIPE" "EROFS" "EMLINK" "EPIPE" "EDOM" "ERANGE" "EDEADLK" "ENAMETOOLONG" "ENOLCK" "ENOSYS" "ENOTEMPTY" "ELOOP" "EWOULDBLOCK" "ENOMSG" "EIDRM" "ECHRNG" "EL2NSYNC" "EL3HLT" "EL3RST" "ELNRNG" "EUNATCH" "ENOCSI" "EL2HLT" "EBADE" "EBADR" "EXFULL" "ENOANO" "EBADRQC" "EBADSLT" "EDEADLOCK" "EBFONT" "ENOSTR" "ENODATA" "ETIME" "ENOSR" "ENONET" "ENOPKG" "EREMOTE" "ENOLINK" "EADV" "ESRMNT" "ECOMM" "EPROTO" "EMULTIHOP" "EDOTDOT" "EBADMSG" "EOVERFLOW" "ENOTUNIQ" "EBADFD" "EREMCHG" "ELIBACC" "ELIBBAD" "ELIBSCN" "ELIBMAX" "ELIBEXEC" "EILSEQ" "ERESTART" "ESTRPIPE" "EUSERS" "ENOTSOCK" "EDESTADDRREQ" "EMSGSIZE" "EPROTOTYPE" "ENOPROTOOPT" "EPROTONOSUPPORT" "ESOCKTNOSUPPORT" "EOPNOTSUPP" "EPFNOSUPPORT" "EAFNOSUPPORT" "EADDRINUSE" "EADDRNOTAVAIL" "ENETDOWN" "ENETUNREACH" "ENETRESET" "ECONNABORTED" "ECONNRESET" "ENOBUFS" "EISCONN" "ENOTCONN" "ESHUTDOWN" "ETOOMANYREFS" "ETIMEDOUT" "ECONNREFUSED" "EHOSTDOWN" "EHOSTUNREACH" "EALREADY" "EINPROGRESS" "ESTALE" "EUCLEAN" "ENOTNAM" "ENAVAIL" "EISNAM" "EREMOTEIO" "EDQUOT" "ENOMEDIUM" "EMEDIUMTYPE" "ECANCELED" "ENOKEY" "EKEYEXPIRED" "EKEYREVOKED" "EKEYREJECTED" "EOWNERDEAD" "ENOTRECOVERABLE" "ERFKILL" "POLLIN" "POLLPRI" "POLLOUT" "POLLRDNORM" "POLLRDBAND" "POLLWRNORM" "POLLWRBAND" "POLLMSG" "POLLREMOVE" "POLLRDHUP" "POLLERR" "POLLHUP" "POLLNVAL"))
+
+(setq packetdrill-functions '("accept" "bind" "close" "connect" "fcntl" "getsockopt" "ioctl" "listen" "poll" "read" "readv" "recv" "recvfrom" "recvmsg" "send" "sendmsg" "sendto" "setsockopt" "shutdown" "socket" "write" "writev"))
+
+;; build one optimized, word-delimited regexp per class of keywords
+(setq packetdrill-keywords-regexp (regexp-opt packetdrill-keywords 'words)
+      packetdrill-constant-regexp (regexp-opt packetdrill-constants 'words)
+      packetdrill-functions-regexp (regexp-opt packetdrill-functions 'words))
+
+;; the word lists have been compiled into regexps above; release them
+(setq packetdrill-keywords nil
+      packetdrill-constants nil
+      packetdrill-functions nil)
+
+;; create the list for font-lock.
+;; each class of keyword is given a particular face
+;; The three word-class regexps are spliced in by value (backquote/comma),
+;; so this list does not depend on the regexp variables afterwards.
+;; NOTE(review): in the first two patterns "\\n" reaches the regexp engine
+;; as backslash-n rather than as a newline character; if these patterns
+;; are meant to span several lines, confirm and use "\n" instead.
+(setq packetdrill-font-lock-keywords
+ `(
+ ;; text between %{ and }%
+ ("%{\\(.*\\n?\\)*}%" . font-lock-string-face)
+ ;; text between backquotes
+ ("`\\(.*\\n?\\)*`" . font-lock-warning-face)
+ ;; a literal "..."
+ ("\\.\\.\\." . font-lock-type-face)
+ ;; "<" and ">" when surrounded by whitespace
+ ("\\s-<\\s-" . font-lock-warning-face)
+ ("\\s->\\s-" . font-lock-keyword-face)
+ ;; the word classes built by regexp-opt
+ (,packetdrill-constant-regexp . font-lock-constant-face)
+ (,packetdrill-functions-regexp . font-lock-function-name-face)
+ (,packetdrill-keywords-regexp . font-lock-preprocessor-face)
+ ))
+
+;; define the mode
+;; Derived from c-mode; the body runs each time the mode is enabled.
+(define-derived-mode packetdrill-mode c-mode
+  "packetdrill mode"
+  "Major mode for editing packetdrill scripts"
+  ;; code for syntax highlighting
+  (setq font-lock-defaults '((packetdrill-font-lock-keywords)))
+
+  ;; clear memory: the regexps were already spliced by value into
+  ;; packetdrill-font-lock-keywords, so the variables can be released.
+  ;; Only the variables this file actually defines are cleared (the
+  ;; constants regexp is named `packetdrill-constant-regexp').
+  (setq packetdrill-keywords-regexp nil)
+  (setq packetdrill-constant-regexp nil)
+  (setq packetdrill-functions-regexp nil)
+  )
+
+(provide 'packetdrill-mode)
diff --git a/test/packetdrill/contrib/packetdrill.vim b/test/packetdrill/contrib/packetdrill.vim
new file mode 100644
index 0000000..a45da1e
--- /dev/null
+++ b/test/packetdrill/contrib/packetdrill.vim
@@ -0,0 +1,125 @@
+" Vim syntax file
+" Language: Packetdrill
+" Maintainer: Barath Raghavan <barath@google.com>
+" Last Change: 2013 Jul 27
+
+" Quit when a (custom) syntax file was already loaded
+if exists("b:current_syntax")
+ finish
+endif
+
+" Save 'cpoptions' and reset it to the Vim default while this file is
+" sourced; it is restored from s:cpo_save at the end of the file.
+let s:cpo_save = &cpo
+set cpo&vim
+
+syn keyword pKeyword sa_family sin_port sin_addr msg_name msg_iov msg_flags fd events revents htons icmp udp inet_addr ack eol ecr mss mtu nop sack sackOK TS FO FOEXP val win wscale ect01 ect0 ect1 noecn ce
+syn keyword pConstant AF_INET AF_INET6 PF_INET PF_INET6 SOCK_STREAM SOCK_DGRAM IPPROTO_IP IPPROTO_IPV6 IPPROTO_ICMP IPPROTO_TCP IPPROTO_UDP SOL_SOCKET SOL_IP SOL_IPV6 SOL_TCP SOL_UDP SO_ACCEPTCONN SO_ATTACH_FILTER SO_BINDTODEVICE SO_BROADCAST SO_BSDCOMPAT SO_DEBUG SO_DETACH_FILTER SO_DONTROUTE SO_ERROR SO_KEEPALIVE SO_LINGER SO_NO_CHECK SO_OOBINLINE SO_PASSCRED SO_PEERCRED SO_PEERNAME SO_PEERSEC SO_PRIORITY SO_RCVBUF SO_RCVLOWAT SO_RCVTIMEO SO_REUSEADDR SO_REUSEPORT SO_SECURITY_AUTHENTICATION SO_SECURITY_ENCRYPTION_NETWORK SO_SECURITY_ENCRYPTION_TRANSPORT SO_SNDBUF SO_SNDLOWAT SO_SNDTIMEO SO_TIMESTAMP SO_TYPE SO_MAX_PACING_RATE SO_ZEROCOPY IP_TOS IP_MTU_DISCOVER IP_PMTUDISC_WANT IP_PMTUDISC_DONT IP_PMTUDISC_DO IP_PMTUDISC_PROBE IP_MTU IPV6_MTU TCP_NODELAY TCP_MAXSEG TCP_CORK TCP_KEEPIDLE TCP_KEEPINTVL TCP_KEEPCNT TCP_SYNCNT TCP_LINGER2 TCP_DEFER_ACCEPT TCP_INFO TCP_QUICKACK TCP_CONGESTION TCP_MD5SIG TCP_COOKIE_TRANSACTIONS TCP_THIN_LINEAR_TIMEOUTS TCP_THIN_DUPACK TCP_USER_TIMEOUT TCP_CWND TCP_SAVE_SYN TCP_SAVED_SYN TCP_FASTOPEN TCP_MULTIPLE_CONNECTIONS
+syn keyword pConstant O_RDONLY O_WRONLY O_RDWR O_ACCMODE O_CREAT O_EXCL O_NOCTTY O_TRUNC O_APPEND O_NONBLOCK F_DUPFD F_GETFD F_SETFD F_GETFL F_SETFL F_GETLK F_SETLK F_SETLKW F_GETOWN F_SETOWN F_SETSIG F_GETSIG F_GETOWN F_SETOWN F_SETLK F_SETLKW F_GETLK F_SETLK64 F_SETLKW64 F_GETLK64 F_SETLEASE F_GETLEASE F_NOTIFY F_DUPFD_CLOEXEC FD_CLOEXEC LOCK_SH LOCK_EX LOCK_NB LOCK_UN F_RDLCK F_WRLCK F_UNLCK F_EXLCK F_SHLCK SEEK_SET SEEK_CUR SEEK_END MSG_OOB MSG_DONTROUTE MSG_PEEK MSG_CTRUNC MSG_PROXY MSG_EOR MSG_WAITALL MSG_TRUNC MSG_CTRUNC MSG_ERRQUEUE MSG_DONTWAIT MSG_CONFIRM MSG_FIN MSG_SYN MSG_RST MSG_NOSIGNAL MSG_MORE MSG_CMSG_CLOEXEC MSG_FASTOPEN MSG_ZEROCOPY SIOCINQ FIONREAD POLLIN POLLPRI POLLOUT POLLRDNORM POLLRDBAND POLLWRNORM POLLWRBAND POLLMSG POLLREMOVE POLLRDHUP POLLERR POLLHUP POLLNVAL EPERM ENOENT ESRCH EINTR EIO ENXIO E2BIG ENOEXEC EBADF ECHILD EAGAIN ENOMEM EACCES EFAULT ENOTBLK EBUSY EEXIST EXDEV ENODEV ENOTDIR EISDIR EINVAL ENFILE EMFILE ENOTTY ETXTBSY EFBIG ENOSPC ESPIPE EROFS EMLINK EPIPE EDOM ERANGE EDEADLK ENAMETOOLONG ENOLCK ENOSYS ENOTEMPTY ELOOP EWOULDBLOCK ENOMSG EIDRM ECHRNG EL2NSYNC EL3HLT EL3RST ELNRNG EUNATCH ENOCSI EL2HLT EBADE EBADR EXFULL ENOANO EBADRQC EBADSLT EDEADLOCK EBFONT ENOSTR ENODATA ETIME ENOSR ENONET ENOPKG EREMOTE ENOLINK EADV ESRMNT ECOMM EPROTO EMULTIHOP EDOTDOT EBADMSG EOVERFLOW ENOTUNIQ EBADFD EREMCHG ELIBACC ELIBBAD ELIBSCN ELIBMAX ELIBEXEC EILSEQ ERESTART ESTRPIPE EUSERS ENOTSOCK EDESTADDRREQ EMSGSIZE EPROTOTYPE ENOPROTOOPT EPROTONOSUPPORT ESOCKTNOSUPPORT EOPNOTSUPP EPFNOSUPPORT EAFNOSUPPORT EADDRINUSE EADDRNOTAVAIL ENETDOWN ENETUNREACH ENETRESET ECONNABORTED ECONNRESET ENOBUFS EISCONN ENOTCONN ESHUTDOWN ETOOMANYREFS ETIMEDOUT ECONNREFUSED EHOSTDOWN EHOSTUNREACH EALREADY EINPROGRESS ESTALE EUCLEAN ENOTNAM ENAVAIL EISNAM EREMOTEIO EDQUOT ENOMEDIUM EMEDIUMTYPE ECANCELED ENOKEY EKEYEXPIRED EKEYREVOKED EKEYREJECTED EOWNERDEAD ENOTRECOVERABLE ERFKILL POLLIN POLLPRI POLLOUT POLLRDNORM POLLRDBAND POLLWRNORM POLLWRBAND POLLMSG 
POLLREMOVE POLLRDHUP POLLERR POLLHUP POLLNVAL
+syn keyword pSyscall accept bind close connect fcntl getsockopt ioctl listen poll read readv recv recvfrom recvmsg send sendmsg sendto setsockopt shutdown socket write writev
+" Python snippets (%{ ... }%) and the commands highlighted inside them
+syn keyword pPythonCmds contained assert print
+syn region pPython start='%{' end='}%' contains=pPythonCmds
+" Shell snippets (` ... `) and the commands highlighted inside them
+syn keyword pShellCmds contained sysctl
+syn region pShell start='`' end='`' contains=pShellCmds
+" The ellipsis is not made of keyword characters, so ':syn keyword' can
+" never match it; use a pattern match for the literal three dots.
+syn match pEllipsis "\.\.\."
+" Packet direction markers: a lone < or > surrounded by whitespace
+syn match pInputPkt "\s\+\zs<\ze\s\+"
+syn match pOutputPkt "\s\+\zs>\ze\s\+"
+
+" Below is stuff inherited from C, suitably modified.
+" String and Character constants
+" Highlight special characters (those which have a backslash) differently
+syn match cSpecial display contained "\\\(x\x\+\|\o\{1,3}\|.\|$\)"
+syn match cFormat display "%\(\d\+\$\)\=[-+' #0*]*\(\d*\|\*\|\*\d\+\$\)\(\.\(\d*\|\*\|\*\d\+\$\)\)\=\([hlLjzt]\|ll\|hh\)\=\([aAbdiuoxXDOUfFeEgGcCsSpn]\|\[\^\=.[^]]*\]\)" contained
+syn match cFormat display "%%" contained
+syn region cString start=+L\="+ skip=+\\\\\|\\"+ end=+"+ contains=cSpecial,cFormat,@Spell
+" cCppString: same as cString, but ends at end of line
+syn region cCppString start=+L\="+ skip=+\\\\\|\\"\|\\$+ excludenl end=+"+ end='$' contains=cSpecial,cFormat,@Spell
+
+" This should be before cErrInParen to avoid problems with #define ({ xxx })
+syn match cCurlyError "}"
+syn region cBlock start="{" end="}" contains=ALLBUT,cBadBlock,cCurlyError,@cParenGroup,cErrInParen,cCppParen,cErrInBracket,cCppBracket,cCppString,@Spell fold
+
+"catch errors caused by wrong parenthesis and brackets
+" also accept <% for {, %> for }, <: for [ and :> for ] (C99)
+" But avoid matching <::.
+syn cluster cParenGroup contains=cParenError,cSpecial,cCommentSkip,cCommentString,cComment2String,@cCommentGroup,cCommentStartError,cUserCont,cBitField,cOctalZero,@cCppOutInGroup,cFormat,cNumber,cFloat,cOctal,cOctalError,cNumbersCom
+syn region cParen transparent start='(' end=')' end='}'me=s-1 contains=ALLBUT,@cParenGroup,cCppParen,cErrInBracket,cCppBracket,cCppString,@Spell
+" cCppParen: same as cParen but ends at end-of-line; used in cDefine
+syn region cCppParen transparent start='(' skip='\\$' excludenl end=')' end='$' contained contains=ALLBUT,@cParenGroup,cErrInBracket,cParen,cBracket,cString,@Spell
+syn match cParenError display "[\])]"
+"syn match cErrInParen display contained "[\]{}]\|<%\|%>"
+syn region cBracket transparent start='\[\|<::\@!' end=']\|:>' end='}'me=s-1 contains=ALLBUT,@cParenGroup,cErrInParen,cCppParen,cCppBracket,cCppString,@Spell
+" cCppBracket: same as cParen but ends at end-of-line; used in cDefine
+syn region cCppBracket transparent start='\[\|<::\@!' skip='\\$' excludenl end=']\|:>' end='$' contained contains=ALLBUT,@cParenGroup,cErrInParen,cParen,cBracket,cString,@Spell
+"syn match cErrInBracket display contained "[);{}]\|<%\|%>"
+
+"integer number, or floating point number without a dot and with "f".
+syn case ignore
+syn match cNumbers display transparent "\<\d\|\.\d" contains=cNumber,cFloat,cOctalError,cOctal
+" Same, but without octal error (for comments)
+syn match cNumbersCom display contained transparent "\<\d\|\.\d" contains=cNumber,cFloat,cOctal
+syn match cNumber display contained "\d\+\(u\=l\{0,2}\|ll\=u\)\>"
+"hex number
+syn match cNumber display contained "0x\x\+\(u\=l\{0,2}\|ll\=u\)\>"
+" Flag the first zero of an octal number as something special
+syn match cOctal display contained "0\o\+\(u\=l\{0,2}\|ll\=u\)\>" contains=cOctalZero
+syn match cOctalZero display contained "\<0"
+syn match cFloat display contained "\d\+f"
+"floating point number, with dot, optional exponent
+syn match cFloat display contained "\d\+\.\d*\(e[-+]\=\d\+\)\=[fl]\="
+"floating point number, starting with a dot, optional exponent
+syn match cFloat display contained "\.\d\+\(e[-+]\=\d\+\)\=[fl]\=\>"
+"floating point number, without dot, with exponent
+syn match cFloat display contained "\d\+e[-+]\=\d\+[fl]\=\>"
+
+" flag an octal number with wrong digits
+syn match cOctalError display contained "0\o*[89]\d*"
+syn case match
+
+syn region cCommentL start="//" skip="\\$" end="$" keepend contains=@cCommentGroup,cSpaceError,@Spell
+syn region cComment matchgroup=cCommentStart start="/\*" end="\*/" contains=@cCommentGroup,cCommentStartError,cSpaceError,@Spell extend
+
+" keep a // comment separately, it terminates a preproc. conditional
+syn match cCommentError display "\*/"
+syn match cCommentStartError display "/\*"me=e-1 contained
+
+" Define the default highlighting.
+" Only used when an item doesn't have highlighting yet
+hi def link pKeyword Conditional
+hi def link pConstant Constant
+hi def link pSyscall Type
+hi def link pPythonCmds Label
+hi def link pPython PreProc
+hi def link pShellCmds Label
+hi def link pShell PreCondit
+hi def link pEllipsis String
+hi def link pInputPkt Todo
+hi def link pOutputPkt Error
+
+hi def link cFormat cSpecial
+hi def link cCppString cString
+hi def link cCommentL cComment
+hi def link cCommentStart cComment
+hi def link cNumber Number
+hi def link cOctal Number
+hi def link cOctalZero PreProc " link this to Error if you want
+hi def link cFloat Float
+hi def link cOctalError cError
+hi def link cParenError cError
+hi def link cErrInParen cError
+hi def link cErrInBracket cError
+hi def link cCommentError cError
+hi def link cCommentStartError cError
+hi def link cSpecialError cError
+hi def link cError Error
+hi def link cCommentString cString
+hi def link cComment2String cString
+hi def link cCommentSkip cComment
+hi def link cString String
+hi def link cComment Comment
+hi def link cSpecial SpecialChar
+hi def link cCppOut Comment
+
+let b:current_syntax = "packetdrill"
+
+let &cpo = s:cpo_save
+unlet s:cpo_save
+" vim: ts=8
diff --git a/test/packetdrill/epoll.c b/test/packetdrill/epoll.c
new file mode 100644
index 0000000..5e3e79c
--- /dev/null
+++ b/test/packetdrill/epoll.c
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: weiwan@google.com (Wei Wang)
+ *
+ * Implementation for the epoll fd related state and logic.
+ */
+
+#include "epoll.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include "run.h"
+
+/* Scrub and release an epoll object.
+ *
+ * A NULL pointer is accepted and ignored: epoll_close() hands over
+ * whatever fd_to_epoll() returns, which is NULL for a missing or
+ * non-epoll fd, and memset() must not be called on NULL.
+ */
+void epoll_free(struct epoll *epoll)
+{
+	if (epoll == NULL)
+		return;
+
+	memset(epoll, 0, sizeof(*epoll)); /* paranoia to help catch bugs */
+	free(epoll);
+}
+
+/* fd_ops close handler for epoll descriptors: converts the generic fd
+ * back into its epoll object and frees it. The state argument is unused.
+ */
+void epoll_close(struct state *state, struct fd_state *fd)
+{
+	struct epoll *epoll = fd_to_epoll(fd);
+
+	epoll_free(epoll);
+}
+
+/* Global info about epoll descriptors that point to epolls.
+ * epoll_new() points the fd.ops of every epoll object it creates at
+ * this table.
+ */
+struct fd_ops epoll_ops = {
+ .type = FD_EPOLL,
+ .close = epoll_close,
+};
+
+/* Allocate a zeroed epoll object, attach the epoll fd_ops table, and
+ * register the object with the given state via state_add_fd().
+ *
+ * Returns the new object, or NULL if memory could not be allocated (in
+ * which case nothing is registered).
+ */
+struct epoll *epoll_new(struct state *state)
+{
+	struct epoll *epoll = calloc(1, sizeof(*epoll));
+
+	if (epoll == NULL)
+		return NULL;
+
+	epoll->fd.ops = &epoll_ops;
+	state_add_fd(state, to_fd(epoll));
+	return epoll;
+}
diff --git a/test/packetdrill/epoll.h b/test/packetdrill/epoll.h
new file mode 100644
index 0000000..dac032a
--- /dev/null
+++ b/test/packetdrill/epoll.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: weiwan@google.com (Wei Wang)
+ *
+ * Interface for tracking epolls in the kernel under test.
+ */
+
+#ifndef __EPOLL_HDR_H__
+#define __EPOLL_HDR_H__
+
+#include "types.h"
+
+#include "fd_state.h"
+
+/* Type specification for epoll_event->data.
+ * Enumerators start at 1, so a zero value is not a valid type.
+ * NOTE(review): the names presumably mirror the ptr/fd/u32/u64 members
+ * of the epoll_data union -- confirm at the use sites.
+ */
+enum epoll_data_type_t {
+ EPOLL_DATA_PTR = 1,
+ EPOLL_DATA_FD,
+ EPOLL_DATA_U32,
+ EPOLL_DATA_U64,
+};
+
+/* The runtime state for epoll.
+ * The embedded fd_state is what lets fd_to_epoll() cast a
+ * struct fd_state pointer directly to a struct epoll pointer.
+ */
+struct epoll {
+ /* NOTE: struct fd_state must be first field in all fd flavors. */
+ struct fd_state fd; /* info about fd for this epoll event */
+};
+
+/* Downcast a generic fd to its epoll object. Yields NULL when fd is
+ * NULL or when its ops table says it is not an FD_EPOLL descriptor.
+ */
+static inline struct epoll *fd_to_epoll(struct fd_state *fd)
+{
+	if (fd == NULL || fd->ops->type != FD_EPOLL)
+		return NULL;
+
+	return (struct epoll *)fd;
+}
+
+struct state;
+
+/* Allocate and return a new epoll object. */
+extern struct epoll *epoll_new(struct state *state);
+
+#endif /* __EPOLL_HDR_H__ */
diff --git a/test/packetdrill/ethernet.h b/test/packetdrill/ethernet.h
new file mode 100644
index 0000000..5713d04
--- /dev/null
+++ b/test/packetdrill/ethernet.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Ethernet-related declarations.
+ *
+ * We cannot include the kernel's linux/if_ether.h because this tool
+ * tries to compile and work for basically any Linux/BSD kernel
+ * version. So we have our version of the Ethernet-related
+ * declarations we require here.
+ */
+
+#ifndef __ETHERNET_H__
+#define __ETHERNET_H__
+
+#include "types.h"
+
+/* Bytes in an Ethernet address. */
+#define ETH_ALEN 6
+
+/* Ethernet header ether_type values. */
+#define ETHERTYPE_IP 0x0800 /* IP protocol version 4 */
+#define ETHERTYPE_IPV6 0x86dd /* IP protocol version 6 */
+#define ETHERTYPE_MPLS_UC 0x8847 /* MPLS unicast */
+#define ETHERTYPE_MPLS_MC 0x8848 /* MPLS multicast */
+
+/* To tell a packet socket that you want traffic for all protocols. */
+#define ETH_P_ALL 0x0003
+
+/* Ethernet address: ETH_ALEN raw octets. ether_copy() copies
+ * sizeof(struct ether_addr) bytes, i.e. exactly one such address.
+ */
+struct ether_addr {
+ u8 ether_addr_octet[ETH_ALEN];
+} __attribute__ ((__packed__));
+
+/* Ethernet header: destination address, source address, then the
+ * 16-bit type field. Packed, so the members are laid out back to back.
+ * NOTE(review): ether_type is presumably kept in network byte order --
+ * confirm at the use sites.
+ */
+struct ether_header {
+ u8 ether_dhost[ETH_ALEN]; /* destination Ethernet address */
+ u8 ether_shost[ETH_ALEN]; /* source Ethernet address */
+ u16 ether_type; /* packet type ID field */
+} __attribute__ ((__packed__));
+
+/* Copy one Ethernet address (sizeof(struct ether_addr) bytes) from src
+ * to dst. The copy is done with memcpy(), so the buffers must not
+ * overlap.
+ */
+static inline void ether_copy(void *dst, const void *src)
+{
+	const size_t addr_len = sizeof(struct ether_addr);
+
+	memcpy(dst, src, addr_len);
+}
+
+/* Return the ether_type field for packets of the given address family:
+ * ETHERTYPE_IP for AF_INET, ETHERTYPE_IPV6 for AF_INET6.
+ *
+ * Any other family is a caller bug and trips the assertion. The final
+ * return makes every path of this non-void function yield a value, so
+ * builds with assertions compiled out (NDEBUG) do not run off the end.
+ */
+static inline u16 ether_type_for_family(int address_family)
+{
+	if (address_family == AF_INET)
+		return ETHERTYPE_IP;
+	if (address_family == AF_INET6)
+		return ETHERTYPE_IPV6;
+
+	assert(!"bad address family");
+	return 0;
+}
+
+#endif /* __ETHERNET_H__ */
diff --git a/test/packetdrill/fd_state.h b/test/packetdrill/fd_state.h
new file mode 100644
index 0000000..f08a559
--- /dev/null
+++ b/test/packetdrill/fd_state.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Interface for tracking file descriptors in the kernel under test.
+ */
+
+#ifndef __FD_STATE_H__
+#define __FD_STATE_H__
+
+#include "types.h"
+
+/* The types of file descriptor objects packetdrill can test.
+ * The value is stored in fd_ops.type. Enumerators start at 1, so a
+ * zeroed fd_ops does not match any type.
+ */
+enum fd_type_t {
+ FD_SOCKET = 1,
+ FD_FILE,
+ FD_PIPE,
+ FD_EPOLL,
+};
+
+struct state;
+struct fd_state;
+
+/* Global info about a particular kind of file descriptor.
+ * Each fd_state points at one of these through its ops member.
+ */
+struct fd_ops {
+ enum fd_type_t type; /* type of this file descriptor */
+
+ /* Handler for closing fd. */
+ void (*close)(struct state *state, struct fd_state *fd);
+};
+
+/* State for a file descriptor during script execution.
+ * This is the common part of every fd object: ops identifies the kind
+ * of fd, and next chains the objects into a singly linked list.
+ */
+struct fd_state {
+ struct fd_ops *ops; /* info/ops for this type of fd */
+ int script_fd; /* file descriptor in the script source */
+ int live_fd; /* file descriptor in packetdrill runtime */
+ bool is_closed; /* has app called close(2) ? */
+ struct fd_state *next; /* next fd in linked list */
+};
+
+/* To cast any type of fd to the base class.
+ * This is a plain pointer cast with no checking: the caller must pass
+ * an object whose first member is a struct fd_state.
+ */
+static inline struct fd_state *to_fd(void *fd)
+{
+ return (struct fd_state *)fd;
+}
+
+#endif /* __FD_STATE_H__ */
diff --git a/test/packetdrill/file.c b/test/packetdrill/file.c
new file mode 100644
index 0000000..56e6cf3
--- /dev/null
+++ b/test/packetdrill/file.c
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Implementation for the file-related state and logic.
+ */
+
+#include "file.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include "run.h"
+
+/* Scrub and release a file object.
+ *
+ * A NULL pointer is accepted and ignored: file_close() hands over
+ * whatever fd_to_file() returns, which is NULL for a missing or
+ * non-file fd, and memset() must not be called on NULL.
+ */
+void file_free(struct file *file)
+{
+	if (file == NULL)
+		return;
+
+	memset(file, 0, sizeof(*file)); /* paranoia to help catch bugs */
+	free(file);
+}
+
+/* fd_ops close handler for file descriptors: converts the generic fd
+ * back into its file object and frees it. The state argument is unused.
+ */
+void file_close(struct state *state, struct fd_state *fd)
+{
+	struct file *file = fd_to_file(fd);
+
+	file_free(file);
+}
+
+/* Global info about file descriptors that point to files.
+ * file_new() points the fd.ops of every file object it creates at this
+ * table.
+ */
+struct fd_ops file_ops = {
+ .type = FD_FILE,
+ .close = file_close,
+};
+
+/* Allocate a zeroed file object, attach the file fd_ops table, and
+ * register the object with the given state via state_add_fd().
+ *
+ * Returns the new object, or NULL if memory could not be allocated (in
+ * which case nothing is registered).
+ */
+struct file *file_new(struct state *state)
+{
+	struct file *file = calloc(1, sizeof(*file));
+
+	if (file == NULL)
+		return NULL;
+
+	file->fd.ops = &file_ops;
+	state_add_fd(state, to_fd(file));
+	return file;
+}
diff --git a/test/packetdrill/file.h b/test/packetdrill/file.h
new file mode 100644
index 0000000..22d084f
--- /dev/null
+++ b/test/packetdrill/file.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Interface for tracking files in the kernel under test.
+ */
+
+#ifndef __FILE_H__
+#define __FILE_H__
+
+#include "types.h"
+
+#include "fd_state.h"
+
+/* The runtime state for a file.
+ * The embedded fd_state is what lets fd_to_file() cast a
+ * struct fd_state pointer directly to a struct file pointer.
+ */
+struct file {
+ /* NOTE: struct fd_state must be first field in all fd flavors. */
+ struct fd_state fd; /* info about fd for this file */
+};
+
+/* Downcast a generic fd to its file object. Yields NULL when fd is
+ * NULL or when its ops table says it is not an FD_FILE descriptor.
+ */
+static inline struct file *fd_to_file(struct fd_state *fd)
+{
+	if (fd == NULL || fd->ops->type != FD_FILE)
+		return NULL;
+
+	return (struct file *)fd;
+}
+
+struct state;
+
+/* Allocate and return a new file object. */
+extern struct file *file_new(struct state *state);
+
+#endif /* __FILE_H__ */
diff --git a/test/packetdrill/fmemopen.c b/test/packetdrill/fmemopen.c
new file mode 100644
index 0000000..5ed2be5
--- /dev/null
+++ b/test/packetdrill/fmemopen.c
@@ -0,0 +1,81 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * FreeBSD does not have an fmemopen(), so we roll our own minimalist
+ * implementation here.
+ */
+
+#include "types.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "assert.h"
+#include "fmemopen.h"
+
+#if !defined(HAVE_FMEMOPEN)
+
+/* Read cursor over the caller's buffer; used as the stream cookie.
+ * fmemopen() sets next to the start of the buffer and end to start +
+ * size; fmemopen_readfn() advances next as bytes are consumed.
+ */
+struct fmemopen_read_state {
+ char *next; /* the next byte to return */
+ char *end; /* the byte after the end of the string */
+};
+
+/* Stream read callback: copy up to len of the remaining bytes into buf
+ * and advance the cursor. Returns the number of bytes copied, which is
+ * 0 once the cursor has reached the end of the buffer.
+ */
+static int fmemopen_readfn(void *cookie, char *buf, int len)
+{
+	struct fmemopen_read_state *rs = cookie;
+	int avail;
+
+	assert(rs->next <= rs->end);
+
+	avail = rs->end - rs->next;
+	if (avail == 0)
+		return 0;
+
+	/* Never hand back more than the caller asked for. */
+	if (avail > len)
+		avail = len;
+
+	memcpy(buf, rs->next, avail);
+	rs->next += avail;
+
+	return avail;
+}
+
+/* Stream close callback: release the read cookie allocated by
+ * fmemopen(). The caller's buffer is not owned by the stream and is
+ * left untouched.
+ */
+static int fmemopen_closefn(void *cookie)
+{
+	free(cookie);
+	return 0;
+}
+
+/* Minimal fmemopen() replacement: open a read-only stream over the
+ * size bytes starting at str. Only mode "r" is supported (enforced by
+ * assertion).
+ *
+ * Returns the new stream, or NULL if the cookie cannot be allocated or
+ * funopen() fails. The cookie is freed when the stream is closed.
+ */
+FILE *fmemopen(char *str, size_t size, const char *mode)
+{
+	struct fmemopen_read_state *read_cookie;
+	FILE *f;
+
+	assert(strcmp(mode, "r") == 0); /* only support read for now */
+
+	read_cookie = calloc(1, sizeof(*read_cookie));
+	if (read_cookie == NULL)
+		return NULL;
+
+	read_cookie->next = str;
+	read_cookie->end = str + size;
+
+	/* Same as fropen(), plus a close hook so fclose() frees the cookie. */
+	f = funopen(read_cookie, fmemopen_readfn, NULL, NULL,
+		    fmemopen_closefn);
+	if (f == NULL) {
+		free(read_cookie);
+		return NULL;
+	}
+
+	return f;
+}
+
+#endif /* HAVE_FMEMOPEN */
diff --git a/test/packetdrill/fmemopen.h b/test/packetdrill/fmemopen.h
new file mode 100644
index 0000000..f4bdbbb
--- /dev/null
+++ b/test/packetdrill/fmemopen.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * FreeBSD does not have an fmemopen(), so we roll our own minimalist
+ * implementation here.
+ */
+
+#ifndef __FMEMOPEN_H__
+#define __FMEMOPEN_H__
+
+#ifndef HAVE_FMEMOPEN
+
+#include "types.h"
+
+extern FILE *fmemopen(char *buf, size_t size, const char *mode);
+
+#endif /* HAVE_FMEMOPEN */
+
+#endif /* __FMEMOPEN_H__ */
diff --git a/test/packetdrill/gre.h b/test/packetdrill/gre.h
new file mode 100644
index 0000000..8947ccd
--- /dev/null
+++ b/test/packetdrill/gre.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Our own GRE header declarations, so we have something that's
+ * portable and somewhat more readable than a typical system header
+ * file.
+ *
+ * We cannot include the kernel's GRE .h files because this tool tries
+ * to compile and work for basically any Linux/BSD kernel version. So
+ * we declare our own version of various GRE-related definitions here.
+ */
+
+#ifndef __GRE_HEADERS_H__
+#define __GRE_HEADERS_H__
+
+#include "types.h"
+
+/* GRE header. See RFC 1701. */
+
+#define GRE_MINLEN 4 /* smallest possible GRE header */
+
+/* Masks for the "optional field present" bits of the 16-bit GRE flags
+ * word. NOTE(review): presumably meant for the flags value after
+ * conversion to host byte order -- confirm against the users.
+ */
+#define GRE_FLAG_C 0x8000 /* checksum */
+#define GRE_FLAG_R 0x4000 /* routing */
+#define GRE_FLAG_K 0x2000 /* key */
+#define GRE_FLAG_S 0x1000 /* sequence */
+
+/* GRE header layout: a 16-bit flags/version word (accessible either
+ * whole, via 'flags', or as individual bit-fields), the 16-bit
+ * protocol type, and space for the optional fields.
+ */
+struct gre {
+ union {
+ __be16 flags;
+
+ /* The same 16 bits as 'flags', split into bit-fields. The
+ * declaration order is reversed within each byte depending on
+ * the host byte order.
+ */
+ struct {
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+ __u16 recursion_control:3,
+ strict_route:1,
+ has_seq:1,
+ has_key:1,
+ has_routing:1,
+ has_checksum:1,
+ version:3,
+ reserved:4,
+ ack:1;
+#elif __BYTE_ORDER == __BIG_ENDIAN
+ __u16 has_checksum:1,
+ has_routing:1,
+ has_key:1,
+ has_seq:1,
+ strict_route:1,
+ recursion_control:3,
+ ack:1,
+ reserved:4,
+ version:3;
+#else
+# error "Please fix endianness defines"
+#endif
+ };
+ };
+ __be16 proto;
+
+ /* The optional header fields live here: 12 bytes, viewable as
+ * either 16-bit or 32-bit units.
+ */
+ union {
+ __be16 be16[6];
+ __be32 be32[3];
+ };
+};
+
+/* Compute the size in bytes of a GRE header from its flag bits:
+ * GRE_MINLEN plus 4 bytes for each optional field that is present.
+ * Only version 0 headers without routing info are supported (asserted).
+ */
+static inline int gre_len(const struct gre *gre)
+{
+	int optional_fields = 0;	/* number of 4-byte optional fields */
+
+	assert(gre->version == 0);	/* we only support v0 */
+	assert(!gre->has_routing);	/* routing info is variable-length! */
+
+	/* One 4-byte field is counted if either of these bits is set. */
+	if (gre->has_checksum || gre->has_routing)
+		optional_fields++;
+	if (gre->has_key)
+		optional_fields++;
+	if (gre->has_seq)
+		optional_fields++;
+
+	return GRE_MINLEN + 4 * optional_fields;
+}
+
+#endif /* __GRE_HEADERS_H__ */
diff --git a/test/packetdrill/gre_packet.c b/test/packetdrill/gre_packet.c
new file mode 100644
index 0000000..235e6d9
--- /dev/null
+++ b/test/packetdrill/gre_packet.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Implementation for module for formatting GRE packets.
+ */
+
+#include "gre_packet.h"
+
+#include "ip_packet.h"
+#include "gre.h"
+
+/* Append a copy of the given GRE header (gre_len(gre) bytes) to the
+ * packet. Returns STATUS_OK on success. If packet_append_header()
+ * fails, returns STATUS_ERR with *error set to a malloc-allocated
+ * message, or to NULL if the message itself could not be allocated.
+ */
+int gre_header_append(struct packet *packet, const struct gre *gre, char **error)
+{
+	const int gre_bytes = gre_len(gre);
+	struct header *header;
+
+	header = packet_append_header(packet, HEADER_GRE, gre_bytes);
+	if (header == NULL) {
+		/* asprintf() leaves *error undefined when it fails. */
+		if (asprintf(error, "too many headers") < 0)
+			*error = NULL;
+		return STATUS_ERR;
+	}
+
+	memcpy(header->h.gre, gre, gre_bytes);
+
+	return STATUS_OK;
+}
+
+/* Finish a GRE header once the header directly inside it is known:
+ * set the GRE protocol field to the inner header's Ethernet protocol
+ * code and record the total length (GRE header plus everything
+ * inside). Always returns STATUS_OK.
+ */
+int gre_header_finish(struct packet *packet,
+		      struct header *header, struct header *next_inner)
+{
+	struct gre *const gre_hdr = header->h.gre;
+	const int total = gre_len(gre_hdr) + next_inner->total_bytes;
+	const struct header_type_info *inner_info =
+		header_type_info(next_inner->type);
+
+	gre_hdr->proto = htons(inner_info->eth_proto);
+	header->total_bytes = total;
+
+	return STATUS_OK;
+}
diff --git a/test/packetdrill/gre_packet.h b/test/packetdrill/gre_packet.h
new file mode 100644
index 0000000..bceee2d
--- /dev/null
+++ b/test/packetdrill/gre_packet.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Interface for module for formatting GRE packets.
+ */
+
+#ifndef __GRE_PACKET_H__
+#define __GRE_PACKET_H__
+
+#include "types.h"
+
+#include "packet.h"
+
+/* Append a copy of the GRE header 'gre' to the end of the given
+ * packet. On success, return STATUS_OK; on error return STATUS_ERR
+ * and fill in a malloc-allocated error message in *error (the caller
+ * presumably frees it -- confirm against callers).
+ */
+extern int gre_header_append(struct packet *packet,
+ const struct gre *gre, char **error);
+
+/* Finalize the GRE header by filling in all necessary fields that
+ * were not filled in at parse time. 'header' is the GRE header and
+ * 'next_inner' is the header encapsulated directly inside it.
+ */
+extern int gre_header_finish(struct packet *packet,
+ struct header *header, struct header *next_inner);
+
+#endif /* __GRE_PACKET_H__ */
diff --git a/test/packetdrill/hash.c b/test/packetdrill/hash.c
new file mode 100644
index 0000000..7cfcd1a
--- /dev/null
+++ b/test/packetdrill/hash.c
@@ -0,0 +1,430 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*---------------------------------------------------------------------------
+ * From public domain code at:
+ * http://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp
+ */
+
+/*---------------------------------------------------------------------------
+ * MurmurHash3 was written by Austin Appleby, and is placed in the public
+ * domain. The author hereby disclaims copyright to this source code.
+ *
+ * Note - The x86 and x64 versions do _not_ produce the same results, as the
+ * algorithms are optimized for their respective platforms. You can still
+ * compile and run any of them on any platform, but your performance with the
+ * non-native version will be less than optimal.
+ */
+#include "hash.h"
+
+/*---------------------------------------------------------------------------
+ * Platform-specific functions and macros
+ */
+
+/* Rotate the 32-bit value x left by r bits. r must be in 1..31: the
+ * complementary right shift would be by 32 bits for r == 0.
+ */
+static __always_inline u32 rotl32(u32 x, s8 r)
+{
+	const u32 upper = x << r;
+	const u32 lower = x >> (32 - r);
+
+	return upper | lower;
+}
+
+/* Rotate the 64-bit value x left by r bits. r must be in 1..63: the
+ * complementary right shift would be by 64 bits for r == 0.
+ */
+static __always_inline u64 rotl64(u64 x, s8 r)
+{
+	const u64 upper = x << r;
+	const u64 lower = x >> (64 - r);
+
+	return upper | lower;
+}
+
+/* Upper-case aliases for the rotate helpers above. */
+#define ROTL32(x, y) rotl32(x, y)
+#define ROTL64(x, y) rotl64(x, y)
+
+/* Give a 64-bit integer constant the unsigned long long suffix. */
+#define BIG_CONSTANT(x) (x##LLU)
+
+/*---------------------------------------------------------------------------
+ * Block read - if your platform needs to do endian-swapping or can only
+ * handle aligned reads, do the conversion here
+ */
+
+/* Read the i-th 32-bit block relative to p (i may be negative). The
+ * value is returned as stored: no byte swapping or alignment fix-up.
+ */
+static __always_inline u32 getblock_32(const u32 *p, int i)
+{
+	const u32 *block = p + i;
+
+	return *block;
+}
+
+/* Read the i-th 64-bit block relative to p. The value is returned as
+ * stored: no byte swapping or alignment fix-up.
+ */
+static __always_inline u64 getblock_64(const u64 *p, int i)
+{
+	const u64 *block = p + i;
+
+	return *block;
+}
+
+/*---------------------------------------------------------------------------
+ * Finalization mix - force all bits of a hash block to avalanche
+ */
+
+/* 32-bit finalization mix: alternate xor-shift and multiply steps. */
+static __always_inline u32 fmix_32(u32 h)
+{
+	h = (h ^ (h >> 16)) * 0x85ebca6b;
+	h = (h ^ (h >> 13)) * 0xc2b2ae35;
+
+	return h ^ (h >> 16);
+}
+
+/*---------*/
+
+/* 64-bit finalization mix: alternate xor-shift and multiply steps. */
+static __always_inline u64 fmix_64(u64 k)
+{
+	k = (k ^ (k >> 33)) * BIG_CONSTANT(0xff51afd7ed558ccd);
+	k = (k ^ (k >> 33)) * BIG_CONSTANT(0xc4ceb9fe1a85ec53);
+
+	return k ^ (k >> 33);
+}
+
+/*---------------------------------------------------------------------------*/
+
+/* Compute the 32-bit MurmurHash3 (x86 variant) of the 'len' bytes at
+ * 'key' using the given seed, and store the u32 result at 'out'.
+ */
+void MurmurHash3_x86_32(const void *key, int len, u32 seed, void *out)
+{
+	const u8 *bytes = (const u8 *)key;
+	const int num_words = len / 4;
+	const u32 *words = (const u32 *)bytes;
+	const u8 *rest = bytes + num_words * 4;
+	const u32 mul1 = 0xcc9e2d51;
+	const u32 mul2 = 0x1b873593;
+	u32 hash = seed;
+	u32 last = 0;
+	int w;
+
+	/* body: mix in every complete 4-byte word, first to last */
+	for (w = 0; w < num_words; w++) {
+		u32 word = getblock_32(words, w);
+
+		word *= mul1;
+		word = ROTL32(word, 15);
+		word *= mul2;
+
+		hash ^= word;
+		hash = ROTL32(hash, 13);
+		hash = hash * 5 + 0xe6546b64;
+	}
+
+	/* tail: gather the 1-3 leftover bytes, if any */
+	switch (len & 3) {
+	case 3:
+		last ^= rest[2] << 16;
+		/* fall through */
+	case 2:
+		last ^= rest[1] << 8;
+		/* fall through */
+	case 1:
+		last ^= rest[0];
+		last *= mul1;
+		last = ROTL32(last, 15);
+		last *= mul2;
+		hash ^= last;
+	}
+
+	/* finalization: fold in the length and avalanche */
+	hash ^= len;
+	hash = fmix_32(hash);
+
+	*(u32 *)out = hash;
+}
+
+/*---------------------------------------------------------------------------*/
+
+/* Compute the 128-bit MurmurHash3 (x86 variant) of the 'len' bytes at
+ * 'key' using the given seed, and store the result at 'out' as four
+ * consecutive u32 values.
+ */
+void MurmurHash3_x86_128(const void *key, const int len, u32 seed, void *out)
+{
+	const u8 *data = (const u8 *)key;
+	const int nblocks = len / 16;
+
+	u32 h1 = seed;
+	u32 h2 = seed;
+	u32 h3 = seed;
+	u32 h4 = seed;
+
+	u32 c1 = 0x239b961b;
+	u32 c2 = 0xab0e9789;
+	u32 c3 = 0x38b34ae5;
+	u32 c4 = 0xa1e38b93;
+
+	/*---------*/
+	/* body */
+
+	/* 'blocks' points just past the last full 16-byte block, so the
+	 * loop walks the blocks with negative indexes up to zero.
+	 */
+	const u32 *blocks = (const u32 *)(data + nblocks * 16);
+
+	int i;
+	for (i = -nblocks; i; i++) {
+		u32 k1 = getblock_32(blocks, i * 4 + 0);
+		u32 k2 = getblock_32(blocks, i * 4 + 1);
+		u32 k3 = getblock_32(blocks, i * 4 + 2);
+		u32 k4 = getblock_32(blocks, i * 4 + 3);
+
+		k1 *= c1;
+		k1 = ROTL32(k1, 15);
+		k1 *= c2;
+		h1 ^= k1;
+
+		h1 = ROTL32(h1, 19);
+		h1 += h2;
+		h1 = h1 * 5 + 0x561ccd1b;
+
+		k2 *= c2;
+		k2 = ROTL32(k2, 16);
+		k2 *= c3;
+		h2 ^= k2;
+
+		h2 = ROTL32(h2, 17);
+		h2 += h3;
+		h2 = h2 * 5 + 0x0bcaa747;
+
+		k3 *= c3;
+		k3 = ROTL32(k3, 17);
+		k3 *= c4;
+		h3 ^= k3;
+
+		h3 = ROTL32(h3, 15);
+		h3 += h4;
+		h3 = h3 * 5 + 0x96cd1c35;
+
+		k4 *= c4;
+		k4 = ROTL32(k4, 18);
+		k4 *= c1;
+		h4 ^= k4;
+
+		h4 = ROTL32(h4, 13);
+		h4 += h1;
+		h4 = h4 * 5 + 0x32ac3b17;
+	}
+
+	/*---------*/
+	/* tail */
+
+	const u8 *tail = (const u8 *)(data + nblocks * 16);
+
+	u32 k1 = 0;
+	u32 k2 = 0;
+	u32 k3 = 0;
+	u32 k4 = 0;
+
+	/* Each tail byte is widened to u32 before shifting: a u8 promotes
+	 * to int, and shifting a byte >= 0x80 left by 24 would overflow a
+	 * signed int. Every case intentionally falls through.
+	 */
+	switch (len & 15) {
+	case 15:
+		k4 ^= (u32)tail[14] << 16;
+		/* fall through */
+	case 14:
+		k4 ^= (u32)tail[13] << 8;
+		/* fall through */
+	case 13:
+		k4 ^= (u32)tail[12] << 0;
+		k4 *= c4;
+		k4 = ROTL32(k4, 18);
+		k4 *= c1;
+		h4 ^= k4;
+		/* fall through */
+
+	case 12:
+		k3 ^= (u32)tail[11] << 24;
+		/* fall through */
+	case 11:
+		k3 ^= (u32)tail[10] << 16;
+		/* fall through */
+	case 10:
+		k3 ^= (u32)tail[9] << 8;
+		/* fall through */
+	case 9:
+		k3 ^= (u32)tail[8] << 0;
+		k3 *= c3;
+		k3 = ROTL32(k3, 17);
+		k3 *= c4;
+		h3 ^= k3;
+		/* fall through */
+
+	case 8:
+		k2 ^= (u32)tail[7] << 24;
+		/* fall through */
+	case 7:
+		k2 ^= (u32)tail[6] << 16;
+		/* fall through */
+	case 6:
+		k2 ^= (u32)tail[5] << 8;
+		/* fall through */
+	case 5:
+		k2 ^= (u32)tail[4] << 0;
+		k2 *= c2;
+		k2 = ROTL32(k2, 16);
+		k2 *= c3;
+		h2 ^= k2;
+		/* fall through */
+
+	case 4:
+		k1 ^= (u32)tail[3] << 24;
+		/* fall through */
+	case 3:
+		k1 ^= (u32)tail[2] << 16;
+		/* fall through */
+	case 2:
+		k1 ^= (u32)tail[1] << 8;
+		/* fall through */
+	case 1:
+		k1 ^= (u32)tail[0] << 0;
+		k1 *= c1;
+		k1 = ROTL32(k1, 15);
+		k1 *= c2;
+		h1 ^= k1;
+	}
+
+	/*---------*/
+	/* finalization */
+
+	h1 ^= len;
+	h2 ^= len;
+	h3 ^= len;
+	h4 ^= len;
+
+	h1 += h2;
+	h1 += h3;
+	h1 += h4;
+	h2 += h1;
+	h3 += h1;
+	h4 += h1;
+
+	h1 = fmix_32(h1);
+	h2 = fmix_32(h2);
+	h3 = fmix_32(h3);
+	h4 = fmix_32(h4);
+
+	h1 += h2;
+	h1 += h3;
+	h1 += h4;
+	h2 += h1;
+	h3 += h1;
+	h4 += h1;
+
+	((u32 *) out)[0] = h1;
+	((u32 *) out)[1] = h2;
+	((u32 *) out)[2] = h3;
+	((u32 *) out)[3] = h4;
+}
+
+/*---------------------------------------------------------------------------*/
+
+/* Compute the 128-bit MurmurHash3 (x64 variant) of the 'len' bytes at
+ * 'key' using the given seed, and store the result at 'out' as two
+ * consecutive u64 values.
+ */
+void MurmurHash3_x64_128(const void *key, const int len,
+ const u32 seed, void *out)
+{
+ const u8 *data = (const u8 *)key;
+ const int nblocks = len / 16;
+
+ u64 h1 = seed;
+ u64 h2 = seed;
+
+ u64 c1 = BIG_CONSTANT(0x87c37b91114253d5);
+ u64 c2 = BIG_CONSTANT(0x4cf5ad432745937f);
+
+ /*---------*/
+ /* body */
+
+ /* Each 16-byte block is consumed as two consecutive u64 values. */
+ const u64 *blocks = (const u64 *)(data);
+
+ int i;
+ for (i = 0; i < nblocks; i++) {
+ u64 k1 = getblock_64(blocks, i * 2 + 0);
+ u64 k2 = getblock_64(blocks, i * 2 + 1);
+
+ k1 *= c1;
+ k1 = ROTL64(k1, 31);
+ k1 *= c2;
+ h1 ^= k1;
+
+ h1 = ROTL64(h1, 27);
+ h1 += h2;
+ h1 = h1 * 5 + 0x52dce729;
+
+ k2 *= c2;
+ k2 = ROTL64(k2, 33);
+ k2 *= c1;
+ h2 ^= k2;
+
+ h2 = ROTL64(h2, 31);
+ h2 += h1;
+ h2 = h2 * 5 + 0x38495ab5;
+ }
+
+ /*---------*/
+ /* tail */
+
+ const u8 *tail = (const u8 *)(data + nblocks * 16);
+
+ u64 k1 = 0;
+ u64 k2 = 0;
+
+ /* Gather the 1-15 leftover bytes: bytes 8-14 go into k2 and bytes
+ * 0-7 into k1. Every case intentionally falls through to the next.
+ */
+ switch (len & 15) {
+ case 15:
+ k2 ^= (u64) (tail[14]) << 48;
+ case 14:
+ k2 ^= (u64) (tail[13]) << 40;
+ case 13:
+ k2 ^= (u64) (tail[12]) << 32;
+ case 12:
+ k2 ^= (u64) (tail[11]) << 24;
+ case 11:
+ k2 ^= (u64) (tail[10]) << 16;
+ case 10:
+ k2 ^= (u64) (tail[9]) << 8;
+ case 9:
+ k2 ^= (u64) (tail[8]) << 0;
+ k2 *= c2;
+ k2 = ROTL64(k2, 33);
+ k2 *= c1;
+ h2 ^= k2;
+
+ case 8:
+ k1 ^= (u64) (tail[7]) << 56;
+ case 7:
+ k1 ^= (u64) (tail[6]) << 48;
+ case 6:
+ k1 ^= (u64) (tail[5]) << 40;
+ case 5:
+ k1 ^= (u64) (tail[4]) << 32;
+ case 4:
+ k1 ^= (u64) (tail[3]) << 24;
+ case 3:
+ k1 ^= (u64) (tail[2]) << 16;
+ case 2:
+ k1 ^= (u64) (tail[1]) << 8;
+ case 1:
+ k1 ^= (u64) (tail[0]) << 0;
+ k1 *= c1;
+ k1 = ROTL64(k1, 31);
+ k1 *= c2;
+ h1 ^= k1;
+ };
+
+ /*---------*/
+ /* finalization */
+
+ /* Fold in the length, cross-mix the halves, then avalanche each. */
+ h1 ^= len;
+ h2 ^= len;
+
+ h1 += h2;
+ h2 += h1;
+
+ h1 = fmix_64(h1);
+ h2 = fmix_64(h2);
+
+ h1 += h2;
+ h2 += h1;
+
+ ((u64 *) out)[0] = h1;
+ ((u64 *) out)[1] = h2;
+}
+
+/*---------------------------------------------------------------------------*/
diff --git a/test/packetdrill/hash.h b/test/packetdrill/hash.h
new file mode 100644
index 0000000..ab2ba52
--- /dev/null
+++ b/test/packetdrill/hash.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/* From: http://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.h */
+
+/*---------------------------------------------------------------------------
+ * MurmurHash3 was written by Austin Appleby, and is placed in the public
+ * domain. The author hereby disclaims copyright to this source code.
+ */
+
+#ifndef _MURMURHASH3_H_
+#define _MURMURHASH3_H_
+
+#include "types.h"
+
+#include <stdint.h>
+
+/*---------------------------------------------------------------------------*/
+
+/* Each function hashes the 'len' bytes at 'key' with the given seed
+ * and writes the result through 'out'.
+ */
+
+/* 32-bit hash; 'out' must have room for one u32. */
+void MurmurHash3_x86_32(const void *key, int len, u32 seed, void *out);
+
+/* 128-bit hash; 'out' must have room for four u32 values. */
+void MurmurHash3_x86_128(const void *key, int len, u32 seed, void *out);
+
+/* 128-bit hash; 'out' must have room for two u64 values. */
+void MurmurHash3_x64_128(const void *key, int len, u32 seed, void *out);
+
+/*---------------------------------------------------------------------------*/
+
+#endif /* _MURMURHASH3_H_ */
diff --git a/test/packetdrill/hash_map.c b/test/packetdrill/hash_map.c
new file mode 100644
index 0000000..c18af55
--- /dev/null
+++ b/test/packetdrill/hash_map.c
@@ -0,0 +1,162 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Implementation for a simple hash map mapping u32 keys to u32 values.
+ */
+
+#include "hash_map.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include "hash.h"
+
+/* Upper limit on the bucket count: the initial sizing stops doubling
+ * here, and the map is not grown once it has this many buckets.
+ */
+static const size_t MAX_BUCKETS = 1ULL << 30; /* max 1B buckets */
+
+/* Hash a key with the public-domain MurmurHash3 (x86, 32-bit variant),
+ * using a seed of 0.
+ */
+static inline size_t hash_key(u32 key)
+{
+	u32 digest;
+
+	MurmurHash3_x86_32(&key, sizeof(key), 0, &digest);
+
+	return digest;
+}
+
+/* Map a key to its bucket index: the key's hash masked down to the
+ * current (power-of-2) number of buckets.
+ */
+static inline size_t hash_bucket_num(const struct hash_map *map, u32 key)
+{
+	return hash_key(key) & map->bucket_mask;
+}
+
+/* Return the smallest power of 2 that is at least the given number of
+ * keys. The result is never less than 1, and doubling stops once
+ * MAX_BUCKETS is reached.
+ */
+static inline size_t hash_map_pick_bucket_count(size_t num_keys)
+{
+	size_t count;
+
+	for (count = 1; count < num_keys && count < MAX_BUCKETS; count <<= 1)
+		;	/* keep doubling */
+
+	return count;
+}
+
+/* Allocate an empty hash map whose bucket array is sized for about
+ * 'num_keys' keys. Returns NULL if memory cannot be allocated.
+ */
+struct hash_map *hash_map_new(size_t num_keys)
+{
+	struct hash_map *map = calloc(1, sizeof(*map));
+
+	if (map == NULL)
+		return NULL;
+
+	map->num_buckets = hash_map_pick_bucket_count(num_keys);
+	map->bucket_mask = map->num_buckets - 1;
+	map->buckets = calloc(map->num_buckets, sizeof(*map->buckets));
+	if (map->buckets == NULL) {
+		free(map);	/* don't leak the half-built map */
+		return NULL;
+	}
+
+	return map;
+}
+
+/* Free every node, the bucket array, and the map itself. Passing NULL
+ * is a no-op, as with free().
+ */
+void hash_map_free(struct hash_map *map)
+{
+	size_t bucket_num;	/* same type as map->num_buckets */
+
+	if (map == NULL)
+		return;
+
+	/* Walk through the buckets and free nodes. */
+	for (bucket_num = 0; bucket_num < map->num_buckets; ++bucket_num) {
+		struct hash_node *node = map->buckets[bucket_num];
+
+		while (node != NULL) {
+			/* Save the link before the node goes away. */
+			struct hash_node *next = node->next;
+
+			free(node);
+			node = next;
+		}
+	}
+
+	free(map->buckets);
+	memset(map, 0, sizeof(*map));	/* paranoia to help catch bugs */
+	free(map);
+}
+
+/* Push the given node onto the front of the bucket list selected by
+ * its key.
+ */
+static void hash_map_link(struct hash_map *map,
+			  struct hash_node *node)
+{
+	struct hash_node **head = &map->buckets[hash_bucket_num(map, node->key)];
+
+	node->next = *head;
+	*head = node;
+}
+
+/* Create a new array of buckets that's twice the size of the current
+ * array. Then walk through the old buckets and move all the nodes to
+ * the new buckets. If the new array cannot be allocated the map is
+ * left unchanged and keeps working with its current buckets.
+ */
+static void hash_map_grow(struct hash_map *map)
+{
+	const size_t old_num_buckets = map->num_buckets;
+	const size_t new_num_buckets = old_num_buckets * 2;
+	struct hash_node **old_buckets = map->buckets;
+	struct hash_node **new_buckets;
+	size_t old_bucket_num;
+
+	/* Allocate before touching the map so a failure is harmless. */
+	new_buckets = calloc(new_num_buckets, sizeof(*new_buckets));
+	if (new_buckets == NULL)
+		return;
+
+	map->buckets = new_buckets;
+	map->num_buckets = new_num_buckets;
+	map->bucket_mask = new_num_buckets - 1;
+
+	for (old_bucket_num = 0; old_bucket_num < old_num_buckets;
+	     ++old_bucket_num) {
+		struct hash_node *node = old_buckets[old_bucket_num];
+
+		while (node != NULL) {
+			/* Relinking overwrites node->next; save it first. */
+			struct hash_node *next = node->next;
+
+			hash_map_link(map, node);
+			node = next;
+		}
+	}
+
+	free(old_buckets);
+}
+
+/* Add a node for a key that is not yet in the map, doubling the bucket
+ * array first when the map is at a load factor of 1.0 and still below
+ * MAX_BUCKETS.
+ * NOTE(review): the node allocation is not checked for failure.
+ */
+static void hash_map_insert(struct hash_map *map, u32 key, u32 value)
+{
+	const bool is_full = map->num_keys >= map->num_buckets;
+	const bool can_grow = map->num_buckets < MAX_BUCKETS;
+	struct hash_node *node;
+
+	/* To keep things simple, we target a load factor of 1.0. */
+	if (is_full && can_grow)
+		hash_map_grow(map);
+
+	map->num_keys++;
+
+	node = calloc(1, sizeof(*node));
+	node->key = key;
+	node->value = value;
+	hash_map_link(map, node);
+}
+
+/* Set the value for a key: overwrite the value if the key is already
+ * present, otherwise insert a new node.
+ */
+void hash_map_set(struct hash_map *map, u32 key, u32 value)
+{
+	struct hash_node *node = map->buckets[hash_bucket_num(map, key)];
+
+	/* Scan the key's bucket list for an existing entry. */
+	while (node != NULL && node->key != key)
+		node = node->next;
+
+	if (node != NULL)
+		node->value = value;
+	else
+		hash_map_insert(map, key, value);
+}
+
+/* Look up a key. If found, store its value in *value and return true;
+ * otherwise return false and leave *value untouched.
+ */
+bool hash_map_get(const struct hash_map *map, u32 key, u32 *value)
+{
+	const struct hash_node *node = map->buckets[hash_bucket_num(map, key)];
+
+	/* Scan the key's bucket list for a matching entry. */
+	while (node != NULL && node->key != key)
+		node = node->next;
+
+	if (node == NULL)
+		return false;
+
+	*value = node->value;
+	return true;
+}
diff --git a/test/packetdrill/hash_map.h b/test/packetdrill/hash_map.h
new file mode 100644
index 0000000..f6805e2
--- /dev/null
+++ b/test/packetdrill/hash_map.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Interface and data structure declarations for a simple hash map
+ * mapping u32 keys to u32 values.
+ */
+
+#ifndef __HASH_MAP_H__
+#define __HASH_MAP_H__
+
+#include "types.h"
+
+/* Node for hash table buckets; maps u32 key to u32 value. Nodes that
+ * share a bucket form a singly linked list.
+ */
+struct hash_node {
+ u32 key;
+ u32 value;
+ struct hash_node *next; /* next node in the same bucket, or NULL */
+};
+
+/* Hash map mapping u32 to u32, using an array of linked-list buckets. */
+struct hash_map {
+ size_t num_keys; /* number of keys */
+ size_t num_buckets; /* number of buckets (a power of 2) */
+ size_t bucket_mask; /* bit mask to find bucket number */
+ struct hash_node **buckets; /* array of hash buckets */
+};
+
+/* Allocate an empty map with enough buckets for about 'num_keys' keys.
+ * Release it with hash_map_free().
+ */
+extern struct hash_map *hash_map_new(size_t num_keys);
+
+/* Free the map and all of its nodes; 'map' must not be used afterward. */
+extern void hash_map_free(struct hash_map *map);
+
+/* Map 'key' to 'value', replacing any value previously set for 'key'. */
+extern void hash_map_set(struct hash_map *map,
+ u32 key, u32 value);
+
+/* Look up 'key'. If it is present, store its value in *value and
+ * return true; otherwise return false without writing *value.
+ */
+extern bool hash_map_get(const struct hash_map *map,
+ u32 key, u32 *value);
+
+#endif /* __HASH_MAP_H__ */
diff --git a/test/packetdrill/header.h b/test/packetdrill/header.h
new file mode 100644
index 0000000..bfd339f
--- /dev/null
+++ b/test/packetdrill/header.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Interface and type declarations for packetdrill's representation of
+ * packet headers. We support multi-layer encapsulation. In order to
+ * make it easier to iterate through all the headers in a packet, we
+ * keep separate, explicit metadata about the types and locations of
+ * headers in a packet.
+ */
+
+#ifndef __HEADER_H__
+#define __HEADER_H__
+
+#include "types.h"
+
+#include <sys/time.h>
+#include "assert.h"
+#include "gre.h"
+#include "icmp.h"
+#include "icmpv6.h"
+#include "ip.h"
+#include "ipv6.h"
+#include "mpls.h"
+#include "tcp.h"
+#include "udp.h"
+
+struct packet;
+
+/* The type of a header in a packet. */
+enum header_t {
+ HEADER_NONE,
+ HEADER_IPV4,
+ HEADER_IPV6,
+ HEADER_GRE,
+ HEADER_MPLS,
+ HEADER_TCP,
+ HEADER_UDP,
+ HEADER_ICMPV4,
+ HEADER_ICMPV6,
+ HEADER_NUM_TYPES /* number of values above; must stay last */
+};
+
+/* Metadata about a header in a packet. We support multi-layer encapsulation. */
+struct header {
+ enum header_t type; /* type of this header */
+ u32 header_bytes; /* length of this header */
+ u32 total_bytes; /* length of header plus data inside */
+ /* Differently typed views of one pointer to the header; use the
+ * member that matches 'type'.
+ */
+ union {
+ u8 *ptr; /* a pointer to the header bits */
+ struct ipv4 *ipv4;
+ struct ipv6 *ipv6;
+ struct gre *gre;
+ struct mpls *mpls;
+ struct tcp *tcp;
+ struct udp *udp;
+ struct icmpv4 *icmpv4;
+ struct icmpv6 *icmpv6;
+ } h;
+};
+
+/* Info for a particular type of header. */
+struct header_type_info {
+ const char* name; /* human-readable protocol name */
+ u8 ip_proto; /* IP protocol code */
+ u16 eth_proto; /* Ethernet protocol code */
+
+ /* Call this to finalize the header once we know what's inside...
+ * 'header' is the header to finalize and 'next_inner' is the
+ * header encapsulated directly inside it.
+ */
+ int (*finish)(struct packet *packet,
+ struct header *header, struct header *next_inner);
+};
+
+/* Return the info (name, protocol codes, finish hook) for the given
+ * type of header.
+ */
+extern struct header_type_info *header_type_info(enum header_t header_type);
+
+#endif /* __HEADER_H__ */
diff --git a/test/packetdrill/icmp.h b/test/packetdrill/icmp.h
new file mode 100644
index 0000000..a35c8b5
--- /dev/null
+++ b/test/packetdrill/icmp.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Our own ICMPv4 header declarations, so we have something that's
+ * portable and somewhat more readable than a typical system header
+ * file.
+ */
+
+#ifndef __ICMP_HEADERS_H__
+#define __ICMP_HEADERS_H__
+
+#include "types.h"
+
+/* Most ICMP message types include a copy of the outbound IP header
+ * and the first few bytes inside, to allow the receiver to demux by
+ * TCP/UDP port. The following constant specifies the number of bytes
+ * of TCP header that we will echo. We echo 8 bytes because that
+ * is the minimum number of bytes that the Linux TCP stack needs to
+ * read the source and destination TCP port and TCP sequence number,
+ * which it needs to properly demux an incoming ICMP packet to a
+ * specific TCP connection.
+ */
+#define ICMP_ECHO_BYTES 8
+
+/* ICMPv4 header: type, code and checksum, followed by 4 bytes whose
+ * interpretation depends on the kind of message.
+ */
+struct icmpv4 {
+ __u8 type;
+ __u8 code;
+ __sum16 checksum;
+ union {
+ struct {
+ __be16 id;
+ __be16 sequence;
+ } echo;
+ __be32 gateway;
+ struct {
+ __be16 unused;
+ __be16 mtu;
+ } frag; /* PMTU discovery, RFC 1191 */
+ } message;
+};
+
+/* Our own ICMP definitions, since the names vary between platforms. */
+
+/* ICMPv4 types: values for the 'type' byte of struct icmpv4. */
+#define ICMP_ECHOREPLY 0
+#define ICMP_DEST_UNREACH 3
+#define ICMP_SOURCE_QUENCH 4
+#define ICMP_REDIRECT 5
+#define ICMP_ECHO 8
+#define ICMP_TIME_EXCEEDED 11
+#define ICMP_PARAMETERPROB 12
+#define ICMP_TIMESTAMP 13
+#define ICMP_TIMESTAMPREPLY 14
+#define ICMP_INFO_REQUEST 15
+#define ICMP_INFO_REPLY 16
+#define ICMP_ADDRESS 17
+#define ICMP_ADDRESSREPLY 18
+#define NR_ICMP_TYPES 18 /* highest type value defined above */
+
+/* Codes for ICMP_DEST_UNREACH: values for the 'code' byte. */
+#define ICMP_NET_UNREACH 0
+#define ICMP_HOST_UNREACH 1
+#define ICMP_PROT_UNREACH 2
+#define ICMP_PORT_UNREACH 3
+#define ICMP_FRAG_NEEDED 4
+#define ICMP_SR_FAILED 5
+#define ICMP_NET_UNKNOWN 6
+#define ICMP_HOST_UNKNOWN 7
+#define ICMP_HOST_ISOLATED 8
+#define ICMP_NET_ANO 9
+#define ICMP_HOST_ANO 10
+#define ICMP_NET_UNR_TOS 11
+#define ICMP_HOST_UNR_TOS 12
+#define ICMP_PKT_FILTERED 13
+#define ICMP_PREC_VIOLATION 14
+#define ICMP_PREC_CUTOFF 15
+#define NR_ICMP_UNREACH 15 /* highest code value defined above */
+
+#endif /* __ICMP_HEADERS_H__ */
diff --git a/test/packetdrill/icmp_packet.c b/test/packetdrill/icmp_packet.c
new file mode 100644
index 0000000..6dc5f9b
--- /dev/null
+++ b/test/packetdrill/icmp_packet.c
@@ -0,0 +1,406 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Implementation for module for formatting ICMP packets.
+ */
+
+#include "icmp_packet.h"
+
+#include "icmp.h"
+#include "icmpv6.h"
+#include "ip_packet.h"
+
+/* A table entry mapping an ICMP code string to byte. Code tables are
+ * terminated by an entry whose code_string is NULL.
+ */
+struct icmp_code_info {
+ u8 code_byte; /* code byte on the wire */
+ const char *code_string; /* human-readable code */
+};
+
+/* A table entry mapping an ICMP type string to byte and code table.
+ * Type tables are terminated by an entry whose type_string is NULL.
+ * code_table is NULL for types that have no named codes.
+ */
+struct icmp_type_info {
+ u8 type_byte; /* type byte on the wire */
+ const char *type_string; /* human-readable type */
+ const struct icmp_code_info *code_table; /* codes for this type */
+};
+
+/* Names accepted for the 'code' byte of an IPv4 ICMP_DEST_UNREACH
+ * header (RFC 1700). The entry with a NULL string ends the table.
+ */
+struct icmp_code_info icmpv4_unreachable_codes[] = {
+	{ .code_byte = ICMP_NET_UNREACH, .code_string = "net_unreachable" },
+	{ .code_byte = ICMP_HOST_UNREACH, .code_string = "host_unreachable" },
+	{ .code_byte = ICMP_PROT_UNREACH, .code_string = "protocol_unreachable" },
+	{ .code_byte = ICMP_PORT_UNREACH, .code_string = "port_unreachable" },
+	{ .code_byte = ICMP_FRAG_NEEDED, .code_string = "frag_needed" },
+	{ .code_byte = ICMP_SR_FAILED, .code_string = "source_route_failed" },
+	{ .code_byte = ICMP_NET_UNKNOWN, .code_string = "net_unknown" },
+	{ .code_byte = ICMP_HOST_UNKNOWN, .code_string = "host_unknown" },
+	{ .code_byte = ICMP_HOST_ISOLATED, .code_string = "source_host_isolated" },
+	{ .code_byte = ICMP_NET_ANO, .code_string = "net_prohibited" },
+	{ .code_byte = ICMP_HOST_ANO, .code_string = "host_prohibited" },
+	{ .code_byte = ICMP_NET_UNR_TOS, .code_string = "net_unreachable_for_tos" },
+	{ .code_byte = ICMP_HOST_UNR_TOS, .code_string = "host_unreachable_for_tos" },
+	{ .code_byte = ICMP_PKT_FILTERED, .code_string = "packet_filtered" },
+	{ .code_byte = ICMP_PREC_VIOLATION, .code_string = "precedence_violation" },
+	{ .code_byte = ICMP_PREC_CUTOFF, .code_string = "precedence_cutoff" },
+	{ .code_byte = 0, .code_string = NULL },
+};
+
+/* The ICMPv4 message types this tool knows by name (RFC 1700). Only
+ * "unreachable" has a table of named codes. The entry with a NULL
+ * string ends the table.
+ */
+struct icmp_type_info icmpv4_types[] = {
+	{ .type_byte = ICMP_ECHOREPLY, .type_string = "echo_reply" },
+	{ .type_byte = ICMP_DEST_UNREACH, .type_string = "unreachable",
+	  .code_table = icmpv4_unreachable_codes },
+	{ .type_byte = ICMP_SOURCE_QUENCH, .type_string = "source_quench" },
+	{ .type_byte = ICMP_REDIRECT, .type_string = "redirect" },
+	{ .type_byte = ICMP_ECHO, .type_string = "echo_request" },
+	{ .type_byte = ICMP_TIME_EXCEEDED, .type_string = "time_exceeded" },
+	{ .type_byte = ICMP_PARAMETERPROB, .type_string = "parameter_problem" },
+	{ .type_byte = ICMP_TIMESTAMP, .type_string = "timestamp_request" },
+	{ .type_byte = ICMP_TIMESTAMPREPLY, .type_string = "timestamp_reply" },
+	{ .type_byte = ICMP_INFO_REQUEST, .type_string = "information_request" },
+	{ .type_byte = ICMP_INFO_REPLY, .type_string = "information_reply" },
+	{ .type_byte = ICMP_ADDRESS, .type_string = "address_mask_request" },
+	{ .type_byte = ICMP_ADDRESSREPLY, .type_string = "address_mask_reply" },
+	{ .type_byte = 0, .type_string = NULL, .code_table = NULL },
+};
+
+/* Names accepted for the 'code' byte of an ICMPV6_DEST_UNREACH header
+ * (RFC 2463). The first entry reuses the ICMPv4 constant
+ * ICMP_NET_UNREACH, so "net_unreachable" and "no_route" both name
+ * code 0. The entry with a NULL string ends the table.
+ */
+struct icmp_code_info icmpv6_unreachable_codes[] = {
+	{ .code_byte = ICMP_NET_UNREACH, .code_string = "net_unreachable" },
+	{ .code_byte = ICMPV6_NOROUTE, .code_string = "no_route" },
+	{ .code_byte = ICMPV6_ADM_PROHIBITED, .code_string = "admin_prohibited" },
+	{ .code_byte = ICMPV6_NOT_NEIGHBOUR, .code_string = "not_neighbour" },
+	{ .code_byte = ICMPV6_ADDR_UNREACH, .code_string = "address_unreachable" },
+	{ .code_byte = ICMPV6_PORT_UNREACH, .code_string = "port_unreachable" },
+	{ .code_byte = 0, .code_string = NULL },
+};
+
+/* Names accepted for the 'code' byte of an ICMPV6_TIME_EXCEED header
+ * (RFC 2463). The entry with a NULL string ends the table.
+ */
+struct icmp_code_info icmpv6_time_exceed_codes[] = {
+	{ .code_byte = ICMPV6_EXC_HOPLIMIT, .code_string = "exceeded_hop_limit" },
+	{ .code_byte = ICMPV6_EXC_FRAGTIME, .code_string = "exceeded_frag_time" },
+	{ .code_byte = 0, .code_string = NULL },
+};
+
+/* Names accepted for the 'code' byte of an ICMPV6_PARAMPROB header
+ * (RFC 2463). The entry with a NULL string ends the table.
+ */
+struct icmp_code_info icmpv6_paramprob_codes[] = {
+	{ .code_byte = ICMPV6_HDR_FIELD, .code_string = "header_field" },
+	{ .code_byte = ICMPV6_UNK_NEXTHDR, .code_string = "unknown_next_header" },
+	{ .code_byte = ICMPV6_UNK_OPTION, .code_string = "unknown_option" },
+	{ .code_byte = 0, .code_string = NULL },
+};
+
+/* The ICMPv6 message types this tool knows by name (RFC 2463), each
+ * with its table of named codes where one exists. The entry with a
+ * NULL string ends the table.
+ */
+struct icmp_type_info icmpv6_types[] = {
+	{ .type_byte = ICMPV6_DEST_UNREACH, .type_string = "unreachable",
+	  .code_table = icmpv6_unreachable_codes },
+	{ .type_byte = ICMPV6_PKT_TOOBIG, .type_string = "packet_too_big" },
+	{ .type_byte = ICMPV6_TIME_EXCEED, .type_string = "time_exceeded",
+	  .code_table = icmpv6_time_exceed_codes },
+	{ .type_byte = ICMPV6_PARAMPROB, .type_string = "parameter_problem",
+	  .code_table = icmpv6_paramprob_codes },
+	{ .type_byte = ICMPV6_ECHO_REQUEST, .type_string = "echo_request" },
+	{ .type_byte = ICMPV6_ECHO_REPLY, .type_string = "echo_reply" },
+	{ .type_byte = 0, .type_string = NULL, .code_table = NULL },
+};
+
+/* Map an address family to the IP protocol number of its ICMP flavor.
+ * Any family other than AF_INET/AF_INET6 trips an assertion; if
+ * assertions are compiled out, 0 is returned.
+ */
+static int icmp_protocol(int address_family)
+{
+	switch (address_family) {
+	case AF_INET:
+		return IPPROTO_ICMP;
+	case AF_INET6:
+		return IPPROTO_ICMPV6;
+	default:
+		assert(!"bad ip version");
+		return 0;
+	}
+}
+
+/* Size in bytes of the fixed ICMP header for the given address family.
+ * Any family other than AF_INET/AF_INET6 trips an assertion; if
+ * assertions are compiled out, 0 is returned.
+ */
+static int icmp_header_len(int address_family)
+{
+	switch (address_family) {
+	case AF_INET:
+		return sizeof(struct icmpv4);
+	case AF_INET6:
+		return sizeof(struct icmpv6);
+	default:
+		assert(!"bad ip version");
+		return 0;
+	}
+}
+
+/* Write type, code and a zero checksum into an ICMPv4 header, plus the
+ * optional MTU and echo identifier.
+ *
+ * A non-negative 'mtu' is accepted only for unreachable/frag_needed
+ * and must fit in 16 bits; a negative 'mtu' means "no MTU". The echo
+ * identifier is stored only when 'echo_id' is non-zero. On failure,
+ * fills in *error and returns STATUS_ERR; otherwise returns STATUS_OK.
+ */
+static int set_icmpv4_header(struct icmpv4 *icmpv4, u8 type, u8 code,
+			     s64 mtu, u16 echo_id, char **error)
+{
+	icmpv4->type = type;
+	icmpv4->code = code;
+	icmpv4->checksum = htons(0);
+
+	if (mtu >= 0) {
+		const bool frag_needed = (type == ICMP_DEST_UNREACH &&
+					  code == ICMP_FRAG_NEEDED);
+
+		if (!frag_needed) {
+			asprintf(error,
+				 "ICMPv4 MTU is only valid for "
+				 "unreachable-frag_needed");
+			return STATUS_ERR;
+		}
+		if (!is_valid_u16(mtu)) {
+			asprintf(error, "ICMPv4 MTU out of 16-bit range");
+			return STATUS_ERR;
+		}
+		icmpv4->message.frag.mtu = htons(mtu);
+	}
+
+	if (echo_id > 0)
+		icmpv4->message.echo.id = htons(echo_id);
+
+	return STATUS_OK;
+}
+
+/* Write type, code and a zero checksum into an ICMPv6 header, plus the
+ * optional MTU and echo identifier.
+ *
+ * A non-negative 'mtu' is accepted only for packet_too_big with code 0
+ * and must fit in 32 bits; a negative 'mtu' means "no MTU". The echo
+ * identifier is stored only when 'echo_id' is non-zero. On failure,
+ * fills in *error and returns STATUS_ERR; otherwise returns STATUS_OK.
+ */
+static int set_icmpv6_header(struct icmpv6 *icmpv6, u8 type, u8 code,
+			     s64 mtu, u16 echo_id, char **error)
+{
+	icmpv6->type = type;
+	icmpv6->code = code;
+	icmpv6->checksum = htons(0);
+
+	if (mtu >= 0) {
+		const bool too_big = (type == ICMPV6_PKT_TOOBIG && code == 0);
+
+		if (!too_big) {
+			asprintf(error,
+				 "ICMPv6 MTU is only valid for "
+				 "packet_too_big-0");
+			return STATUS_ERR;
+		}
+		if (!is_valid_u32(mtu)) {
+			asprintf(error, "ICMPv6 MTU out of 32-bit range");
+			return STATUS_ERR;
+		}
+		icmpv6->message.packet_too_big.mtu = htonl(mtu);
+	}
+
+	if (echo_id > 0)
+		icmpv6->message.u_echo.identifier = htons(echo_id);
+
+	return STATUS_OK;
+}
+
+/* Register the ICMP header located at 'icmp' with 'packet' and fill in
+ * its fields.
+ *
+ * Records the header pointer in packet->icmpv4 or packet->icmpv6
+ * (asserting that the other one is unset), appends a matching header
+ * entry whose total_bytes is 'icmp_bytes', and then delegates to the
+ * family-specific setter. Returns that setter's status, or STATUS_ERR
+ * for an unsupported address family.
+ */
+static int set_packet_icmp_header(struct packet *packet, void *icmp,
+				  int address_family, int icmp_bytes,
+				  u8 type, u8 code, s64 mtu, u16 echo_id,
+				  char **error)
+{
+	struct header *entry = NULL;
+
+	switch (address_family) {
+	case AF_INET: {
+		struct icmpv4 *v4 = (struct icmpv4 *) icmp;
+
+		packet->icmpv4 = v4;
+		assert(packet->icmpv6 == NULL);
+		entry = packet_append_header(packet, HEADER_ICMPV4,
+					     sizeof(*v4));
+		entry->total_bytes = icmp_bytes;
+		return set_icmpv4_header(v4, type, code, mtu, echo_id, error);
+	}
+	case AF_INET6: {
+		struct icmpv6 *v6 = (struct icmpv6 *) icmp;
+
+		packet->icmpv6 = v6;
+		assert(packet->icmpv4 == NULL);
+		entry = packet_append_header(packet, HEADER_ICMPV6,
+					     sizeof(*v6));
+		entry->total_bytes = icmp_bytes;
+		return set_icmpv6_header(v6, type, code, mtu, echo_id, error);
+	}
+	default:
+		assert(!"bad ip_version in config");
+		break;
+	}
+	return STATUS_ERR;
+}
+
+/* Parse the given ICMP type and code strings, and fill in the
+ * *type and *code with the results.
+ *
+ * A type is either "type_<number>" or a name from the type table of
+ * the given address family. A code is either missing (NULL, meaning
+ * code 0), "code_<number>", or a name from the code table belonging to
+ * the named type; a numeric type has no code table, so it only accepts
+ * a missing or numeric code.
+ *
+ * Both outputs are reset to -1 ("unknown so far") before parsing, so
+ * the result does not depend on what the caller stored in them.
+ *
+ * If there is an error during parsing (unknown name, value outside
+ * 0..255, unsupported address family), fill in *error and return
+ * STATUS_ERR; otherwise return STATUS_OK.
+ */
+static int parse_icmp_type_and_code(int address_family,
+				    const char *type_string,
+				    const char *code_string,
+				    s32 *type, s32 *code, char **error)
+{
+	int i = 0;
+	const struct icmp_type_info *icmp_types = NULL;
+	const struct icmp_code_info *code_table = NULL;	/* for this type */
+
+	/* Invalid on purpose: an unmatched name must fail validation. */
+	*type = -1;
+	*code = -1;
+
+	if (address_family == AF_INET)
+		icmp_types = icmpv4_types;
+	else if (address_family == AF_INET6)
+		icmp_types = icmpv6_types;
+	else
+		assert(!"bad ip_version in config");
+
+	/* Only reachable when assertions are compiled out. */
+	if (icmp_types == NULL) {
+		asprintf(error, "bad address family %d", address_family);
+		return STATUS_ERR;
+	}
+
+	/* Parse the type string. */
+	if (sscanf(type_string, "type_%d", type) == 1) {
+		/* Legal but non-standard type in tcpdump-inspired notation. */
+	} else {
+		/* Look in our table of known types. */
+		for (i = 0; icmp_types[i].type_string != NULL; ++i) {
+			if (!strcmp(type_string, icmp_types[i].type_string)) {
+				*type = icmp_types[i].type_byte;
+				code_table = icmp_types[i].code_table;
+				break;
+			}
+		}
+	}
+	if (!is_valid_u8(*type)) {
+		asprintf(error, "bad ICMP type %s", type_string);
+		return STATUS_ERR;
+	}
+
+	/* Parse the code string. */
+	if (code_string == NULL) {
+		*code = 0;		/* missing code means code = 0 */
+	} else if (sscanf(code_string, "code_%d", code) == 1) {
+		/* Legal but non-standard code in tcpdump-inspired notation. */
+	} else if (code_table != NULL) {
+		/* Look in our table of known codes. */
+		for (i = 0; code_table[i].code_string != NULL; ++i) {
+			if (!strcmp(code_string, code_table[i].code_string)) {
+				*code = code_table[i].code_byte;
+				break;
+			}
+		}
+	}
+	if (!is_valid_u8(*code)) {
+		asprintf(error, "bad ICMP code %s", code_string);
+		return STATUS_ERR;
+	}
+
+	return STATUS_OK;
+}
+
+/* Build a new packet holding an IP header followed by an ICMP message.
+ *
+ * For IPPROTO_TCP and IPPROTO_UDP the ICMP header is followed by an
+ * echoed IP header plus ICMP_ECHO_BYTES of the echoed layer-4 header.
+ * For IPPROTO_RAW nothing is echoed and the ICMP part is exactly
+ * 'payload_bytes' long. For any other protocol all lengths stay 0, so
+ * the call fails the icmp_bytes sanity check below.
+ *
+ * Returns the new packet, or NULL with *error filled in.
+ */
+struct packet *new_icmp_packet(int address_family,
+ enum direction_t direction,
+ const char *type_string,
+ const char *code_string,
+ int protocol,
+ u32 tcp_start_sequence,
+ u32 payload_bytes,
+ struct ip_info ip_info,
+ s64 mtu,
+ s64 echo_id,
+ char **error)
+{
+ s32 type = -1; /* bad type; means "unknown so far" */
+ s32 code = -1; /* bad code; means "unknown so far" */
+
+ struct packet *packet = NULL; /* the newly-allocated result packet */
+ /* Calculate lengths in bytes of all sections of the packet.
+ * For TCP/UDP, for now we only support the most common ICMP message
+ * format, which includes at the end the original outgoing IP
+ * header and the first 8 bytes after that (which will
+ * typically have the port info needed to demux the message).
+ * For RAW, we pad the icmp packet with 0 and the total length is
+ * payload_bytes.
+ */
+ const int ip_fixed_bytes = ip_header_min_len(address_family);
+ const int ip_option_bytes = 0;
+ const int ip_header_bytes = ip_fixed_bytes + ip_option_bytes;
+ int echoed_bytes = 0;
+ int icmp_bytes = 0;
+ int ip_bytes = 0;
+
+ if (protocol == IPPROTO_TCP || protocol == IPPROTO_UDP) {
+ echoed_bytes = ip_fixed_bytes + ICMP_ECHO_BYTES;
+ icmp_bytes = icmp_header_len(address_family) + echoed_bytes;
+ ip_bytes = ip_header_bytes + icmp_bytes;
+ } else if (protocol == IPPROTO_RAW) {
+ echoed_bytes = 0;
+ icmp_bytes = payload_bytes;
+ ip_bytes = ip_header_bytes + payload_bytes;
+ }
+
+ /* Sanity-check on echo_id to make sure it fits in u16 */
+ if (echo_id < 0 || echo_id > 65535) {
+ asprintf(error,
+ "invalid echo_id, must be between 0 and 65535");
+ goto error_out;
+ }
+
+ /* Sanity-check all the various lengths */
+ /* ip_option_bytes is the constant 0 above, so this cannot fire today. */
+ if (ip_option_bytes & 0x3) {
+ asprintf(error, "IP options are not padded correctly "
+ "to ensure IP header is a multiple of 4 bytes: "
+ "%d excess bytes", ip_option_bytes & 0x3);
+ goto error_out;
+ }
+ assert((ip_header_bytes & 0x3) == 0);
+ /* Also rejects unsupported protocols, for which icmp_bytes is 0. */
+ if (icmp_bytes < icmp_header_len(address_family)) {
+ asprintf(error, "icmp_bytes %d smaller than icmp header "
+ "length %d",
+ icmp_bytes, icmp_header_len(address_family));
+ goto error_out;
+ }
+
+
+ /* Parse the ICMP type and code */
+ if (parse_icmp_type_and_code(address_family, type_string, code_string,
+ &type, &code, error))
+ goto error_out;
+ assert(is_valid_u8(type));
+ assert(is_valid_u8(code));
+
+ /* Allocate and zero out a packet object of the desired size */
+ /* NOTE(review): the result of packet_new() is used unchecked --
+ * confirm that it cannot return NULL.
+ */
+ packet = packet_new(ip_bytes);
+ memset(packet->buffer, 0, ip_bytes);
+
+ packet->direction = direction;
+ packet->flags = 0;
+ packet->tos_chk = ip_info.tos.check;
+
+ /* Set IP header fields */
+ set_packet_ip_header(packet, address_family, ip_bytes, ip_info.tos.value,
+ ip_info.flow_label, ip_info.ttl,
+ icmp_protocol(address_family));
+
+ /* Find the start of the ICMP header and then populate common fields. */
+ void *icmp_header = ip_start(packet) + ip_header_bytes;
+ if (set_packet_icmp_header(packet, icmp_header, address_family,
+ icmp_bytes, type, code, mtu, echo_id, error))
+ goto error_out;
+
+ /* All ICMP message types currently supported by this tool
+ * include a copy of the outbound IP header and the first few
+ * bytes inside. To ensure that the inbound ICMP message gets
+ * demuxed to the correct socket in the kernel, here we
+ * construct enough of a basic IP header and during test
+ * execution we fill in the port numbers and (if specified)
+ * TCP sequence number in the TCP header.
+ */
+ if (echoed_bytes) {
+ u8 *echoed_ip = packet_echoed_ip_header(packet);
+ /* Length advertised by the echoed IP header: that of the
+ * original packet, not just of the bytes echoed here.
+ */
+ const int echoed_ip_bytes = (ip_fixed_bytes +
+ layer4_header_len(protocol) +
+ payload_bytes);
+ set_ip_header(echoed_ip, address_family, echoed_ip_bytes,
+ 0, 0, 0, protocol);
+ if (protocol == IPPROTO_TCP) {
+ u32 *seq = packet_echoed_tcp_seq(packet);
+ *seq = htonl(tcp_start_sequence);
+ }
+ packet->echoed_header = true;
+ } else
+ packet->echoed_header = false;
+
+ packet->ip_bytes = ip_bytes;
+ return packet;
+
+error_out:
+ /* Reached both before and after allocation, hence the NULL check. */
+ if (packet != NULL)
+ packet_free(packet);
+ return NULL;
+}
diff --git a/test/packetdrill/icmp_packet.h b/test/packetdrill/icmp_packet.h
new file mode 100644
index 0000000..020e5bc
--- /dev/null
+++ b/test/packetdrill/icmp_packet.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Interface for module for formatting ICMP packets.
+ */
+
+#ifndef __ICMP_PACKET_H__
+#define __ICMP_PACKET_H__
+
+#include "types.h"
+
+#include "packet.h"
+
+/* Create and initialize a new struct packet containing an ICMP
+ * packet. The 'type_string' identifies the ICMP type. The
+ * 'code_string' identifies the ICMP code (and NULL means no code was
+ * provided, in which case we assume a default code of 0).
+ * The 'protocol' is IPPROTO_UDP or IPPROTO_TCP for a message that
+ * echoes the start of a UDP or TCP packet, or IPPROTO_RAW for a
+ * message with no echoed packet whose ICMP part is 'payload_bytes'
+ * long. The 'tcp_start_sequence' and 'payload_bytes' describe the TCP
+ * or UDP packet echoed inside the ICMP message. The 'mtu' specifies
+ * the MTU advertised in "packet is too big" ICMP message, or -1 for no
+ * MTU. The 'echo_id' must be between 0 and 65535; it is written into
+ * the header only when it is non-zero. The 'direction' and the
+ * TOS/flow label/TTL in 'ip_info' are applied to the new packet and
+ * its IP header. On success, returns a newly-allocated packet. On
+ * failure, returns NULL and fills in *error with an error message.
+ */
+extern struct packet *new_icmp_packet(int address_family,
+ enum direction_t direction,
+ const char *type_string,
+ const char *code_string,
+ int protocol,
+ u32 tcp_start_sequence,
+ u32 payload_bytes,
+ struct ip_info ip_info,
+ s64 mtu,
+ s64 echo_id,
+ char **error);
+
+#endif /* __ICMP_PACKET_H__ */
diff --git a/test/packetdrill/icmpv6.h b/test/packetdrill/icmpv6.h
new file mode 100644
index 0000000..047f90d
--- /dev/null
+++ b/test/packetdrill/icmpv6.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Our own ICMPv6 header declarations, so we have something that's
+ * portable and somewhat more readable than a typical system header
+ * file.
+ */
+
+#ifndef __ICMPV6_HEADERS_H__
+#define __ICMPV6_HEADERS_H__
+
+#include "types.h"
+
+/* ICMPv6 header. See RFC 4443. The fixed type/code/checksum fields are
+ * followed by a 4-byte field whose meaning depends on the message.
+ */
+struct icmpv6 {
+ __u8 type; /* one of the ICMPV6_* type values */
+ __u8 code; /* type-specific code */
+ __sum16 checksum;
+ union {
+ struct {
+ __be32 unused;
+ } unreachable;
+ struct {
+ __be32 mtu;
+ } packet_too_big;
+ struct {
+ __be32 unused;
+ } time_exceeded;
+ struct {
+ __be32 pointer;
+ } parameter_problem;
+ struct icmpv6_echo {
+ __be16 identifier;
+ __be16 sequence;
+ } u_echo;
+ } message;
+};
+
+/* Supported ICMPv6 types (values for the 'type' byte of struct icmpv6) */
+#define ICMPV6_DEST_UNREACH 1
+#define ICMPV6_PKT_TOOBIG 2
+#define ICMPV6_TIME_EXCEED 3
+#define ICMPV6_PARAMPROB 4
+#define ICMPV6_ECHO_REQUEST 128
+#define ICMPV6_ECHO_REPLY 129
+
+/* Codes for ICMPV6 Destination Unreachable (ICMPV6_DEST_UNREACH) */
+#define ICMPV6_NOROUTE 0
+#define ICMPV6_ADM_PROHIBITED 1
+#define ICMPV6_NOT_NEIGHBOUR 2
+#define ICMPV6_ADDR_UNREACH 3
+#define ICMPV6_PORT_UNREACH 4
+
+/* Codes for ICMPV6 Time Exceeded (ICMPV6_TIME_EXCEED) */
+#define ICMPV6_EXC_HOPLIMIT 0
+#define ICMPV6_EXC_FRAGTIME 1
+
+/* Codes for ICMPV6 Parameter Problem (ICMPV6_PARAMPROB) */
+#define ICMPV6_HDR_FIELD 0
+#define ICMPV6_UNK_NEXTHDR 1
+#define ICMPV6_UNK_OPTION 2
+
+#endif /* __ICMPV6_HEADERS_H__ */
diff --git a/test/packetdrill/ip.h b/test/packetdrill/ip.h
new file mode 100644
index 0000000..0ffcf12
--- /dev/null
+++ b/test/packetdrill/ip.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Our own IPv4 header declarations, so we have something that's
+ * portable and somewhat more readable than a typical system header
+ * file.
+ */
+
+#ifndef __IP_HEADERS_H__
+#define __IP_HEADERS_H__
+
+#include "types.h"
+
+/* IPv4 header without options. The declaration order of the two 4-bit
+ * fields in the first byte depends on the host byte order.
+ */
+struct ipv4 {
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+ __u8 ihl:4,
+ version:4;
+#elif __BYTE_ORDER == __BIG_ENDIAN
+ __u8 version:4,
+ ihl:4;
+#else
+# error "Please fix endianness defines"
+#endif
+ __u8 tos;
+ __be16 tot_len;
+ __be16 id;
+ __be16 frag_off;
+ __u8 ttl;
+ __u8 protocol;
+ __sum16 check;
+ struct in_addr src_ip;
+ struct in_addr dst_ip;
+};
+
+/* ----------------------- IP socket option values -------------------- */
+
+/* Oddly enough, Linux distributions are typically missing even some
+ * of the older and more common IP socket options, such as IP_MTU.
+ * Note that the definitions below are not wrapped in #ifndef guards.
+ */
+#ifdef linux
+#define IP_TOS 1
+#define IP_TTL 2
+#define IP_HDRINCL 3
+#define IP_OPTIONS 4
+#define IP_ROUTER_ALERT 5
+#define IP_RECVOPTS 6
+#define IP_RETOPTS 7
+#define IP_PKTINFO 8
+#define IP_PKTOPTIONS 9
+#define IP_MTU_DISCOVER 10
+#define IP_RECVERR 11
+#define IP_RECVTTL 12
+#define IP_RECVTOS 13
+#define IP_MTU 14
+#define IP_FREEBIND 15
+#define IP_IPSEC_POLICY 16
+#define IP_XFRM_POLICY 17
+#define IP_PASSSEC 18
+#define IP_TRANSPARENT 19
+#endif /* linux */
+
+/* ECN: RFC 3168: http://tools.ietf.org/html/rfc3168 */
+#define IP_ECN_MASK 3 /* selects the two low-order bits */
+#define IP_ECN_NONE 0
+#define IP_ECN_ECT1 1
+#define IP_ECN_ECT0 2
+#define IP_ECN_CE 3
+
+/* Return the TOS byte of the given IPv4 header. */
+static inline u8 ipv4_tos_byte(const struct ipv4 *ipv4)
+{
+	const u8 tos = ipv4->tos;
+
+	return tos;
+}
+
+/* Return the TTL byte of the given IPv4 header. */
+static inline u8 ipv4_ttl_byte(const struct ipv4 *ipv4)
+{
+	const u8 ttl = ipv4->ttl;
+
+	return ttl;
+}
+
+/* Return the IPv4 header length in bytes: the 'ihl' field counts
+ * 32-bit words.
+ */
+static inline int ipv4_header_len(const struct ipv4 *ipv4)
+{
+	const unsigned int words = ipv4->ihl;
+
+	return words * sizeof(u32);
+}
+
+/* IP fragmentation bit flags */
+#define IP_RF 0x8000 /* reserved fragment flag */
+#define IP_DF 0x4000 /* don't fragment flag */
+#define IP_MF 0x2000 /* more fragments flag */
+#define IP_OFFMASK 0x1FFF /* mask for the bits below the three flags */
+
+#endif /* __IP_HEADERS_H__ */
diff --git a/test/packetdrill/ip_address.c b/test/packetdrill/ip_address.c
new file mode 100644
index 0000000..9518f17
--- /dev/null
+++ b/test/packetdrill/ip_address.c
@@ -0,0 +1,379 @@
+/*
+ * Copyright 2013-2015 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Implementation for operations for IPv4 and IPv6 addresses.
+ */
+
+#include "ip_address.h"
+
+#include <ifaddrs.h>
+#include <net/if.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include "logging.h"
+
+/* IPv6 prefix for IPv4-mapped addresses. These are in the
+ * ::FFFF:0:0/96 space, i.e. 10 bytes of 0x00 and 2 bytes of 0xFF
+ * (12 bytes in total). See RFC 4291 ("IPv6 Addressing Architecture")
+ * section 2.5.5.2 ("IPv4-Mapped IPv6 Address").
+ */
+const u8 ipv4_mapped_prefix[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF };
+
+/* Size in bytes of a raw address of the given family; calls die() for
+ * any family other than AF_INET or AF_INET6.
+ */
+int ip_address_length(int address_family)
+{
+	if (address_family == AF_INET)
+		return sizeof(struct in_addr);
+	if (address_family == AF_INET6)
+		return sizeof(struct in6_addr);
+
+	die("ip_address_length: bad address family: %d\n",
+	    address_family);
+	return 0;  /* not reached */
+}
+
+/* Size in bytes of the sockaddr struct for the given family; calls
+ * die() for any family other than AF_INET or AF_INET6.
+ */
+int sockaddr_length(int address_family)
+{
+	if (address_family == AF_INET)
+		return sizeof(struct sockaddr_in);
+	if (address_family == AF_INET6)
+		return sizeof(struct sockaddr_in6);
+
+	die("sockaddr_length: bad address family: %d\n",
+	    address_family);
+	return 0;  /* not reached */
+}
+
+/* Zero the whole ip_address and mark it as an IPv4 address. */
+static void ipv4_init(struct ip_address *ipv4)
+{
+	ip_reset(ipv4);
+	ipv4->address_family = AF_INET;
+}
+
+/* Zero the whole ip_address and mark it as an IPv6 address. */
+static void ipv6_init(struct ip_address *ipv6)
+{
+	ip_reset(ipv6);
+	ipv6->address_family = AF_INET6;
+}
+
+/* Reinitialize *ip as an IPv4 address holding a copy of *ipv4. */
+void ip_from_ipv4(const struct in_addr *ipv4, struct ip_address *ip)
+{
+	ipv4_init(ip);
+	memcpy(&ip->ip.v4, ipv4, sizeof(ip->ip.v4));
+}
+
+/* Reinitialize *ip as an IPv6 address holding a copy of *ipv6. */
+void ip_from_ipv6(const struct in6_addr *ipv6, struct ip_address *ip)
+{
+	ipv6_init(ip);
+	memcpy(&ip->ip.v6, ipv6, sizeof(ip->ip.v6));
+}
+
+/* Copy the IPv4 view of *ip into *ipv4; the family is not checked. */
+void ip_to_ipv4(const struct ip_address *ip, struct in_addr *ipv4)
+{
+	memcpy(ipv4, &ip->ip.v4, sizeof(*ipv4));
+}
+
+/* Copy the IPv6 view of *ip into *ipv6; the family is not checked. */
+void ip_to_ipv6(const struct ip_address *ip, struct in6_addr *ipv6)
+{
+	memcpy(ipv6, &ip->ip.v6, sizeof(*ipv6));
+}
+
+/* Parse a textual IPv4 address; calls die() if inet_pton() rejects it. */
+struct ip_address ipv4_parse(const char *ip_string)
+{
+	struct ip_address parsed;
+	int status;
+
+	ipv4_init(&parsed);
+	status = inet_pton(AF_INET, ip_string, &parsed.ip.v4);
+	if (status != 1)
+		die("bad IPv4 address: %s\n", ip_string);
+
+	return parsed;
+}
+
+/* Parse a textual IPv6 address; calls die() if inet_pton() rejects it. */
+struct ip_address ipv6_parse(const char *ip_string)
+{
+	struct ip_address parsed;
+	int status;
+
+	ipv6_init(&parsed);
+	status = inet_pton(AF_INET6, ip_string, &parsed.ip.v6);
+	if (status != 1)
+		die("bad IPv6 address: %s\n", ip_string);
+
+	return parsed;
+}
+
+/* Format *ip as text into 'buffer', which inet_ntop() is told holds
+ * ADDR_STR_LEN bytes, and return 'buffer'. Calls die_perror() if
+ * inet_ntop() fails.
+ */
+const char *ip_to_string(const struct ip_address *ip, char *buffer)
+{
+	const char *text = inet_ntop(ip->address_family, &ip->ip,
+				     buffer, ADDR_STR_LEN);
+
+	if (text == NULL)
+		die_perror("inet_ntop");
+
+	return buffer;
+}
+
+/* Build the IPv4-mapped IPv6 address for the given IPv4 address: the
+ * ipv4_mapped_prefix bytes followed by the IPv4 address bytes.
+ */
+struct ip_address ipv6_map_from_ipv4(const struct ip_address ipv4)
+{
+	const size_t prefix_bytes = sizeof(ipv4_mapped_prefix);
+	struct ip_address mapped;
+
+	ipv6_init(&mapped);
+
+	assert(sizeof(ipv4.ip.v4) + prefix_bytes == sizeof(mapped.ip.v6));
+	memcpy(mapped.ip.v6.s6_addr, ipv4_mapped_prefix, prefix_bytes);
+	memcpy(mapped.ip.v6.s6_addr + prefix_bytes,
+	       &ipv4.ip.v4, sizeof(ipv4.ip.v4));
+	return mapped;
+}
+
+/* If 'ipv6' starts with ipv4_mapped_prefix, store the embedded IPv4
+ * address in *ipv4 and return STATUS_OK. Otherwise return STATUS_ERR
+ * and leave *ipv4 untouched.
+ */
+int ipv6_map_to_ipv4(const struct ip_address ipv6, struct ip_address *ipv4)
+{
+	const size_t prefix_bytes = sizeof(ipv4_mapped_prefix);
+
+	if (memcmp(&ipv6.ip.v6.s6_addr, ipv4_mapped_prefix, prefix_bytes) != 0)
+		return STATUS_ERR;
+
+	ipv4_init(ipv4);
+	memcpy(&ipv4->ip.v4, ipv6.ip.v6.s6_addr + prefix_bytes,
+	       sizeof(ipv4->ip.v4));
+	return STATUS_OK;
+}
+
+/* Build a sockaddr_in from the given IPv4 address and host-order port,
+ * copy it to *address and store its size in *length.
+ */
+static void ipv4_to_sockaddr(const struct ip_address *ipv4, u16 port,
+			     struct sockaddr *address, socklen_t *length)
+{
+	struct sockaddr_in sin;
+
+	memset(&sin, 0, sizeof(sin));
+	sin.sin_family = AF_INET;
+	sin.sin_port = htons(port);
+	sin.sin_addr = ipv4->ip.v4;
+#ifndef linux
+	sin.sin_len = sizeof(sin);
+#endif
+
+	*length = sizeof(sin);
+	memcpy(address, &sin, sizeof(sin));
+}
+
+/* Build a sockaddr_in6 from the given IPv6 address and host-order
+ * port, copy it to *address and store its size in *length.
+ */
+static void ipv6_to_sockaddr(const struct ip_address *ipv6, u16 port,
+			     struct sockaddr *address, socklen_t *length)
+{
+	struct sockaddr_in6 sin6;
+
+	memset(&sin6, 0, sizeof(sin6));
+	sin6.sin6_family = AF_INET6;
+	sin6.sin6_port = htons(port);
+	sin6.sin6_addr = ipv6->ip.v6;
+#ifndef linux
+	sin6.sin6_len = sizeof(sin6);
+#endif
+
+	*length = sizeof(sin6);
+	memcpy(address, &sin6, sizeof(sin6));
+}
+
+/* Fill in *address and *length from the given IP address and port,
+ * dispatching on the address family; calls die() for an unknown family.
+ */
+void ip_to_sockaddr(const struct ip_address *ip, u16 port,
+		    struct sockaddr *address, socklen_t *length)
+{
+	const int family = ip->address_family;
+
+	if (family == AF_INET)
+		ipv4_to_sockaddr(ip, port, address, length);
+	else if (family == AF_INET6)
+		ipv6_to_sockaddr(ip, port, address, length);
+	else
+		die("ip_to_sockaddr: bad address family: %d\n",
+		    ip->address_family);
+}
+
+/* Read the IPv4 address and the port (converted to host order) out of
+ * an AF_INET sockaddr. Asserts that the family and length match
+ * struct sockaddr_in.
+ */
+static void ipv4_from_sockaddr(const struct sockaddr *address, socklen_t length,
+			       struct ip_address *ipv4, u16 *port)
+{
+	struct sockaddr_in sin;
+
+	assert(address->sa_family == AF_INET);
+	ipv4_init(ipv4);
+
+	assert(length == sizeof(sin));
+	memcpy(&sin, address, length);  /* to avoid aliasing issues */
+	memcpy(&ipv4->ip.v4, &sin.sin_addr, sizeof(ipv4->ip.v4));
+	*port = ntohs(sin.sin_port);
+}
+
+/* Extract the IPv6 address and port from the given sockaddr.
+ *
+ * 'address' must be an AF_INET6 sockaddr and 'length' must equal
+ * sizeof(struct sockaddr_in6); both are asserted. *ipv6 is
+ * reinitialized as an IPv6 address and *port receives the port in
+ * host byte order.
+ */
+static void ipv6_from_sockaddr(const struct sockaddr *address, socklen_t length,
+			       struct ip_address *ipv6, u16 *port)
+{
+	assert(address->sa_family == AF_INET6);
+	ipv6_init(ipv6);
+
+	struct sockaddr_in6 sa_v6;
+	assert(length == sizeof(sa_v6));
+	memcpy(&sa_v6, address, length);   /* to avoid aliasing issues */
+	ipv6->ip.v6 = sa_v6.sin6_addr;
+	*port = ntohs(sa_v6.sin6_port);
+}
+
+/* Extract the IP address and port from the given sockaddr, dispatching
+ * on its sa_family; calls die() for an unknown family.
+ */
+void ip_from_sockaddr(const struct sockaddr *address, socklen_t length,
+		      struct ip_address *ip, u16 *port)
+{
+	const int family = address->sa_family;
+
+	if (family == AF_INET)
+		ipv4_from_sockaddr(address, length, ip, port);
+	else if (family == AF_INET6)
+		ipv6_from_sockaddr(address, length, ip, port);
+	else
+		die("ip_from_sockaddr: bad address family: %d\n",
+		    address->sa_family);
+}
+
+/* Look for a local interface configured with the given IP address.
+ *
+ * Walks the getifaddrs() list and compares every address of the same
+ * family with 'ip'. On the first match the interface name is copied to
+ * 'dev_name', which must have room for IFNAMSIZ bytes, and true is
+ * returned. If no interface matches, false is returned and 'dev_name'
+ * is left untouched. Calls die_perror() if getifaddrs() fails.
+ */
+int get_ip_device(const struct ip_address *ip, char *dev_name)
+{
+	struct ifaddrs *ifaddr_list, *ifaddr;
+	bool is_local = false;
+
+	if (getifaddrs(&ifaddr_list))
+		die_perror("getifaddrs");
+
+	for (ifaddr = ifaddr_list; ifaddr != NULL; ifaddr = ifaddr->ifa_next) {
+		int family;
+		struct ip_address interface_ip;
+		u16 port;
+		size_t name_len;
+
+		if (ifaddr->ifa_addr == NULL)
+			continue;
+
+		family = ifaddr->ifa_addr->sa_family;
+		if (family != ip->address_family)
+			continue;
+
+		ip_from_sockaddr(ifaddr->ifa_addr, sockaddr_length(family),
+				 &interface_ip, &port);
+		if (!is_equal_ip(ip, &interface_ip))
+			continue;
+
+		assert(ifaddr->ifa_name);
+		name_len = strlen(ifaddr->ifa_name);
+		assert(name_len < IFNAMSIZ);
+		/* Stay within dev_name even when assertions are
+		 * compiled out.
+		 */
+		if (name_len >= IFNAMSIZ)
+			name_len = IFNAMSIZ - 1;
+		memcpy(dev_name, ifaddr->ifa_name, name_len);
+		dev_name[name_len] = '\0';
+		is_local = true;
+		break;
+	}
+
+	freeifaddrs(ifaddr_list);
+
+	return is_local;
+}
+
+/* Return whether some local interface has the given IP address; the
+ * interface name found by get_ip_device() is discarded.
+ */
+int is_ip_local(const struct ip_address *ip)
+{
+	char ignored_name[IFNAMSIZ];
+	const int found = get_ip_device(ip, ignored_name);
+
+	return found;
+}
+
+/* Convert a dotted-quad IPv4 netmask string to a prefix length.
+ *
+ * Counts the consecutive one bits starting at the most significant bit
+ * of the mask and stops at the first zero bit, so "255.255.255.0"
+ * yields 24. Any bits set after the first zero bit are ignored.
+ * ipv4_parse() calls die() if the string is not a valid IPv4 address.
+ */
+int netmask_to_prefix(const char *netmask)
+{
+	int pos;
+	struct ip_address mask = ipv4_parse(netmask);
+	u32 mask_addr = ntohl(mask.ip.v4.s_addr);
+	int prefix_len = 0;
+
+	for (pos = 31; pos >= 0; --pos) {
+		/* Unsigned constant: 1 << 31 would overflow a signed int. */
+		if (!(mask_addr & (1U << pos)))
+			break;
+		++prefix_len;
+	}
+	return prefix_len;
+}
+
+/* Read up to 'sz' bytes from /dev/urandom into 'buffer'.
+ *
+ * The device is opened on first use and the descriptor is kept open
+ * for later calls. Returns the result of read(), which may be less
+ * than 'sz', or -1 if the device cannot be opened; a failed open is
+ * retried on the next call.
+ */
+static int urandom_read(void *buffer, int sz)
+{
+	static int fd_urandom = -1;
+
+	if (fd_urandom == -1) {
+		fd_urandom = open("/dev/urandom", O_RDONLY);
+		/* Fail here instead of handing fd -1 to read(). */
+		if (fd_urandom == -1)
+			return -1;
+	}
+	return read(fd_urandom, buffer, sz);
+}
+
+/* Write to 'result' the textual form of 'base' with random bits ORed
+ * in.
+ *
+ * Nothing is randomized when the prefix derived from 'netmask' is 31
+ * or 32 bits long, or when a full random word cannot be read. For a
+ * non-zero prefix length the random value is limited to the host part
+ * and kept away from 0, 1 and the all-ones value; for a zero prefix
+ * length the whole random word is used as is.
+ */
+void generate_random_ipv4_addr(char *result, const char *base,
+			       const char *netmask)
+{
+	int prefix_len = netmask_to_prefix(netmask);
+	struct ip_address addr = ipv4_parse(base);
+	unsigned int rnd;
+
+	if (prefix_len < 31 &&
+	    urandom_read(&rnd, sizeof(rnd)) == sizeof(rnd)) {
+		if (prefix_len) {
+			const u32 host_mask = (1U << (32 - prefix_len)) - 1;
+
+			rnd &= host_mask;
+			/* .0 is reserved for network address.
+			 * .1 is reserved for the gateway
+			 */
+			rnd = (rnd < 2) ? 2 : rnd;
+			/* .255.255 is reserved for net broadcast */
+			if (rnd == host_mask)
+				rnd--;
+		}
+		addr.ip.v4.s_addr |= htonl(rnd);
+	}
+
+	ip_to_string(&addr, result);
+}
+
+/* In this version, we randomize last 32bits (or less) of the address.
+ * There is no need to fully use RFC 4193 range.
+ * ( fd3d:fa7b:d17d::/48 in unique local address space )
+ *
+ * Writes to 'result' the textual form of 'base' with random bits ORed
+ * into its last 32 bits. When 'prefixlen' is longer than 96, only the
+ * 128 - prefixlen low-order bits are randomized. The random value is
+ * kept away from 0 and from the all-ones value of the mask. A
+ * 'prefixlen' above 128 is rejected with die(), because it would
+ * otherwise produce a negative shift count.
+ */
+void generate_random_ipv6_addr(char *result, const char *base, int prefixlen)
+{
+	struct ip_address addr = ipv6_parse(base);
+	unsigned int mask = ~0U, rnd = 0;
+
+	if (prefixlen > 128)
+		die("generate_random_ipv6_addr: bad prefix length: %d\n",
+		    prefixlen);
+
+	/* On a failed read rnd keeps its initial 0 and is fixed up below. */
+	urandom_read(&rnd, sizeof(rnd));
+	if (prefixlen > 128 - 32) {
+		mask = (1U << (128 - prefixlen)) - 1;
+		rnd &= mask;
+	}
+	if (!rnd)
+		rnd++;
+	if (rnd == mask)
+		rnd--;
+	addr.ip.v6.s6_addr32[3] |= htonl(rnd);
+	ip_to_string(&addr, result);
+}
diff --git a/test/packetdrill/ip_address.h b/test/packetdrill/ip_address.h
new file mode 100644
index 0000000..6ee586b
--- /dev/null
+++ b/test/packetdrill/ip_address.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Types and operations for IPv4 and IPv6 addresses.
+ */
+
+#ifndef __IP_ADDRESS_H__
+#define __IP_ADDRESS_H__
+
+#include "types.h"
+
+#include <netinet/in.h>
+
+/* IPv4 or IPv6 address. The union members overlay the same storage, so
+ * 'bytes' gives byte-wise access to whichever address is stored.
+ */
+struct ip_address {
+ int address_family; /* AF_INET or AF_INET6 */
+ union {
+ struct in_addr v4;
+ struct in6_addr v6;
+ u8 bytes[16];
+ } ip; /* IP address (network order) */
+};
+
+/* Clear every byte of *ip, including the address family. */
+static inline void ip_reset(struct ip_address *ip)
+{
+	const size_t nbytes = sizeof(struct ip_address);
+
+	memset(ip, 0, nbytes);
+}
+
+/* Fill in an ip_address using the given family-specific struct. */
+extern void ip_from_ipv4(const struct in_addr *ipv4, struct ip_address *ip);
+extern void ip_from_ipv6(const struct in6_addr *ipv6, struct ip_address *ip);
+
+/* Fill in the given family-specific struct using the given ip_address. */
+extern void ip_to_ipv4(const struct ip_address *ip, struct in_addr *ipv4);
+extern void ip_to_ipv6(const struct ip_address *ip, struct in6_addr *ipv6);
+
+/* Return the number of bytes in the on-the-wire representation of
+ * addresses of the given family.
+ */
+extern int ip_address_length(int address_family);
+
+/* Return the number of bytes in sockaddr of the given family. */
+extern int sockaddr_length(int address_family);
+
+/* Return true iff both addresses have the same family and the same
+ * address bytes for that family.
+ */
+static inline bool is_equal_ip(const struct ip_address *a,
+			       const struct ip_address *b)
+{
+	const bool same_family = (a->address_family == b->address_family);
+
+	/* Different families never match; the bytes are not compared. */
+	if (!same_family)
+		return same_family;
+
+	return memcmp(&a->ip, &b->ip,
+		      ip_address_length(a->address_family)) == 0;
+}
+
+/* Parse a human-readable IPv4 address and return it. Print an error
+ * to stderr and exit if there is an error parsing the address.
+ */
+extern struct ip_address ipv4_parse(const char *ip_string);
+
+/* Parse a human-readable IPv6 address and return it. Print an error
+ * to stderr and exit if there is an error parsing the address.
+ */
+extern struct ip_address ipv6_parse(const char *ip_string);
+
+/* Print a human-readable representation of the given IP address in the
+ * given buffer, which must be at least ADDR_STR_LEN bytes long.
+ * Returns a pointer to the given buffer.
+ */
+extern const char *ip_to_string(const struct ip_address *ip, char *buffer);
+
+/* Create an IPv4-mapped IPv6 address. */
+extern struct ip_address ipv6_map_from_ipv4(const struct ip_address ipv4);
+
+/* Deconstruct an IPv4-mapped IPv6 address and fill in *ipv4 with the
+ * IPv4 address that was mapped into IPv6 space. Return STATUS_OK on
+ * success, or STATUS_ERR on failure (meaning the input ipv6 was not
+ * actually an IPv4-mapped IPv6 address).
+ */
+extern int ipv6_map_to_ipv4(const struct ip_address ipv6,
+ struct ip_address *ipv4);
+
+/* Fill in a sockaddr struct and socklen_t using the given IP and port.
+ * The IP address may be IPv4 or IPv6.
+ */
+extern void ip_to_sockaddr(const struct ip_address *ip, u16 port,
+ struct sockaddr *address, socklen_t *length);
+
+/* Fill in an IP address and port by parsing a sockaddr struct and
+ * socklen_t using the given IP and port. The IP address may be IPv4
+ * or IPv6. Exits with an error message if the address family is other
+ * than AF_INET or AF_INET6.
+ */
+extern void ip_from_sockaddr(const struct sockaddr *address, socklen_t length,
+ struct ip_address *ip, u16 *port);
+
+/* Return true iff the address is that of a local interface. */
+/* Note: this should return bool, but that doesn't compile on NetBSD. */
+extern int is_ip_local(const struct ip_address *ip);
+
+/* Fill in the name of the device configured with the given IP, if
+ * any. The dev_name buffer should be at least IFNAMSIZ bytes.
+ * Return true iff the IP is found on a local device.
+ */
+/* Note: this should return bool, but that doesn't compile on NetBSD. */
+extern int get_ip_device(const struct ip_address *ip, char *dev_name);
+
+/* Convert dotted decimal netmask to equivalent CIDR prefix length */
+extern int netmask_to_prefix(const char *netmask);
+
+/* Write to 'result' the string form of the IPv4 address 'base' with random
+ * host bits OR-ed in. 'netmask' is a dotted-decimal mask; its prefix length
+ * decides how many low-order bits are randomized. 'result' is filled in by
+ * ip_to_string(), so it must be at least ADDR_STR_LEN bytes long.
+ */
+void generate_random_ipv4_addr(char *result, const char *base,
+ const char *netmask);
+
+/* Write to 'result' the string form of the IPv6 address 'base' with random
+ * bits OR-ed into its last 32-bit word; fewer bits are randomized when
+ * 'prefixlen' is longer than 96. 'result' is filled in by ip_to_string(),
+ * so it must be at least ADDR_STR_LEN bytes long.
+ */
+void generate_random_ipv6_addr(char *result, const char *base, int prefixlen);
+
+#endif /* __IP_ADDRESS_H__ */
diff --git a/test/packetdrill/ip_packet.c b/test/packetdrill/ip_packet.c
new file mode 100644
index 0000000..4a68ea6
--- /dev/null
+++ b/test/packetdrill/ip_packet.c
@@ -0,0 +1,221 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Implementation for module for formatting IPv4 and IPv6 packets.
+ */
+
+#include "ip_packet.h"
+
+#include "checksum.h"
+#include "ip.h"
+#include "ipv6.h"
+
+/* Fill in IPv4 header fields. Length, TOS, TTL and protocol come from the
+ * arguments; id, fragment offset and checksum are zeroed and both addresses
+ * are set to in4addr_any. A ttl of 0 is stored as TTL_CHECK_NONE.
+ */
+static void set_ipv4_header(struct ipv4 *ipv4,
+			    u16 ip_bytes, u8 tos,
+			    u8 ttl, u8 protocol)
+{
+	const u8 effective_ttl = ttl ? ttl : TTL_CHECK_NONE;
+
+	ipv4->version = 4;
+	ipv4->ihl = sizeof(struct ipv4) / sizeof(u32);
+	ipv4->tos = tos;
+	ipv4->tot_len = htons(ip_bytes);
+
+	ipv4->id = 0;
+	ipv4->frag_off = 0;
+	ipv4->check = 0;
+
+	ipv4->ttl = effective_ttl;
+	ipv4->protocol = protocol;
+
+	ipv4->src_ip = in4addr_any;
+	ipv4->dst_ip = in4addr_any;
+}
+
+/* Fill in IPv6 header fields. 'ip_bytes' is the whole packet length
+ * including this header; only the part after the header is stored in
+ * payload_len. A hop_limit of 0 is stored as TTL_CHECK_NONE, and both
+ * addresses are set to in6addr_any.
+ */
+static void set_ipv6_header(struct ipv6 *ipv6,
+			    u16 ip_bytes,
+			    u8 tos, u32 flow_label,
+			    u8 hop_limit, u8 protocol)
+{
+	const u8 effective_hop_limit = hop_limit ? hop_limit : TTL_CHECK_NONE;
+
+	ipv6->version = 6;
+
+	/* The traffic class byte is split into two 4-bit fields. */
+	ipv6->traffic_class_hi = tos >> 4;
+	ipv6->traffic_class_lo = tos & 0x0f;
+
+	/* The flow label is split into its top 4 and low 16 bits. */
+	ipv6->flow_label_hi = (flow_label >> 16) & 0xf;
+	ipv6->flow_label_lo = htons(flow_label & 0xffff);
+
+	assert(ip_bytes >= sizeof(*ipv6));
+	ipv6->payload_len = htons(ip_bytes - sizeof(*ipv6));
+
+	ipv6->next_header = protocol;
+	ipv6->hop_limit = effective_hop_limit;
+
+	ipv6->src_ip = in6addr_any;
+	ipv6->dst_ip = in6addr_any;
+}
+
+/* Fill in the IP header at 'ip_header' with the setter that matches
+ * 'address_family'. 'flowlabel' is only used for AF_INET6, where 'ttl'
+ * is passed on as the hop limit. Any other family trips an assertion.
+ */
+void set_ip_header(void *ip_header,
+		   int address_family,
+		   u16 ip_bytes,
+		   u8 tos, u32 flowlabel,
+		   u8 ttl, u8 protocol)
+{
+	switch (address_family) {
+	case AF_INET:
+		set_ipv4_header(ip_header, ip_bytes, tos, ttl, protocol);
+		break;
+	case AF_INET6:
+		set_ipv6_header(ip_header, ip_bytes, tos, flowlabel,
+				ttl, protocol);
+		break;
+	default:
+		assert(!"bad ip_version in config");
+		break;
+	}
+}
+
+/* Treat the start of packet->buffer as the packet's IP header: record it
+ * in packet->ipv4 or packet->ipv6, append a matching header entry whose
+ * total_bytes is 'ip_bytes', and fill in the header fields. Asserts that
+ * the packet does not already have an IP header of the other family and
+ * that the header entry could be appended.
+ */
+void set_packet_ip_header(struct packet *packet,
+			  int address_family,
+			  u16 ip_bytes,
+			  u8 tos, u32 flowlabel,
+			  u8 ttl, u8 protocol)
+{
+	struct header *ip_header = NULL;
+
+	if (address_family == AF_INET) {
+		struct ipv4 *ipv4 = (struct ipv4 *) packet->buffer;
+		packet->ipv4 = ipv4;
+		assert(packet->ipv6 == NULL);
+		ip_header = packet_append_header(packet, HEADER_IPV4,
+						 sizeof(*ipv4));
+		/* packet_append_header() can return NULL (the
+		 * *_header_append() functions check for it); fail loudly
+		 * instead of dereferencing a NULL pointer.
+		 */
+		assert(ip_header != NULL);
+		ip_header->total_bytes = ip_bytes;
+		set_ipv4_header(ipv4, ip_bytes, tos, ttl, protocol);
+	} else if (address_family == AF_INET6) {
+		struct ipv6 *ipv6 = (struct ipv6 *) packet->buffer;
+		packet->ipv6 = ipv6;
+		assert(packet->ipv4 == NULL);
+		ip_header = packet_append_header(packet, HEADER_IPV6,
+						 sizeof(*ipv6));
+		/* Same NULL guard as in the IPv4 branch. */
+		assert(ip_header != NULL);
+		ip_header->total_bytes = ip_bytes;
+		set_ipv6_header(ipv6, ip_bytes, tos, flowlabel, ttl, protocol);
+	} else {
+		assert(!"bad ip_version in config");
+	}
+}
+
+/* Append an IPv4 header entry to 'packet', initialise it, and parse the
+ * textual source and destination addresses into it. Returns STATUS_OK on
+ * success. On failure returns STATUS_ERR after writing a message to
+ * *error with asprintf().
+ */
+int ipv4_header_append(struct packet *packet,
+		       const char *ip_src,
+		       const char *ip_dst,
+		       const u8 tos,
+		       const u8 ttl,
+		       char **error)
+{
+	const int ipv4_bytes = sizeof(struct ipv4);
+	struct header *const header =
+		packet_append_header(packet, HEADER_IPV4, ipv4_bytes);
+	struct ipv4 *ipv4;
+
+	if (!header) {
+		asprintf(error, "too many headers");
+		return STATUS_ERR;
+	}
+
+	/* Length and protocol are passed as 0 here. */
+	ipv4 = header->h.ipv4;
+	set_ip_header(ipv4, AF_INET, 0, tos, 0, ttl, 0);
+
+	if (inet_pton(AF_INET, ip_src, &ipv4->src_ip) != 1) {
+		asprintf(error, "bad IPv4 src address: '%s'\n", ip_src);
+		return STATUS_ERR;
+	}
+	if (inet_pton(AF_INET, ip_dst, &ipv4->dst_ip) != 1) {
+		asprintf(error, "bad IPv4 dst address: '%s'\n", ip_dst);
+		return STATUS_ERR;
+	}
+
+	return STATUS_OK;
+}
+
+/* Append an IPv6 header entry to 'packet', initialise it, and parse the
+ * textual source and destination addresses into it. Returns STATUS_OK on
+ * success. On failure returns STATUS_ERR after writing a message to
+ * *error with asprintf().
+ */
+int ipv6_header_append(struct packet *packet,
+		       const char *ip_src,
+		       const char *ip_dst,
+		       const u8 tos,
+		       const u8 hop_limit,
+		       char **error)
+{
+	const int ipv6_bytes = sizeof(struct ipv6);
+	struct header *const header =
+		packet_append_header(packet, HEADER_IPV6, ipv6_bytes);
+	struct ipv6 *ipv6;
+
+	if (!header) {
+		asprintf(error, "too many headers");
+		return STATUS_ERR;
+	}
+
+	/* The length is just the header for now and the protocol is 0. */
+	ipv6 = header->h.ipv6;
+	set_ip_header(ipv6, AF_INET6, sizeof(struct ipv6), tos, 0, hop_limit, 0);
+
+	if (inet_pton(AF_INET6, ip_src, &ipv6->src_ip) != 1) {
+		asprintf(error, "bad IPv6 src address: '%s'\n", ip_src);
+		return STATUS_ERR;
+	}
+	if (inet_pton(AF_INET6, ip_dst, &ipv6->dst_ip) != 1) {
+		asprintf(error, "bad IPv6 dst address: '%s'\n", ip_dst);
+		return STATUS_ERR;
+	}
+
+	return STATUS_OK;
+}
+
+/* Finish an IPv4 header once the header that follows it is known: set the
+ * total length and protocol from 'next_inner', compute the header
+ * checksum, and record the total size in header->total_bytes.
+ * Always returns STATUS_OK.
+ */
+int ipv4_header_finish(struct packet *packet,
+		       struct header *header, struct header *next_inner)
+{
+	struct ipv4 *ipv4 = header->h.ipv4;
+	int ip_bytes = sizeof(struct ipv4) + next_inner->total_bytes;
+
+	/* tot_len is a 16-bit field. Refuse to truncate it silently, as
+	 * ipv6_header_finish() does for payload_len.
+	 */
+	assert(ip_bytes >= 0 && ip_bytes <= 0xffff);
+	ipv4->tot_len = htons(ip_bytes);
+	ipv4->protocol = header_type_info(next_inner->type)->ip_proto;
+
+	/* Fill in IPv4 header checksum. */
+	ipv4->check = 0;
+	ipv4->check = ipv4_checksum(ipv4, ipv4->ihl * sizeof(u32));
+
+	header->total_bytes = ip_bytes;
+
+	return STATUS_OK;
+}
+
+/* Finish an IPv6 header once the header that follows it is known: set the
+ * payload length and next-header value from 'next_inner' and record the
+ * total size in header->total_bytes. Always returns STATUS_OK.
+ */
+int ipv6_header_finish(struct packet *packet,
+		       struct header *header, struct header *next_inner)
+{
+	struct ipv6 *const hdr = header->h.ipv6;
+	const int ip_bytes = sizeof(struct ipv6) + next_inner->total_bytes;
+
+	/* payload_len is a 16-bit field. */
+	assert(next_inner->total_bytes <= 0xffff);
+	hdr->payload_len = htons(next_inner->total_bytes);
+
+	hdr->next_header = header_type_info(next_inner->type)->ip_proto;
+
+	/* There is no header checksum to compute for IPv6. */
+	header->total_bytes = ip_bytes;
+	return STATUS_OK;
+}
diff --git a/test/packetdrill/ip_packet.h b/test/packetdrill/ip_packet.h
new file mode 100644
index 0000000..05f2a07
--- /dev/null
+++ b/test/packetdrill/ip_packet.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Interface for module for formatting IP packets.
+ */
+
+#ifndef __IP_PACKET_H__
+#define __IP_PACKET_H__
+
+#include "types.h"
+
+#include "packet.h"
+
+/* Populate header fields in the IP header at the given address. */
+extern void set_ip_header(void *ip_header,
+ int address_family,
+ u16 ip_bytes,
+ u8 tos, u32 flowlabel,
+ u8 ttl, u8 protocol);
+
+/* Set the packet's IP header pointer and then populate the IP header fields. */
+extern void set_packet_ip_header(struct packet *packet,
+ int address_family,
+ u16 ip_bytes,
+ u8 tos, u32 flowlabel,
+ u8 ttl, u8 protocol);
+
+/* Append an IPv4 header to the end of the given packet and fill in
+ * src/dst. On success, return STATUS_OK; on error return STATUS_ERR
+ * and fill in a malloc-allocated error message in *error.
+ */
+extern int ipv4_header_append(struct packet *packet,
+ const char *ip_src,
+ const char *ip_dst,
+ const u8 tos,
+ const u8 ttl,
+ char **error);
+
+/* Append an IPv6 header to the end of the given packet and fill in
+ * src/dst. On success, return STATUS_OK; on error return STATUS_ERR
+ * and fill in a malloc-allocated error message in *error.
+ */
+extern int ipv6_header_append(struct packet *packet,
+ const char *ip_src,
+ const char *ip_dst,
+ const u8 tos,
+ const u8 hop_limit,
+ char **error);
+
+/* Finalize the IPv4 header by filling in all necessary fields that
+ * were not filled in at parse time.
+ */
+extern int ipv4_header_finish(struct packet *packet,
+ struct header *header, struct header *next_inner);
+
+/* Finalize the IPv6 header by filling in all necessary fields that
+ * were not filled in at parse time.
+ */
+extern int ipv6_header_finish(struct packet *packet,
+ struct header *header, struct header *next_inner);
+
+#endif /* __IP_PACKET_H__ */
diff --git a/test/packetdrill/ip_prefix.c b/test/packetdrill/ip_prefix.c
new file mode 100644
index 0000000..044b94d
--- /dev/null
+++ b/test/packetdrill/ip_prefix.c
@@ -0,0 +1,148 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Implementation for operations for IPv4 and IPv6 prefixes.
+ */
+
+#include "ip_prefix.h"
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "logging.h"
+
+/* Build a prefix from a copy of *ip and the given length in bits. Dies if
+ * prefix_len is negative or exceeds the bit width of the address family.
+ * The address is copied unchanged; bits beyond the prefix are not cleared
+ * here (ip_prefix_normalize() does that).
+ */
+struct ip_prefix ip_to_prefix(const struct ip_address *ip, int prefix_len)
+{
+	const int max_prefix_bits = 8 * ip_address_length(ip->address_family);
+	struct ip_prefix prefix;
+
+	if (!(prefix_len >= 0 && prefix_len <= max_prefix_bits))
+		die("invalid prefix_len: %d bits", prefix_len);
+
+	prefix.prefix_len = prefix_len;
+	prefix.ip = *ip;
+	return prefix;
+}
+
+/* Clear every address bit that lies beyond prefix->prefix_len. */
+void ip_prefix_normalize(struct ip_prefix *prefix)
+{
+	/* Whole bytes covered by the prefix, and the bits left over. */
+	int full_bytes = prefix->prefix_len / 8;
+	const int partial_bits = prefix->prefix_len % 8;
+	const int addr_bytes = ip_address_length(prefix->ip.address_family);
+	u8 *raw = prefix->ip.ip.bytes;
+
+	if (partial_bits != 0) {
+		/* The prefix ends inside this byte: keep its top
+		 * 'partial_bits' bits and clear the rest.
+		 */
+		const unsigned int host_mask = (1U << (8 - partial_bits)) - 1;
+
+		raw[full_bytes] &= ~host_mask;
+		++full_bytes;
+	}
+
+	/* Every byte after the prefix becomes zero. */
+	memset(raw + full_bytes, 0, addr_bytes - full_bytes);
+}
+
+/* Parse and return a prefix length (in bits) like /16 or /64 from the
+ * end of a string, and die if the prefix is bigger than the given max
+ * length. Use the maximum length if there is no prefix in the string.
+ * Also dies if the '/' is not followed by a complete decimal number.
+ */
+static int prefix_len_parse(const char *prefix_string, int max_len)
+{
+	const char *len_str = strstr(prefix_string, "/");
+	char *end = NULL;
+	long parsed = 0;
+
+	/* Default prefix length is all address bits */
+	if (len_str == NULL)
+		return max_len;
+
+	++len_str;	/* advance beyond '/' */
+	errno = 0;
+	parsed = strtol(len_str, &end, 10);
+
+	/* Range-check the full long value before narrowing it to int, so
+	 * that a huge number cannot wrap into the valid range. Reject a
+	 * '/' with no digits after it (end == len_str).
+	 */
+	if (errno != 0 || end == len_str || *end != '\0' ||
+	    (parsed < 0) || (parsed > max_len))
+		die("bad prefix length in prefix '%s'\n",
+		    prefix_string);
+
+	return (int)parsed;
+}
+
+/* Return a newly allocated copy of the address part of an
+ * "<address>/<prefix>" string: everything before the first '/', or the
+ * whole string when there is no '/'. The callers free() the result.
+ */
+static char *copy_prefix_address(const char *prefix_string)
+{
+	/* Number of characters before the first '/' or the terminator. */
+	const int addr_len = strcspn(prefix_string, "/");
+
+	return strndup(prefix_string, addr_len);
+}
+
+/* Parse an "<address>[/<len>]" IPv4 prefix string. Without a "/<len>"
+ * part the prefix length is the full width of an IPv4 address.
+ */
+struct ip_prefix ipv4_prefix_parse(const char *prefix_string)
+{
+	char *addr_text = copy_prefix_address(prefix_string);
+	struct ip_address addr = ipv4_parse(addr_text);
+	const int max_bits = 8 * ip_address_length(AF_INET);
+	const int prefix_len = prefix_len_parse(prefix_string, max_bits);
+
+	/* The temporary copy of the address text is no longer needed. */
+	free(addr_text);
+
+	return ip_to_prefix(&addr, prefix_len);
+}
+
+/* Parse an "<address>[/<len>]" IPv6 prefix string. Without a "/<len>"
+ * part the prefix length is the full width of an IPv6 address.
+ */
+struct ip_prefix ipv6_prefix_parse(const char *prefix_string)
+{
+	char *addr_text = copy_prefix_address(prefix_string);
+	struct ip_address addr = ipv6_parse(addr_text);
+	const int max_bits = 8 * ip_address_length(AF_INET6);
+	const int prefix_len = prefix_len_parse(prefix_string, max_bits);
+
+	/* The temporary copy of the address text is no longer needed. */
+	free(addr_text);
+
+	return ip_to_prefix(&addr, prefix_len);
+}
+
+/* Format the prefix as "<address>/<len>" into 'buffer', writing at most
+ * ADDR_STR_LEN bytes, and return 'buffer'. Dies if the text would not fit.
+ */
+const char *ip_prefix_to_string(struct ip_prefix *prefix, char *buffer)
+{
+	char ip_str[ADDR_STR_LEN] = { 0 };
+	int written;
+
+	ip_to_string(&prefix->ip, ip_str);
+
+	/* Leave room for the longest suffix ("/128") and the terminator. */
+	if (strlen(ip_str) + strlen("/128") + 1 > ADDR_STR_LEN)
+		die("address prefix would overflow buffer!");
+
+	written = snprintf(buffer, ADDR_STR_LEN, "%s/%d",
+			   ip_str, prefix->prefix_len);
+	if (written >= ADDR_STR_LEN)
+		die("address prefix overflowed buffer!");
+
+	return buffer;
+}
diff --git a/test/packetdrill/ip_prefix.h b/test/packetdrill/ip_prefix.h
new file mode 100644
index 0000000..0b82260
--- /dev/null
+++ b/test/packetdrill/ip_prefix.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Types and operations for IPv4 and IPv6 address prefixes.
+ */
+
+#ifndef __IP_PREFIX_H__
+#define __IP_PREFIX_H__
+
+#include "types.h"
+
+#include "ip_address.h"
+
+/* IPv4 or IPv6 address prefix: an address together with the number of
+ * leading bits of it that form the prefix.
+ */
+struct ip_prefix {
+ struct ip_address ip;
+ int prefix_len; /* prefix length in bits */
+};
+
+/* Clear every byte of *prefix: the address and the prefix length. */
+static inline void ip_prefix_reset(struct ip_prefix *prefix)
+{
+	const size_t nbytes = sizeof(struct ip_prefix);
+
+	memset(prefix, 0, nbytes);
+}
+
+/* Parse a human-readable IPv4 prefix and return it. Print an error
+ * to stderr and exit if there is an error parsing the prefix.
+ */
+extern struct ip_prefix ipv4_prefix_parse(const char *prefix_string);
+
+/* Parse a human-readable IPv6 prefix and return it. Print an error
+ * to stderr and exit if there is an error parsing the prefix.
+ */
+extern struct ip_prefix ipv6_prefix_parse(const char *prefix_string);
+
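+/* Return a prefix holding a copy of the given IP address together with
+ * 'prefix_len'. Die if 'prefix_len' is out of range for the address
+ * family. Bits beyond the prefix length are kept as given; call
+ * ip_prefix_normalize() to zero them.
+ */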
+extern struct ip_prefix ip_to_prefix(const struct ip_address *ip,
+ int prefix_len);
+
+/* Zero the bits beyond the prefix length. */
+void ip_prefix_normalize(struct ip_prefix *prefix);
+
+/* Print a human-readable representation of the given IP prefix in the
+ * given buffer, which must be at least ADDR_STR_LEN bytes long.
+ * Returns a pointer to the given buffer.
+ */
+extern const char *ip_prefix_to_string(struct ip_prefix *prefix,
+ char *buffer);
+
+#endif /* __IP_PREFIX_H__ */
diff --git a/test/packetdrill/ipv6.h b/test/packetdrill/ipv6.h
new file mode 100644
index 0000000..07a0964
--- /dev/null
+++ b/test/packetdrill/ipv6.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Our own IPv6 header declarations, so we have something that's
+ * portable and somewhat more readable than a typical system header
+ * file.
+ */
+
+#ifndef __IPV6_HEADERS_H__
+#define __IPV6_HEADERS_H__
+
+#include "types.h"
+
+#include <netinet/in.h>
+
+/* IPv6 header. The 4-bit version, the 8-bit traffic class and the 20-bit
+ * flow label are split across bitfields whose declaration order depends
+ * on the host byte order. Use ipv6_tos_byte() and ipv6_flow_label() to
+ * reassemble the traffic class and the flow label.
+ */
+struct ipv6 {
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+ __u8 traffic_class_hi:4,
+ version:4;
+ __u8 flow_label_hi:4,
+ traffic_class_lo:4;
+ __u16 flow_label_lo;
+#elif __BYTE_ORDER == __BIG_ENDIAN
+ __u8 version:4,
+ traffic_class_hi:4;
+ __u8 traffic_class_lo:4,
+ flow_label_hi:4;
+ __u16 flow_label_lo;
+#else
+# error "Please fix endianness defines"
+#endif
+
+ __be16 payload_len;
+ __u8 next_header;
+ __u8 hop_limit;
+
+ struct in6_addr src_ip;
+ struct in6_addr dst_ip;
+};
+
+#ifdef linux
+#define IPV6_HOPLIMIT 52
+#define IPV6_TCLASS 67
+#endif /* linux */
+
+/* Reassemble the traffic class byte from its two 4-bit halves. */
+static inline u8 ipv6_tos_byte(const struct ipv6 *ipv6)
+{
+	const u8 hi = ipv6->traffic_class_hi;
+	const u8 lo = ipv6->traffic_class_lo;
+
+	return (hi << 4) | lo;
+}
+
+/* Reassemble the flow label in host byte order from its top 4 bits and
+ * its low 16 bits, which are stored in network byte order.
+ */
+static inline u32 ipv6_flow_label(const struct ipv6 *ipv6)
+{
+	const u32 upper = ipv6->flow_label_hi;
+	const u32 lower = ntohs(ipv6->flow_label_lo);
+
+	return lower | (upper << 16);
+}
+
+/* Return the hop limit field of the header. */
+static inline u8 ipv6_hoplimit_byte(const struct ipv6 *ipv6)
+{
+	const u8 hop_limit = ipv6->hop_limit;
+
+	return hop_limit;
+}
+
+/* The following struct declaration is needed for the IPv6 ioctls
+ * SIOCSIFADDR and SIOCDIFADDR that add and delete IPv6 addresses from
+ * a network interface. We have to declare our own version here
+ * because this struct is only available in /usr/include/linux/ipv6.h,
+ * but that .h file has kernel IPv6 declarations that conflict with
+ * standard user-space IPv6 declarations.
+ */
+struct in6_ifreq {
+ struct in6_addr ifr6_addr;
+ __u32 ifr6_prefixlen;
+ int ifr6_ifindex;
+};
+
+#endif /* __IPV6_HEADERS_H__ */
diff --git a/test/packetdrill/lexer.l b/test/packetdrill/lexer.l
new file mode 100644
index 0000000..7d063d3
--- /dev/null
+++ b/test/packetdrill/lexer.l
@@ -0,0 +1,280 @@
+%{
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * This is the specification for the lexical scanner for the packetdrill
+ * script language. It is processed by the flex lexical scanner
+ * generator.
+ *
+ * For full documentation see: http://flex.sourceforge.net/manual/
+ *
+ * Here is a quick and dirty tutorial on flex:
+ *
+ * A flex lexical scanner specification is basically a list of rules,
+ * where each rule is a regular expression for a lexical token to
+ * match, followed by a C fragment to execute when the scanner sees
+ * that pattern.
+ *
+ * The lexer feeds a stream of terminal symbols up to this parser,
+ * passing up a FOO token for each "return FOO" in the lexer spec. The
+ * lexer specifies what value to pass up to the parser by setting a
+ * yylval.fooval field, where fooval is a field in the %union in the
+ * .y file.
+ *
+ * TODO: detect overflow in numeric literals.
+ */
+
+#include "types.h"
+
+#include <netinet/in.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "script.h"
+#include "tcp_options.h"
+#include "parse.h"
+#include "config.h"
+
+/* This include of the bison-generated .h file must go last so that we
+ * can first include all of the declarations on which it depends.
+ */
+#include "parser.h"
+
+/* Suppress flex's generation of an uncalled static input() function, which
+ * leads to a compiler warning:
+ * warning: ‘input’ defined but not used
+ */
+#define YY_NO_INPUT
+
+/* Return a newly allocated copy of the name "foo" that follows the
+ * leading "--" of a "--foo" option.
+ */
+static char *option(const char *s)
+{
+	const char *name = s + 2;	/* skip the "--" */
+
+	return strdup(name);
+}
+
+/* Return a newly allocated copy of a quoted string with its first and
+ * last characters (the delimiters) removed.
+ */
+static char *quoted(const char *s)
+{
+	const char *inner = s + 1;		/* skip opening delimiter */
+	const size_t inner_len = strlen(s) - 2;	/* drop both delimiters */
+
+	return strndup(inner, inner_len);
+}
+
+/* Turn the word in yytext into a token. If the word is a user-defined
+ * symbol, the token carries the symbol's value and its type follows the
+ * value's first character: '"' gives STRING, '`' gives BACK_QUOTED, and
+ * anything else gives WORD. A word with no definition is returned as a
+ * WORD holding the word itself. yylval.string always gets a new copy.
+ */
+int word(void)
+{
+	char *text = yytext;
+
+	/* Look in symbol table for matching user-defined symbol->value map. */
+	char *value = definition_get(in_config->defines, text);
+
+	if (value == NULL) {
+		/* A literal word (e.g. system call name or socket option name). */
+		yylval.string = strdup(text);
+		return WORD;
+	}
+
+	switch (value[0]) {
+	case '"':
+		yylval.string = quoted(value);	/* SYM="val" */
+		return STRING;
+	case '`':
+		yylval.string = quoted(value);	/* SYM=`val` */
+		return BACK_QUOTED;
+	default:
+		yylval.string = strdup(value);	/* SYM=val */
+		return WORD;
+	}
+}
+
+/* Copy the code inside a code snippet that is enclosed in %{ }% after
+ * first stripping the space and tab characters from either end of the
+ * snippet. We strip leading and trailing whitespace for Python users
+ * to remain sane, since Python is sensitive to whitespace. To summarize,
+ * given an input %{<space><code><space>}% we return: <code>
+ * A snippet that holds only spaces and tabs yields an empty string.
+ * The result is newly allocated.
+ */
+static char *code(const char *s)
+{
+	const int delim_len = sizeof("%{")-1;
+
+	const char *start = s + delim_len;
+	while ((*start == ' ') || (*start == '\t'))
+		++start;
+
+	const char *end = s + (strlen(s) - 1) - delim_len;
+	while ((*end == ' ') || (*end == '\t'))
+		--end;
+
+	/* For a whitespace-only snippet the forward scan stops on the
+	 * closing delimiter and the backward scan on the opening one, so
+	 * end < start. Clamp the length to zero: a negative length would
+	 * become a huge size_t and make strndup() copy the closing "}%".
+	 */
+	const int code_len = (end < start) ? 0 : (int)(end - start + 1);
+	return strndup(start, code_len);
+}
+
+/* Convert a hex string prefixed by "0x" to an integer value. */
+static s64 hextol(const char *s)
+{
+	/* Parse the argument rather than the global yytext. Use strtoll()
+	 * so that a 32-bit long cannot clamp a 64-bit literal. The "+ 2"
+	 * skips the "0x" prefix.
+	 */
+	return strtoll(s + 2, NULL, 16);
+}
+
+%}
+
+%{
+#define YY_USER_ACTION yylloc.first_line = yylloc.last_line = yylineno;
+%}
+%option yylineno
+%option nounput
+
+/* A regexp for C++ comments: */
+cpp_comment \/\/[^\n]*\n
+
+/* Here is a summary of the regexp for C comments:
+ * open-comment
+ * any number of:
+ * (non-stars) or (star then non-slash)
+ * close comment
+ */
+c_comment \/\*(([^*])|(\*[^\/]))*\*\/
+
+/* The regexp for code snippets is analogous to that for C comments.
+ * Here is a summary of the regexp for code snippets:
+ * %{
+ * any number of:
+ * (non-}) or (} then non-%)
+ * }%
+ */
+code \%\{(([^}])|(\}[^\%]))*\}\%
+
+/* IPv4: a regular expression for an IPv4 address */
+ipv4_addr [0-9]+[.][0-9]+[.][0-9]+[.][0-9]+
+
+/* IPv6: a regular expression for an IPv6 address. The complexity is
+ * unfortunate, but we can't use a super-simple approach because TCP
+ * sequence number ranges like 1:1001 can look like IPv6 addresses if
+ * we use a naive approach.
+ */
+seg [0-9a-fA-F]{1,4}
+v0 [:][:]
+v1 ({seg}[:]){7,7}{seg}
+v2 ({seg}[:]){1,7}[:]
+v3 ({seg}[:]){1,6}[:]{seg}
+v4 ({seg}[:]){1,5}([:]{seg}){1,2}
+v5 ({seg}[:]){1,4}([:]{seg}){1,3}
+v6 ({seg}[:]){1,3}([:]{seg}){1,4}
+v7 ({seg}[:]){1,2}([:]{seg}){1,5}
+v8 {seg}[:](([:]{seg}){1,6})
+v9 [:]([:]{seg}){1,7}
+/* IPv4-mapped IPv6 address: */
+v10 [:][:]ffff[:]{ipv4_addr}
+/* IPv4-translated IPv6 address: */
+v11 [:][:]ffff[:](0){1,4}[:]{ipv4_addr}
+/* IPv4-embedded IPv6 addresses: */
+v12 ({seg}[:]){1,4}[:]{ipv4_addr}
+ipv6_addr ({v0}|{v1}|{v2}|{v3}|{v4}|{v5}|{v6}|{v7}|{v8}|{v9}|{v10}|{v11}|{v12})
+
+%%
+sa_family return SA_FAMILY;
+sin_port return SIN_PORT;
+sin_addr return SIN_ADDR;
+msg_name return MSG_NAME;
+msg_iov return MSG_IOV;
+msg_flags return MSG_FLAGS;
+msg_control return MSG_CONTROL;
+cmsg_data return CMSG_DATA;
+cmsg_level return CMSG_LEVEL;
+cmsg_type return CMSG_TYPE;
+ee_errno return EE_ERRNO;
+ee_origin return EE_ORIGIN;
+ee_type return EE_TYPE;
+ee_code return EE_CODE;
+ee_info return EE_INFO;
+ee_data return EE_DATA;
+scm_sec return SCM_SEC;
+scm_nsec return SCM_NSEC;
+fd return FD;
+u32 return U32;
+u64 return U64;
+ptr return PTR;
+events return EVENTS;
+revents return REVENTS;
+onoff return ONOFF;
+linger return LINGER;
+htons return _HTONS_;
+ipv4 return IPV4;
+ipv6 return IPV6;
+icmp return ICMP;
+udp return UDP;
+GREv0 return GRE;
+gre return GRE;
+raw return RAW;
+sum return SUM;
+off return OFF;
+key return KEY;
+seq return SEQ;
+none return NONE;
+checksum return CHECKSUM;
+sequence# return SEQUENCE;
+present return PRESENT;
+mpls return MPLS;
+label return LABEL;
+tc return TC;
+ttl return TTL;
+inet_addr return INET_ADDR;
+inet6_addr return INET6_ADDR;
+ack return ACK;
+eol return EOL;
+ecr return ECR;
+mss return MSS;
+mtu return MTU;
+nop return NOP;
+sack return SACK;
+sackOK return SACKOK;
+md5 return MD5;
+TS return TIMESTAMP;
+FO return FAST_OPEN;
+FOEXP return FAST_OPEN_EXP;
+tos return TOS;
+flowlabel return FLOWLABEL;
+flags return FLAGS;
+Flags return FLAGS;
+val return VAL;
+win return WIN;
+urg return URG;
+wscale return WSCALE;
+ect01 return ECT01;
+ect0 return ECT0;
+ect1 return ECT1;
+noecn return NO_ECN;
+ce return CE;
+id return ID;
+[.][.][.] return ELLIPSIS;
+--[a-zA-Z0-9_]+ yylval.string = option(yytext); return OPTION;
+[-]?[0-9]*[.][0-9]+ yylval.floating = atof(yytext); return FLOAT;
+[-]?[0-9]+ yylval.integer = atoll(yytext); return INTEGER;
+0x[0-9a-fA-F]+ yylval.integer = hextol(yytext); return HEX_INTEGER;
+[a-zA-Z0-9_]+ return word();
+\"(\\.|[^"])*\" yylval.string = quoted(yytext); return STRING;
+\`(\\.|[^`])*\` yylval.string = quoted(yytext); return BACK_QUOTED;
+[^ \t\n] return (int) yytext[0];
+[ \t\n]+ /* ignore whitespace */;
+{cpp_comment} /* ignore C++-style comment */;
+{c_comment} /* ignore C-style comment */;
+{code} yylval.string = code(yytext); return CODE;
+{ipv4_addr} yylval.string = strdup(yytext); return IPV4_ADDR;
+{ipv6_addr} yylval.string = strdup(yytext); return IPV6_ADDR;
+%%
diff --git a/test/packetdrill/link_layer.c b/test/packetdrill/link_layer.c
new file mode 100644
index 0000000..d6e85a4
--- /dev/null
+++ b/test/packetdrill/link_layer.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Link-layer utilities.
+ */
+
+#include "link_layer.h"
+
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "logging.h"
+
+#ifdef linux
+
+#include <net/if.h>
+#include <sys/ioctl.h>
+
+#include "wrap.h"
+
+void get_hw_address(const char *name, struct ether_addr *hw_address,
+		    enum ip_version_t ip_version)
+{
+	struct ifreq req;
+	const u8 *mac;
+	int sock;
+	/* Linux: read the MAC via SIOCGIFHWADDR; ioctl/close errors die. */
+	DEBUGP("get_hw_address for device %s\n", name);
+	sock = wrap_socket(ip_version, SOCK_DGRAM);
+
+	/* Look the interface up by name; the index itself is not used. */
+	snprintf(req.ifr_name, IFNAMSIZ, "%s", name);
+	if (ioctl(sock, SIOCGIFINDEX, &req) < 0)
+		die_perror("ioctl SIOCGIFINDEX");
+
+	/* Ask for the interface's hardware (MAC) address. */
+	if (ioctl(sock, SIOCGIFHWADDR, &req) != 0)
+		die_perror("ioctl SIOCGIFHWADDR");
+
+	/* The address bytes are read out of the returned sa_data. */
+	mac = (const u8 *)&req.ifr_addr.sa_data;
+	DEBUGP("%s HWaddr: %02x:%02x:%02x:%02x:%02x:%02x\n",
+	       name, mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
+	memcpy(hw_address, mac, sizeof(*hw_address));
+
+	if (close(sock) != 0)
+		die_perror("close");
+}
+
+#else
+
+#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
+#include <net/if_types.h>
+#include <net/if_dl.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <ifaddrs.h>
+#endif /* defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) */
+
+void get_hw_address(const char *name, struct ether_addr *hw_address,
+		    enum ip_version_t ip_version)
+{
+	struct ifaddrs *ifaddrs_list, *ifaddr;
+
+	(void)ip_version;	/* unused here; kept to match link_layer.h */
+	DEBUGP("get_hw_address for device %s\n", name);
+	if (getifaddrs(&ifaddrs_list) < 0)
+		die_perror("getifaddrs");
+	/* Find the AF_LINK Ethernet entry whose name matches, or die. */
+	for (ifaddr = ifaddrs_list; ifaddr != NULL; ifaddr = ifaddr->ifa_next) {
+		struct sockaddr *sa = ifaddr->ifa_addr;
+		struct sockaddr_dl *sdl = (struct sockaddr_dl *)sa;
+		/* ifa_addr may be NULL, so test it before dereferencing. */
+		if (sa == NULL || sa->sa_family != AF_LINK ||
+		    sdl->sdl_type != IFT_ETHER ||
+		    strcmp(name, ifaddr->ifa_name) != 0)
+			continue;
+		memcpy(hw_address, LLADDR(sdl), sizeof(*hw_address));
+		freeifaddrs(ifaddrs_list);
+		return;
+	}
+
+	die("unable to find hw address for %s\n", name);
+}
+
+#endif
diff --git a/test/packetdrill/link_layer.h b/test/packetdrill/link_layer.h
new file mode 100644
index 0000000..e5812bf
--- /dev/null
+++ b/test/packetdrill/link_layer.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Link-layer utilities.
+ */
+
+#ifndef __LINK_LAYER_H__
+#define __LINK_LAYER_H__
+
+#include "types.h"
+
+#include "ethernet.h"
+
+struct config;
+
+/* Get the link layer address for the device with the given name, or die. */
+void get_hw_address(const char *name, struct ether_addr *hw_address,
+ enum ip_version_t ip_version);
+
+#endif /* __LINK_LAYER_H__ */
diff --git a/test/packetdrill/logging.c b/test/packetdrill/logging.c
new file mode 100644
index 0000000..730add4
--- /dev/null
+++ b/test/packetdrill/logging.c
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Logging and output functions.
+ */
+
+#include "run.h"
+#include "system.h"
+
+#include <stdarg.h>
+#include <stdlib.h>
+
+extern void __attribute__((noreturn)) die(char *format, ...)
+{
+ va_list ap;
+ /* Fatal error path: print the printf-style message to stderr. */
+ va_start(ap, format);
+ vfprintf(stderr, format, ap);
+ va_end(ap);
+ /* Invoke run_cleanup_command() before the process goes away. */
+ run_cleanup_command();
+ /* Never returns: terminate with a failure status. */
+ exit(EXIT_FAILURE);
+}
+
+void __attribute__((noreturn)) die_perror(char *message)
+{
+ perror(message);
+ /* perror() has reported errno; invoke run_cleanup_command() next. */
+ run_cleanup_command();
+ /* Never returns: terminate with a failure status. */
+ exit(EXIT_FAILURE);
+}
diff --git a/test/packetdrill/logging.h b/test/packetdrill/logging.h
new file mode 100644
index 0000000..d961c9d
--- /dev/null
+++ b/test/packetdrill/logging.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Logging and output functions.
+ */
+
+#ifndef __LOGGING_H__
+#define __LOGGING_H__
+
+#include "types.h"
+
+/* Enable this to get debug logging. */
+#define DEBUG_LOGGING 0
+
+/* Use a gcc variadic macro to conditionally compile debug printing. */
+#define DEBUGP(...) \
+ if (DEBUG_LOGGING) { \
+ fprintf(stdout, __VA_ARGS__); \
+ fflush(stdout); \
+ }
+
+/* Log the message to stderr and then exit with a failure status code. */
+extern void __attribute__((noreturn)) die(char *format, ...);
+
+/* Call perror() with message and then exit with a failure status code. */
+extern void __attribute__((noreturn)) die_perror(char *message);
+
+#endif /* __LOGGING_H__ */
diff --git a/test/packetdrill/mpls.h b/test/packetdrill/mpls.h
new file mode 100644
index 0000000..b536437
--- /dev/null
+++ b/test/packetdrill/mpls.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Our own MPLS header declarations, so we have something that's
+ * portable and somewhat more readable than a typical system header
+ * file.
+ *
+ * We cannot include the kernel's MPLS .h files because this tool tries
+ * to compile and work for basically any Linux/BSD kernel version. So
+ * we declare our own version of various MPLS-related definitions here.
+ */
+
+#ifndef __MPLS_HEADERS_H__
+#define __MPLS_HEADERS_H__
+
+#include <stdlib.h>
+#include "types.h"
+
+/* On-the-wire MPLS "label stack entry", per RFC 3032 and RFC 5462. */
+struct mpls {
+ __be32 entry;
+};
+
+/* Bit-shifting macros to access MPLS fields (the label straddles byte
+ * boundaries so there's no simple/clean way to use bit fields).
+ */
+#define MPLS_LABEL_MASK 0xfffff000 /* label */
+#define MPLS_LABEL_SHIFT 12
+#define MPLS_TC_MASK 0x00000e00 /* traffic class */
+#define MPLS_TC_SHIFT 9
+#define MPLS_STACK_MASK 0x00000100 /* is stack bottom? */
+#define MPLS_STACK_SHIFT 8
+#define MPLS_TTL_MASK 0x000000ff /* time to live */
+#define MPLS_TTL_SHIFT 0
+
+/* Return the 20-bit label from an MPLS label stack entry, in host order. */
+static inline u32 mpls_entry_label(const struct mpls *mpls)
+{
+ return (ntohl(mpls->entry) & MPLS_LABEL_MASK) >> MPLS_LABEL_SHIFT;
+}
+
+/* Return the 3-bit traffic class from an MPLS label stack entry. */
+static inline u8 mpls_entry_tc(const struct mpls *mpls)
+{
+ return (ntohl(mpls->entry) & MPLS_TC_MASK) >> MPLS_TC_SHIFT;
+}
+
+/* Return true iff the "is stack bottom?" bit is set in the entry. */
+static inline bool mpls_entry_stack(const struct mpls *mpls)
+{
+ return (ntohl(mpls->entry) & MPLS_STACK_MASK) >> MPLS_STACK_SHIFT;
+}
+
+/* Return the 8-bit TTL from an MPLS label stack entry. */
+static inline u8 mpls_entry_ttl(const struct mpls *mpls)
+{
+ return (ntohl(mpls->entry) & MPLS_TTL_MASK) >> MPLS_TTL_SHIFT;
+}
+
+/* Fill in an MPLS label stack entry; traffic_class is masked to 3 bits. */
+static inline void mpls_entry_set(u32 label, u8 traffic_class,
+				  bool is_stack_bottom, u8 ttl,
+				  struct mpls *mpls)
+{
+	mpls->entry = htonl((label << MPLS_LABEL_SHIFT) |
+			    ((traffic_class << MPLS_TC_SHIFT) & MPLS_TC_MASK) |
+			    (is_stack_bottom << MPLS_STACK_SHIFT) |
+			    (ttl << MPLS_TTL_SHIFT));
+}
+
+/* Parse-time representation of an MPLS label stack entry. */
+#define MPLS_STACK_MAX_ENTRIES 6 /* maximum number of label entries */
+struct mpls_stack {
+ struct mpls entries[MPLS_STACK_MAX_ENTRIES];
+ int length; /* number of MPLS label stack entries */
+};
+
+/* Allocate an empty (zeroed) MPLS label stack; NULL if calloc() fails. */
+static inline struct mpls_stack *mpls_stack_new(void)
+{
+ return calloc(1, sizeof(struct mpls_stack));
+}
+
+/* Push one entry onto the end of the label stack. Returns STATUS_OK, or
+ * STATUS_ERR when every slot of stack->entries is already in use.
+ */
+static inline int mpls_stack_append(struct mpls_stack *stack, struct mpls mpls)
+{
+	if (stack->length >= ARRAY_SIZE(stack->entries))
+		return STATUS_ERR;
+	stack->entries[stack->length] = mpls;
+	stack->length++;
+	return STATUS_OK;
+}
+#endif /* __MPLS_HEADERS_H__ */
diff --git a/test/packetdrill/mpls_packet.c b/test/packetdrill/mpls_packet.c
new file mode 100644
index 0000000..2448681
--- /dev/null
+++ b/test/packetdrill/mpls_packet.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Implementation for module for formatting MPLS packets.
+ */
+
+#include "mpls_packet.h"
+
+#include "mpls.h"
+
+int new_mpls_stack_entry(s64 label, s64 traffic_class,
+			 bool is_stack_bottom, s64 ttl,
+			 struct mpls *mpls, char **error)
+{
+	/* Each field must fit its on-the-wire width: 20, 3 and 8 bits. */
+	const char *problem = NULL;
+
+	if (label < 0 || label > 0xfffff)
+		problem = "MPLS label out of range for 20 bits";
+	else if (traffic_class < 0 || traffic_class > 0x7)
+		problem = "MPLS traffic_class out of range for 3 bits";
+	else if (ttl < 0 || ttl > 0xff)
+		problem = "MPLS ttl out of range for 8 bits";
+
+	if (problem != NULL) {
+		asprintf(error, "%s", problem);
+		return STATUS_ERR;
+	}
+
+	mpls_entry_set(label, traffic_class, is_stack_bottom, ttl, mpls);
+	return STATUS_OK;
+}
+
+int mpls_header_append(struct packet *packet, struct mpls_stack *mpls_stack,
+		       char **error)
+{
+	/* Size of the whole stack: one struct mpls per entry. */
+	const int stack_bytes = mpls_stack->length * sizeof(struct mpls);
+	struct header *hdr =
+		packet_append_header(packet, HEADER_MPLS, stack_bytes);
+
+	if (!hdr) {
+		asprintf(error, "too many headers");
+		return STATUS_ERR;
+	}
+	/* Copy the stack entries into the newly appended header. */
+	memcpy(hdr->h.mpls, mpls_stack->entries, stack_bytes);
+	return STATUS_OK;
+}
+
+int mpls_header_finish(struct packet *packet,
+		       struct header *header, struct header *next_inner)
+{
+	/* Our total is our own bytes plus the next inner header's total. */
+	const int span = header->header_bytes + next_inner->total_bytes;
+
+	header->total_bytes = span;
+	return STATUS_OK;
+}
diff --git a/test/packetdrill/mpls_packet.h b/test/packetdrill/mpls_packet.h
new file mode 100644
index 0000000..16079c0
--- /dev/null
+++ b/test/packetdrill/mpls_packet.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Interface for module for formatting MPLS packets.
+ */
+
+#ifndef __MPLS_PACKET_H__
+#define __MPLS_PACKET_H__
+
+#include "types.h"
+
+#include "mpls.h"
+#include "packet.h"
+
+/* Fill in the given MPLS label stack entry with the given field
+ * values, validating that actual parameter value fits inside the
+ * width of the field on the wire. On success, return STATUS_OK; on
+ * error return STATUS_ERR and fill in a malloc-allocated error
+ * message in *error.
+ */
+extern int new_mpls_stack_entry(s64 label, s64 traffic_class,
+ bool is_stack_bottom, s64 ttl,
+ struct mpls *mpls, char **error);
+
+/* Append an MPLS header to the end of the given packet. On success,
+ * return STATUS_OK; on error return STATUS_ERR and fill in a
+ * malloc-allocated error message in *error.
+ */
+extern int mpls_header_append(struct packet *packet,
+ struct mpls_stack *mpls_stack,
+ char **error);
+
+/* Finalize the MPLS header by filling in all necessary fields that
+ * were not filled in at parse time.
+ */
+extern int mpls_header_finish(struct packet *packet,
+ struct header *header, struct header *next_inner);
+
+#endif /* __MPLS_PACKET_H__ */
diff --git a/test/packetdrill/net_utils.c b/test/packetdrill/net_utils.c
new file mode 100644
index 0000000..1b59d64
--- /dev/null
+++ b/test/packetdrill/net_utils.c
@@ -0,0 +1,172 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Implementation for various network utilities.
+ */
+
+#include "net_utils.h"
+
+#include <stdlib.h>
+#include <net/if.h>
+#include <unistd.h>
+
+#include "logging.h"
+
+static void verbose_system(const char *command)
+{
+	/* Best effort: a failing command is logged (debug only), not fatal. */
+	DEBUGP("running: '%s'\n", command);
+	const int status = system(command);
+
+	DEBUGP("result: %d\n", status);
+	if (status != 0)
+		DEBUGP("error executing command '%s'\n", command);
+}
+
+/* Add a local IPv4 address/prefix to the device via the platform's CLI. */
+static void net_add_ipv4_address(const char *dev_name,
+ const struct ip_address *ip,
+ int prefix_len)
+{
+ char *command = NULL;
+ char ip_string[ADDR_STR_LEN];
+
+ ip_to_string(ip, ip_string);
+ /* NOTE(review): asprintf() results are unchecked; confirm that is OK. */
+#ifdef linux
+ asprintf(&command, "ip addr add %s/%d dev %s > /dev/null 2>&1",
+ ip_string, prefix_len, dev_name);
+#endif
+#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
+ asprintf(&command, "/sbin/ifconfig %s %s/%d alias",
+ dev_name, ip_string, prefix_len);
+#endif /* defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) */
+ /* Best effort: verbose_system() does not abort if the command fails. */
+ verbose_system(command);
+ free(command);
+}
+
+/* Add a local IPv6 address/prefix to the device, then wait for DAD. */
+static void net_add_ipv6_address(const char *dev_name,
+ const struct ip_address *ip,
+ int prefix_len)
+{
+ char *command = NULL;
+ char ip_string[ADDR_STR_LEN];
+
+ ip_to_string(ip, ip_string);
+ /* NOTE(review): asprintf() results are unchecked; confirm that is OK. */
+#ifdef linux
+
+ asprintf(&command, "ip addr add %s/%d dev %s > /dev/null 2>&1",
+ ip_string, prefix_len, dev_name);
+#endif
+#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
+
+ asprintf(&command, "/sbin/ifconfig %s inet6 %s/%d",
+ dev_name, ip_string, prefix_len);
+#endif /* defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) */
+ /* Best effort: verbose_system() does not abort if the command fails. */
+ verbose_system(command);
+ free(command);
+
+ /* Wait for IPv6 duplicate address detection to converge, so that
+ * this address no longer shows as "tentative", e.g. in "ip addr show":
+ * inet6 fd3d:fa7b:d17d::36/48 scope global tentative
+ * On Linux the wait is skipped if the device name contains "tun".
+ */
+#ifdef linux
+ if (!strstr(dev_name, "tun"))
+ sleep(2);
+#endif
+#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
+ sleep(3);
+#endif
+}
+
+void net_add_dev_address(const char *dev_name,
+			 const struct ip_address *ip,
+			 int prefix_len)
+{
+	/* Hand off to the helper for the address's family. The IPv6
+	 * helper additionally waits for duplicate address detection.
+	 * Any other family indicates a bug in the caller, so it trips
+	 * the assertion.
+	 */
+	if (ip->address_family == AF_INET)
+		net_add_ipv4_address(dev_name, ip, prefix_len);
+	else if (ip->address_family == AF_INET6)
+		net_add_ipv6_address(dev_name, ip, prefix_len);
+	else
+		assert(!"bad family");
+}
+
+void net_del_dev_address(const char *dev_name,
+ const struct ip_address *ip,
+ int prefix_len)
+{
+ char *command = NULL;
+ char ip_string[ADDR_STR_LEN];
+ /* Remove the address by running the platform's CLI tool. */
+ ip_to_string(ip, ip_string);
+ /* NOTE(review): asprintf() results are unchecked; confirm that is OK. */
+#ifdef linux
+ asprintf(&command, "ip addr del %s/%d dev %s > /dev/null 2>&1",
+ ip_string, prefix_len, dev_name);
+#endif
+#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
+ asprintf(&command, "/sbin/ifconfig %s %s %s/%d -alias",
+ dev_name,
+ ip->address_family == AF_INET6 ? "inet6" : "",
+ ip_string, prefix_len);
+#endif /* defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) */
+ /* Best effort: verbose_system() does not abort if the command fails. */
+ verbose_system(command);
+ free(command);
+}
+
+/* Adding an address to an interface is something we want to avoid
+ * where possible, because of the latency penalty (e.g. IPv6 duplicate
+ * address detection takes about one second). So if the IP is already
+ * configured on the requested device we return immediately.
+ * Otherwise we remove the address from whichever device currently
+ * has it (if any) and add it to the requested device.
+ */
+void net_setup_dev_address(const char *dev_name,
+			   const struct ip_address *ip,
+			   int prefix_len)
+{
+	char holder[IFNAMSIZ];
+	const bool found = get_ip_device(ip, holder);
+	bool on_requested_dev;
+
+	DEBUGP("net_setup_dev_address: found: %d\n", found);
+
+	on_requested_dev = found && strcmp(holder, dev_name) == 0;
+	if (on_requested_dev) {
+		DEBUGP("net_setup_dev_address: found on correct device\n");
+		return;
+	}
+
+	if (found)
+		net_del_dev_address(holder, ip, prefix_len);
+	net_add_dev_address(dev_name, ip, prefix_len);
+}
diff --git a/test/packetdrill/net_utils.h b/test/packetdrill/net_utils.h
new file mode 100644
index 0000000..bdc1009
--- /dev/null
+++ b/test/packetdrill/net_utils.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Interface for various network utilities related to configuring IP
+ * addresses for network devices.
+ */
+
+#ifndef __NET_UTILS_H__
+#define __NET_UTILS_H__
+
+#include "types.h"
+
+#include "ip_address.h"
+
+/* Add the given IP address, with the given subnet/prefix length,
+ * to the given device.
+ */
+extern void net_add_dev_address(const char *dev_name,
+ const struct ip_address *ip,
+ int prefix_len);
+
+/* Delete the given IP address, with the given subnet/prefix length,
+ * from the given device.
+ */
+extern void net_del_dev_address(const char *dev_name,
+ const struct ip_address *ip,
+ int prefix_len);
+
+/* See if the given IP address, with the given subnet/prefix length,
+ * is already on the given device. If so, return without doing
+ * anything. If not, delete it from any device it's currently on, and
+ * add it to the given network device.
+ */
+extern void net_setup_dev_address(const char *dev_name,
+ const struct ip_address *ip,
+ int prefix_len);
+
+#endif /* __NET_UTILS_H__ */
diff --git a/test/packetdrill/netdev.c b/test/packetdrill/netdev.c
new file mode 100644
index 0000000..7734709
--- /dev/null
+++ b/test/packetdrill/netdev.c
@@ -0,0 +1,502 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Implementation for a "virtual network device" module to
+ * inject packets into the kernel and read packets leaving the kernel.
+ */
+
+#include "netdev.h"
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
+#include <net/if_tun.h>
+#endif /* defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) */
+
+#include "assert.h"
+#include "ip.h"
+#include "ipv6.h"
+#include "logging.h"
+#include "net_utils.h"
+#include "packet.h"
+#include "packet_parser.h"
+#include "packet_socket.h"
+#include "tcp.h"
+#include "tun.h"
+#include "wrap.h"
+
+/* Internal private state for the netdev for purely local tests. */
+struct local_netdev {
+ struct netdev netdev; /* "inherit" from netdev */
+
+ char *name; /* malloc-ed copy of interface name (owned) */
+ int tun_fd; /* tun for sending/receiving packets */
+ int control_fd; /* fd for configuration of tun interface */
+ int index; /* interface index from if_nametoindex */
+ struct packet_socket *psock; /* for sniffing packets (owned) */
+};
+
+struct netdev_ops local_netdev_ops;
+
+/* "Downcast" a netdev to local_netdev (netdev is its first member). */
+static inline struct local_netdev *to_local_netdev(struct netdev *netdev)
+{
+ return (struct local_netdev *)netdev;
+}
+
+/* Clean up any old tun device state that might be lying around from
+ * previous tests. On NetBSD the kernel does not automatically tear down
+ * unreferenced tun devices and the routes referencing those devices.
+ */
+static void cleanup_old_device(struct config *config,
+ struct local_netdev *netdev)
+{
+#if defined(__NetBSD__)
+ char *cleanup_command = NULL;
+ int result;
+ /* Best effort: the command's result is only logged via DEBUGP. */
+ asprintf(&cleanup_command,
+ "/sbin/ifconfig %s down delete > /dev/null 2>&1",
+ TUN_DEV);
+ DEBUGP("running: '%s'\n", cleanup_command);
+ result = system(cleanup_command);
+ DEBUGP("result: %d\n", result);
+ free(cleanup_command);
+#endif /* defined(__NetBSD__) */
+}
+
+/* The remote IP must not be one of this machine's own addresses;
+ * otherwise test packets would not pass into our tun device.
+ */
+static void check_remote_address(struct config *config,
+				 struct local_netdev *netdev)
+{
+	if (!is_ip_local(&config->live_remote_ip))
+		return;
+	die("error: live_remote_ip %s is not remote\n",
+	    config->live_remote_ip_string);
+}
+
+/* For anyip tests only: die if config->live_local_ip is configured
+ * on any local device.
+ */
+static void check_local_anyip(struct config *config)
+{
+	if (!is_ip_local(&config->live_local_ip))
+		return;
+	die("error: live_local_ip %s is not remote for anyip\n",
+	    config->live_local_ip_string);
+}
+
+/* Create a tun device for the lifetime of this test. */
+static void create_device(struct config *config, struct local_netdev *netdev)
+{
+	/* Open the tun device, which "clones" it for our purposes. */
+	int tun_fd;
+#ifdef linux
+	int nb = 0;
+
+loop:
+	if (++nb > 10)
+		die("error: could not obtain tun0 after 10 attempts\n");
+#endif
+	tun_fd = open(TUN_PATH, O_RDWR);
+	if (tun_fd < 0)
+		die_perror("open tun device");
+
+	netdev->tun_fd = tun_fd;
+
+#ifdef linux
+	/* Create the device. Since we do not specify a device name, the
+	 * kernel will try to allocate the "next" device of the specified
+	 * type. This device will disappear when we are done.
+	 */
+	struct ifreq ifr;
+	memset(&ifr, 0, sizeof(ifr));
+	ifr.ifr_flags = IFF_TUN | IFF_NO_PI | IFF_VNET_HDR;
+	int status = ioctl(netdev->tun_fd, TUNSETIFF, (void *)&ifr);
+	if (status < 0)
+		die_perror("TUNSETIFF");
+
+	/* Our tests rely on using tun0. If the kernel handed us another
+	 * name, release the device, wait 100 ms for tun0 to become free
+	 * and retry; the retry budget is enforced at the loop label.
+	 */
+	if (strcmp(ifr.ifr_name, "tun0")) {
+		close(tun_fd);
+		usleep(100000);
+		goto loop;
+	}
+	netdev->name = strdup(ifr.ifr_name);
+#endif
+
+#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
+	const int mode = IFF_BROADCAST | IFF_MULTICAST;
+	if (ioctl(netdev->tun_fd, TUNSIFMODE, &mode, sizeof(mode)) < 0)
+		die_perror("TUNSIFMODE");
+
+	netdev->name = strdup(TUN_DEV);
+#endif /* defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) */
+
+#if defined(__FreeBSD__) || defined(__NetBSD__)
+	/* On FreeBSD and NetBSD we need to explicitly ask to be able
+	 * to prepend the address family when injecting tun packets.
+	 * OpenBSD presumes we are doing this, even without the ioctl.
+	 */
+	const int header = 1;
+	if (ioctl(netdev->tun_fd, TUNSIFHEAD, &header, sizeof(header)) < 0)
+		die_perror("TUNSIFHEAD");
+#endif /* defined(__FreeBSD__) || defined(__NetBSD__) */
+
+	DEBUGP("tun name: '%s'\n", netdev->name);
+
+	netdev->index = if_nametoindex(netdev->name);
+	if (netdev->index == 0)
+		die_perror("if_nametoindex");
+
+	DEBUGP("tun index: '%d'\n", netdev->index);
+
+	if (config->speed != TUN_DRIVER_SPEED_CUR) {
+		char *command;
+		asprintf(&command, "ethtool -s %s speed %u autoneg off",
+			 netdev->name, config->speed);
+		if (system(command) < 0)
+			die("Error executing %s\n", command);
+		free(command);
+
+		/* Need to bring interface down and up so the interface speed
+		 * will be copied to the link_speed field. This field is
+		 * used by TCP's cwnd bound. */
+		asprintf(&command, "ifconfig %s down; sleep 1; ifconfig %s up; "
+			 "sleep 1", netdev->name, netdev->name);
+		if (system(command) < 0)
+			die("Error executing %s\n", command);
+		free(command);
+	}
+
+	if (config->mtu != TUN_DRIVER_DEFAULT_MTU) {
+		char *command;
+		asprintf(&command, "ifconfig %s mtu %d",
+			 netdev->name, config->mtu);
+		if (system(command) < 0)
+			die("Error executing %s\n", command);
+		free(command);
+	}
+
+	/* Open a socket we can use to configure the tun interface. */
+	netdev->control_fd = wrap_socket(config->ip_version, SOCK_DGRAM);
+}
+
+/* Set typical-ethernet offload flags; the ioctl arg must be unsigned long. */
+static void set_device_offload_flags(struct local_netdev *netdev)
+{
+#ifdef linux
+	const unsigned long offload =
+		TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | TUN_F_TSO_ECN;
+	if (ioctl(netdev->tun_fd, TUNSETOFFLOAD, offload) != 0)
+		die_perror("TUNSETOFFLOAD");
+#endif
+}
+
+/* Bring the device up; snprintf() keeps ifr_name NUL-terminated. */
+static void bring_up_device(struct local_netdev *netdev)
+{
+	struct ifreq ifr;
+	memset(&ifr, 0, sizeof(ifr));
+	snprintf(ifr.ifr_name, IFNAMSIZ, "%s", netdev->name);
+	if (ioctl(netdev->control_fd, SIOCGIFFLAGS, &ifr) < 0)
+		die_perror("SIOCGIFFLAGS");
+	ifr.ifr_flags |= IFF_UP | IFF_RUNNING;
+	if (ioctl(netdev->control_fd, SIOCSIFFLAGS, &ifr) < 0)
+		die_perror("SIOCSIFFLAGS");
+}
+
+/* Route traffic for the remote prefix through this device, or die if
+ * the route command does not exit normally with status 0. With anyip
+ * the Linux command omits the "via <gateway>" clause.
+ */
+static void route_traffic_to_device(struct config *config,
+				    struct local_netdev *netdev)
+{
+	char *route_command = NULL;
+#ifdef linux
+	asprintf(&route_command,
+		 "ip -%d route del %s > /dev/null 2>&1 ; "
+		 "ip -%d route add %s dev %s %s%s > /dev/null 2>&1",
+		 (config->wire_protocol == AF_INET) ? 4 : 6,
+		 config->live_remote_prefix_string,
+		 (config->wire_protocol == AF_INET) ? 4 : 6,
+		 config->live_remote_prefix_string,
+		 netdev->name,
+		 config->is_anyip ? "" : "via ",
+		 config->is_anyip ? "" :
+		 config->live_gateway_ip_string);
+#endif
+#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
+	if (config->wire_protocol == AF_INET) {
+		asprintf(&route_command,
+			 "route delete %s > /dev/null 2>&1 ; "
+			 "route add %s %s > /dev/null",
+			 config->live_remote_prefix_string,
+			 config->live_remote_prefix_string,
+			 config->live_gateway_ip_string);
+	} else if (config->wire_protocol == AF_INET6) {
+		asprintf(&route_command,
+			 "route delete -inet6 %s > /dev/null 2>&1 ; "
+#if defined(__FreeBSD__)
+			 "route add -inet6 %s -interface tun0 %s > /dev/null",
+#elif defined(__OpenBSD__) || defined(__NetBSD__)
+			 "route add -inet6 %s %s > /dev/null",
+#endif
+			 config->live_remote_prefix_string,
+			 config->live_remote_prefix_string,
+			 config->live_gateway_ip_string);
+	} else {
+		assert(!"bad wire protocol");
+	}
+#endif /* defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) */
+	int result = system(route_command);
+	if ((result == -1) || !WIFEXITED(result) || (WEXITSTATUS(result) != 0)) {
+		die("error executing route command '%s'\n", route_command);
+	}
+	free(route_command);
+}
+
+/* Allocate and set up a netdev for purely local tests.
+ *
+ * In order: removes any old device, checks the remote address, creates
+ * the tun device, sets its offload flags and brings it up; then either
+ * checks the local anyip setup or assigns the live local address to the
+ * device; finally installs the route and opens a packet socket on the
+ * device. Dies if the netdev itself cannot be allocated.
+ *
+ * Returns the new device as a generic struct netdev pointer.
+ */
+struct netdev *local_netdev_new(struct config *config)
+{
+	struct local_netdev *netdev = calloc(1, sizeof(struct local_netdev));
+
+	if (netdev == NULL)
+		die_perror("calloc");
+
+	netdev->netdev.ops = &local_netdev_ops;
+
+	cleanup_old_device(config, netdev);
+
+	check_remote_address(config, netdev);
+	create_device(config, netdev);
+	set_device_offload_flags(netdev);
+	bring_up_device(netdev);
+
+	if (config->is_anyip)
+		check_local_anyip(config);
+	else
+		net_setup_dev_address(netdev->name,
+				      &config->live_local_ip,
+				      config->live_prefix_len);
+
+	route_traffic_to_device(config, netdev);
+	netdev->psock = packet_socket_new(netdev->name);
+
+	return (struct netdev *)netdev;
+}
+
+/* Release everything a local netdev owns: the packet socket, the tun
+ * and control descriptors, the name string, and the struct itself.
+ */
+static void local_netdev_free(struct netdev *a_netdev)
+{
+	struct local_netdev *self = to_local_netdev(a_netdev);
+
+	if (self->psock != NULL)
+		packet_socket_free(self->psock);
+	if (!(self->tun_fd < 0))
+		close(self->tun_fd);
+	if (!(self->control_fd < 0))
+		close(self->control_fd);
+	free(self->name);	/* free(NULL) is a no-op */
+
+	/* Scrub the struct before releasing it so that stale users are
+	 * easier to catch.
+	 */
+	memset(self, 0, sizeof(*self));
+	free(self);
+}
+
+#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
+/* Write one packet to a BSD tun device, prefixed with its address
+ * family as a 4-byte integer in network byte order.
+ *
+ * According to `man 4 tun` on OpenBSD: "Each packet read or written
+ * is prefixed with a tunnel header consisting of a 4-byte network
+ * byte order integer containing the address family in the case of
+ * layer 3 tunneling." Similarly, on FreeBSD and NetBSD one must use
+ * ioctl(TUNSIFHEAD) and prepend an address family, in order to be
+ * able to send IPv6 packets (otherwise FreeBSD and NetBSD assume the
+ * packets are IPv4).
+ */
+static void bsd_tun_write(struct local_netdev *netdev,
+			  struct packet *packet)
+{
+	int af_prefix = htonl(packet_address_family(packet));
+	struct iovec iov[2];
+
+	iov[0].iov_base = &af_prefix;
+	iov[0].iov_len = sizeof(af_prefix);
+	iov[1].iov_base = packet_start(packet);
+	iov[1].iov_len = packet->ip_bytes;
+
+	if (writev(netdev->tun_fd, iov, ARRAY_SIZE(iov)) < 0)
+		die_perror("BSD tun write()");
+}
+#endif /* defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) */
+
+#ifdef linux
+#include <linux/virtio_net.h>
+
+/* Write one packet to the Linux tun device, preceded by a
+ * virtio_net_hdr. For a TCP packet with a non-zero mss the header
+ * carries the TCPv4 or TCPv6 GSO type and the mss as the GSO size;
+ * otherwise the header is left all zeroes. Dies if writev() fails.
+ */
+static void linux_tun_write(struct local_netdev *netdev,
+			    struct packet *packet)
+{
+	struct virtio_net_hdr vnet_hdr = { 0 };
+	struct iovec iov[2];
+
+	if (packet->tcp && packet->mss) {
+		vnet_hdr.gso_type = packet->ipv4 ?
+			VIRTIO_NET_HDR_GSO_TCPV4 : VIRTIO_NET_HDR_GSO_TCPV6;
+		vnet_hdr.gso_size = packet->mss;
+	}
+
+	iov[0].iov_base = &vnet_hdr;
+	iov[0].iov_len = sizeof(vnet_hdr);
+	iov[1].iov_base = packet_start(packet);
+	iov[1].iov_len = packet->ip_bytes;
+
+	if (writev(netdev->tun_fd, iov, ARRAY_SIZE(iov)) < 0)
+		die_perror("Linux tun write()");
+}
+#endif /* linux */
+
+/* Inject a packet through the tun device, using the BSD or the Linux
+ * write path depending on the build platform. Always returns STATUS_OK;
+ * the platform write helpers die on failure.
+ */
+static int local_netdev_send(struct netdev *a_netdev,
+			     struct packet *packet)
+{
+	struct local_netdev *self = to_local_netdev(a_netdev);
+
+	/* Sanity checks: a non-empty IPv4 or IPv6 packet ... */
+	assert(packet->ip_bytes > 0);
+	assert(packet->ipv4 || packet->ipv6);
+	/* ... carrying TCP, UDP, or ICMP. */
+	assert(packet->tcp || packet->udp || packet->icmpv4 || packet->icmpv6);
+
+	DEBUGP("local_netdev_send\n");
+
+#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
+	bsd_tun_write(self, packet);
+#endif /* defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) */
+
+#ifdef linux
+	linux_tun_write(self, packet);
+#endif /* linux */
+
+	return STATUS_OK;
+}
+
+/* Read the given number of packets out of the tun device. We read
+ * these packets so that the kernel can exercise its normal code paths
+ * for packet transmit completion, since this code path may feed back
+ * to TCP behavior; e.g., see the Linux patch "tcp: avoid retransmits
+ * of TCP packets hanging in host queues". We don't actually need the
+ * packet contents, but on Linux we need to read at least 1 byte of
+ * packet data to consume the packet.
+ * After we added IFF_VNET_HDR attribute to the linux tun device,
+ * we expect to receive a virtio_net_hdr at the beginning.
+ *
+ * A read interrupted by a signal (EINTR) is retried and does not count
+ * toward num_packets; any other read error is fatal.
+ */
+static void local_netdev_read_queue(struct local_netdev *netdev,
+				    int num_packets)
+{
+#ifdef linux
+	char buf[sizeof(struct virtio_net_hdr) + 1];
+#else
+	char buf[1];
+#endif
+	int i = 0;
+
+	while (i < num_packets) {
+		const int in_bytes = read(netdev->tun_fd, buf, sizeof(buf));
+
+		assert(in_bytes <= (int)sizeof(buf));
+
+		if (in_bytes < 0) {
+			/* An interrupted read consumed nothing, so try
+			 * again instead of skipping a queued packet.
+			 */
+			if (errno == EINTR)
+				continue;
+			die_perror("tun read()");
+		}
+		++i;
+	}
+}
+
+/* Sniff the next outbound layer-3 packet from the packet socket, then
+ * read as many packets out of the tun queue as the sniffing loop saw.
+ * Returns the status reported by netdev_receive_loop().
+ */
+static int local_netdev_receive(struct netdev *a_netdev,
+				struct packet **packet, char **error)
+{
+	struct local_netdev *self = to_local_netdev(a_netdev);
+	int sniffed = 0;
+	int rc = STATUS_ERR;
+
+	DEBUGP("local_netdev_receive\n");
+
+	rc = netdev_receive_loop(self->psock, PACKET_LAYER_3_IP,
+				 DIRECTION_OUTBOUND, packet, &sniffed,
+				 error);
+	/* Drain one tun packet for every packet that was sniffed. */
+	local_netdev_read_queue(self, sniffed);
+	return rc;
+}
+
+/* Keep sniffing packets from psock until one parses successfully or is
+ * definitively bad.
+ *
+ * psock:       packet socket to sniff from
+ * layer:       layer at which parse_packet() starts parsing
+ * direction:   direction of packets to accept from the socket
+ * packet:      out; must point to NULL on entry. On STATUS_OK it points
+ *              to a newly allocated packet the caller must release with
+ *              packet_free(); on STATUS_ERR it is NULL.
+ * num_packets: out; number of packets successfully read from the
+ *              socket, including ones that were skipped or rejected.
+ * error:       passed through to parse_packet().
+ *
+ * Returns STATUS_OK for a PACKET_OK parse and STATUS_ERR for PACKET_BAD;
+ * any other parse result is logged and the loop keeps sniffing.
+ */
+int netdev_receive_loop(struct packet_socket *psock,
+			enum packet_layer_t layer,
+			enum direction_t direction,
+			struct packet **packet,
+			int *num_packets,
+			char **error)
+{
+	assert(*packet == NULL);	/* should be no packet yet */
+
+	*num_packets = 0;
+	while (1) {
+		int in_bytes = 0;
+		enum packet_parse_result_t result;
+
+		*packet = packet_new(PACKET_READ_BYTES);
+
+		/* Sniff the next outbound packet from the kernel under test.
+		 * On failure release the unused buffer before retrying, so
+		 * repeated receive failures do not leak one packet each.
+		 */
+		if (packet_socket_receive(psock, direction, *packet,
+					  &in_bytes)) {
+			packet_free(*packet);
+			*packet = NULL;
+			continue;
+		}
+
+		++*num_packets;
+		result = parse_packet(*packet, in_bytes, layer, error);
+
+		if (result == PACKET_OK)
+			return STATUS_OK;
+
+		packet_free(*packet);
+		*packet = NULL;
+
+		if (result == PACKET_BAD)
+			return STATUS_ERR;
+
+		DEBUGP("parse_result:%d; error parsing packet: %s\n",
+		       result, *error);
+	}
+
+	assert(!"should not be reached");
+	return STATUS_ERR;	/* not reached */
+}
+
+/* Operations table for the local (tun-based) netdev; installed into
+ * netdev.ops by local_netdev_new().
+ */
+struct netdev_ops local_netdev_ops = {
+ .free = local_netdev_free,
+ .send = local_netdev_send,
+ .receive = local_netdev_receive,
+};
diff --git a/test/packetdrill/netdev.h b/test/packetdrill/netdev.h
new file mode 100644
index 0000000..c69c138
--- /dev/null
+++ b/test/packetdrill/netdev.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Interface for a "virtual network device" module to inject packets
+ * into the kernel and sniff packets leaving the kernel.
+ */
+
+#ifndef __PACKET_NETDEV_H__
+#define __PACKET_NETDEV_H__
+
+#include "types.h"
+
+#include "config.h"
+#include "packet.h"
+#include "packet_parser.h"
+#include "packet_socket.h"
+
+struct netdev_ops;
+
+/* A C-style poor-man's "pure virtual" netdev. Callers operate on it
+ * through the netdev_free(), netdev_send() and netdev_receive() wrappers
+ * below, which dispatch through ops.
+ */
+struct netdev {
+ struct netdev_ops *ops; /* C-style vtable pointer */
+};
+
+/* Table of operations that a concrete netdev implementation provides. */
+struct netdev_ops {
+ /* Tear down a netdev and free up the resources it has allocated. */
+ void (*free)(struct netdev *netdev);
+
+ /* Inject a raw TCP/IP packet into the kernel. Returns a status code. */
+ int (*send)(struct netdev *netdev,
+ struct packet *packet);
+
+ /* Sniff the next TCP/IP packet leaving the kernel and return a
+ * pointer to the newly-allocated packet. Caller must free the packet
+ * with packet_free(). Returns a status code.
+ */
+ int (*receive)(struct netdev *netdev,
+ struct packet **packet, char **error);
+};
+
+
+/* Destroy a netdev by dispatching to its implementation's free hook,
+ * which releases the resources the netdev has allocated.
+ */
+static inline void netdev_free(struct netdev *netdev)
+{
+	struct netdev_ops *vtable = netdev->ops;
+
+	vtable->free(netdev);
+}
+
+/* Inject a raw TCP/IP packet into the kernel by dispatching to the
+ * implementation's send hook; returns that hook's status.
+ */
+static inline int netdev_send(struct netdev *netdev,
+			      struct packet *packet)
+{
+	struct netdev_ops *vtable = netdev->ops;
+
+	return vtable->send(netdev, packet);
+}
+
+/* Sniff the next TCP/IP packet leaving the kernel by dispatching to the
+ * implementation's receive hook. On success *packet points to a
+ * newly-allocated packet, which the caller must free with packet_free().
+ * Returns the hook's status.
+ */
+static inline int netdev_receive(struct netdev *netdev,
+				 struct packet **packet,
+				 char **error)
+{
+	struct netdev_ops *vtable = netdev->ops;
+
+	return vtable->receive(netdev, packet, error);
+}
+
+
+/* Keep sniffing packets leaving the kernel until we see one we know
+ * about and can parse. Return a pointer to the newly-allocated
+ * packet. Caller must free the packet with packet_free().
+ * *packet must be NULL on entry. *num_packets is set to the number of
+ * packets read from the socket while looking for a parseable one.
+ */
+extern int netdev_receive_loop(struct packet_socket *psock,
+ enum packet_layer_t layer,
+ enum direction_t direction,
+ struct packet **packet,
+ int *num_packets,
+ char **error);
+
+/* Allocate and return a new netdev for purely local tests. Release it
+ * with netdev_free().
+ */
+extern struct netdev *local_netdev_new(struct config *config);
+
+#endif /* __PACKET_NETDEV_H__ */
diff --git a/test/packetdrill/open_memstream.c b/test/packetdrill/open_memstream.c
new file mode 100644
index 0000000..9114091
--- /dev/null
+++ b/test/packetdrill/open_memstream.c
@@ -0,0 +1,142 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * FreeBSD does not have open_memstream(), so we roll our own minimalist
+ * implementation here.
+ */
+
+#include "types.h"
+
+#ifndef HAVE_OPEN_MEMSTREAM
+
+#include "assert.h"
+#include "open_memstream.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+
+/* Our internal state for the memstream. One instance is allocated per
+ * open_memstream() call and freed when the stream is closed. buf and
+ * sizeloc point at the caller's variables, which are updated on every
+ * write.
+ */
+struct mem_stream {
+ char **buf; /* pointer to the output buffer pointer */
+ size_t *sizeloc; /* pointer to the output final buffer size */
+
+ size_t buf_size; /* currently allocated size of buffer */
+ size_t offset; /* current write offset */
+};
+
+#define INITIAL_BUF_SIZE 1024
+
+/* Make sure the buffer can hold "write_bytes" more bytes at the current
+ * offset plus the trailing '\0' we keep just past the data. The buffer
+ * starts at INITIAL_BUF_SIZE and doubles after that, or jumps straight
+ * to the required size when doubling is not enough.
+ * Returns 0 on success, or -1 if realloc() fails (the old buffer is then
+ * left untouched).
+ */
+static int mem_stream_grow(struct mem_stream *stream, int write_bytes)
+{
+	const size_t required = stream->offset + write_bytes + 1;
+	size_t capacity = 0;
+	char *grown = NULL;
+
+	if (stream->buf_size >= required)
+		return 0;	/* already large enough */
+
+	capacity = (stream->buf_size == 0) ?
+		INITIAL_BUF_SIZE : 2 * stream->buf_size;
+	if (capacity < required)
+		capacity = required;
+
+	grown = (char *) realloc(*stream->buf, capacity);
+	if (grown == NULL)
+		return -1;
+
+	stream->buf_size = capacity;
+	*stream->buf = grown;
+	return 0;
+}
+
+/* Write callback: append the given data to our memstream, expanding
+ * the buffer first if we need to. Per the specification in the Linux
+ * man pages, "A null byte is maintained at the end of the buffer. This
+ * byte is not included in the size value stored at sizeloc."
+ * Returns the number of bytes written, or -1 if the buffer cannot grow.
+ */
+static int write_memstream(void *cookie, const char *buf, int write_bytes)
+{
+	struct mem_stream *ms = (struct mem_stream *) cookie;
+	char *dest = NULL;
+
+	if (mem_stream_grow(ms, write_bytes) < 0)
+		return -1;
+
+	dest = *ms->buf + ms->offset;
+	memcpy(dest, buf, write_bytes);
+	dest[write_bytes] = '\0';	/* terminator just past the data */
+
+	ms->offset += write_bytes;
+	*ms->sizeloc = ms->offset;	/* size does not include '\0' */
+
+	return write_bytes;
+}
+
+/* Close callback: release our bookkeeping state. The output buffer is
+ * not freed here. Always returns 0.
+ */
+static int close_memstream(void *cookie)
+{
+	struct mem_stream *ms = (struct mem_stream *) cookie;
+
+	free(ms);
+	return 0;
+}
+
+/* Create a write-only memstream backed by a growable heap buffer.
+ * On success *ptr is set to NULL and *sizeloc to 0; both are updated
+ * by every write to the stream.
+ * Returns the new stream, or NULL on failure: errno is set to EINVAL
+ * when either argument is NULL, and *ptr and *sizeloc are left
+ * untouched if allocation or funopen() fails.
+ */
+FILE *open_memstream(char **ptr, size_t *sizeloc)
+{
+	struct mem_stream *ms = NULL;
+	FILE *fp = NULL;
+
+	if (ptr == NULL || sizeloc == NULL) {
+		errno = EINVAL;
+		return NULL;
+	}
+
+	ms = (struct mem_stream *) calloc(1, sizeof(struct mem_stream));
+	if (ms == NULL)
+		return NULL;
+	ms->buf = ptr;
+	ms->sizeloc = sizeloc;
+
+	/* Only the write and close callbacks are supplied. */
+	fp = funopen(ms, NULL, write_memstream, NULL, close_memstream);
+	if (fp == NULL) {
+		free(ms);
+		return NULL;
+	}
+
+	*ptr = NULL;
+	*sizeloc = 0;
+	return fp;
+}
+
+#endif /* HAVE_OPEN_MEMSTREAM */
diff --git a/test/packetdrill/open_memstream.h b/test/packetdrill/open_memstream.h
new file mode 100644
index 0000000..bf23705
--- /dev/null
+++ b/test/packetdrill/open_memstream.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * FreeBSD does not have open_memstream(), so we roll our own minimalist
+ * implementation here.
+ */
+
+#ifndef __OPEN_MEMSTREAM_H__
+#define __OPEN_MEMSTREAM_H__
+
+#ifndef HAVE_OPEN_MEMSTREAM
+
+#include <stdio.h>
+
+/* Open a write-only stream whose output accumulates in a heap buffer;
+ * *ptr and *sizeloc are updated to the buffer and its size as data is
+ * written. Returns NULL on failure.
+ */
+FILE *open_memstream(char **ptr, size_t *sizeloc);
+
+#endif /*HAVE_OPEN_MEMSTREAM*/
+
+#endif /* __OPEN_MEMSTREAM_H__ */
diff --git a/test/packetdrill/packet.c b/test/packetdrill/packet.c
new file mode 100644
index 0000000..d2d792a
--- /dev/null
+++ b/test/packetdrill/packet.c
@@ -0,0 +1,327 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Implementation for a representation of TCP/IP packets.
+ * Packets are represented in their wire format.
+ */
+
+#include "packet.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include "assert.h"
+#include "ethernet.h"
+#include "gre_packet.h"
+#include "ip_packet.h"
+#include "logging.h"
+#include "mpls_packet.h"
+
+
+/* Info for all types of header we support, indexed by enum header_t
+ * (see header_type_info()). Each row starts with the header's name and
+ * ends with an optional "finish" callback run by
+ * packet_finish_encapsulation_headers(); the two middle columns look
+ * like the IP protocol number and ethertype that identify the header
+ * (0 when not applicable) -- TODO confirm against struct
+ * header_type_info.
+ */
+struct header_type_info header_types[HEADER_NUM_TYPES] = {
+ { "NONE", 0, 0, NULL },
+ { "IPV4", IPPROTO_IPIP, ETHERTYPE_IP, ipv4_header_finish },
+ { "IPV6", IPPROTO_IPV6, ETHERTYPE_IPV6, ipv6_header_finish },
+ { "GRE", IPPROTO_GRE, 0, gre_header_finish },
+ { "MPLS", 0, ETHERTYPE_MPLS_UC, mpls_header_finish },
+ { "TCP", IPPROTO_TCP, 0, NULL },
+ { "UDP", IPPROTO_UDP, 0, NULL },
+ { "ICMPV4", IPPROTO_ICMP, 0, NULL },
+ { "ICMPV6", IPPROTO_ICMPV6, 0, NULL },
+};
+
+/* Allocate a zeroed packet with an uninitialized data buffer of
+ * buffer_bytes bytes. Returns the new packet, which the caller releases
+ * with packet_free(), or NULL if memory cannot be allocated.
+ */
+struct packet *packet_new(u32 buffer_bytes)
+{
+	struct packet *packet = calloc(1, sizeof(*packet));
+
+	if (packet == NULL)
+		return NULL;
+
+	packet->buffer = malloc(buffer_bytes);
+	/* malloc(0) may legitimately return NULL, so only treat NULL as
+	 * a failure when a non-empty buffer was requested.
+	 */
+	if (packet->buffer == NULL && buffer_bytes != 0) {
+		free(packet);
+		return NULL;
+	}
+	packet->buffer_bytes = buffer_bytes;
+	return packet;
+}
+
+/* Release a packet and its data buffer. The packet must not be NULL. */
+void packet_free(struct packet *packet)
+{
+	free(packet->buffer);
+	/* Scrub the struct first so stale users are easier to catch. */
+	memset(packet, 0, sizeof(struct packet));
+	free(packet);
+}
+
+/* Allocate an empty list node: no packet and no successor. Returns the
+ * node, or NULL if memory cannot be allocated.
+ */
+struct packet_list *packet_list_new(void)
+{
+	struct packet_list *list = calloc(1, sizeof(*list));
+
+	if (list == NULL)
+		return NULL;
+	list->packet = NULL;
+	list->next = NULL;
+	return list;
+}
+
+/* Free every node of the list, along with any packet a node holds.
+ * A NULL list is accepted and does nothing.
+ */
+void packet_list_free(struct packet_list *list)
+{
+	struct packet_list *node = list;
+
+	while (node != NULL) {
+		struct packet_list *following = NULL;
+
+		if (node->packet != NULL)
+			packet_free(node->packet);
+		following = node->next;
+		free(node);
+		node = following;
+	}
+}
+
+/* Return the number of headers in use: the index of the first
+ * HEADER_NONE slot, or the size of the headers array if every slot is
+ * occupied.
+ */
+int packet_header_count(const struct packet *packet)
+{
+	int count = 0;
+
+	while (count < ARRAY_SIZE(packet->headers) &&
+	       packet->headers[count].type != HEADER_NONE)
+		++count;
+	return count;
+}
+
+/* Copy header metadata from old_packet into the same slots of
+ * new_packet, stopping at the first unused (HEADER_NONE) slot. Each
+ * header pointer is rebased: it keeps its offset from the start of the
+ * old buffer, applied bytes_headroom bytes into the new buffer.
+ */
+static void packet_copy_headers(struct packet *new_packet,
+				struct packet *old_packet,
+				int bytes_headroom)
+{
+	u8 *dest_base = new_packet->buffer + bytes_headroom;
+	int idx = 0;
+
+	while (idx < ARRAY_SIZE(old_packet->headers)) {
+		struct header *src = &old_packet->headers[idx];
+		struct header *dst = &new_packet->headers[idx];
+		int offset = 0;
+
+		if (src->type == HEADER_NONE)
+			break;
+
+		offset = src->h.ptr - old_packet->buffer;
+		dst->h.ptr = dest_base + offset;
+		dst->header_bytes = src->header_bytes;
+		dst->total_bytes = src->total_bytes;
+		dst->type = src->type;
+		++idx;
+	}
+}
+
+/* Claim the next free header slot and header_bytes bytes at the current
+ * end of the packet data for a header of the given type.
+ *
+ * Returns the new header with its pointer, type and header_bytes filled
+ * in and total_bytes zeroed, or NULL if all PACKET_MAX_HEADERS slots
+ * are taken or the header would not fit in the packet buffer.
+ * packet->ip_bytes grows by header_bytes on success.
+ */
+struct header *packet_append_header(struct packet *packet,
+				    enum header_t header_type,
+				    int header_bytes)
+{
+	struct header *header = NULL;
+	int num_headers = packet_header_count(packet);
+	int packet_bytes;
+
+	assert(num_headers <= PACKET_MAX_HEADERS);
+	if (num_headers == PACKET_MAX_HEADERS)
+		return NULL;
+
+	header = &packet->headers[num_headers];
+
+	/* The header lands after both the layer 2 header and the IP
+	 * bytes already present, so all three must fit in the buffer.
+	 */
+	if (packet->l2_header_bytes + packet->ip_bytes + header_bytes >
+	    packet->buffer_bytes)
+		return NULL;
+	packet_bytes = packet->l2_header_bytes + packet->ip_bytes;
+	header->h.ptr = packet->buffer + packet_bytes;
+	packet->ip_bytes += header_bytes;
+
+	header->type = header_type;
+	header->header_bytes = header_bytes;
+	header->total_bytes = 0;
+	return header;
+}
+
+/* Translate a pointer into the buffer at old_base to the same offset
+ * from new_base. A NULL pointer stays NULL.
+ */
+static void *offset_ptr(u8 *old_base, u8* new_base, void *old_ptr)
+{
+	u8 *old = (u8*)old_ptr;
+
+	if (old == NULL)
+		return NULL;
+	return new_base + (old - old_base);
+}
+
+/* Copy packet metadata from old_packet into packet, whose data starts
+ * bytes_headroom bytes into its buffer. Scalar fields are copied as-is,
+ * except that ip_bytes is increased by extra_payload. Header metadata
+ * and the protocol/TCP-timestamp pointers are rebased so they point at
+ * the same offsets in the new buffer; NULL pointers stay NULL.
+ */
+static void packet_duplicate_info(struct packet *packet,
+ struct packet *old_packet,
+ int bytes_headroom,
+ int extra_payload)
+{
+ u8 *old_base = old_packet->buffer;
+ u8 *new_base = packet->buffer + bytes_headroom;
+
+ packet->ip_bytes = old_packet->ip_bytes + extra_payload;
+ packet->direction = old_packet->direction;
+ packet->time_usecs = old_packet->time_usecs;
+ packet->flags = old_packet->flags;
+ packet->tos_chk = old_packet->tos_chk;
+
+ packet_copy_headers(packet, old_packet, bytes_headroom);
+
+ /* Set up layer 3 header pointer. */
+ packet->ipv4 = offset_ptr(old_base, new_base, old_packet->ipv4);
+ packet->ipv6 = offset_ptr(old_base, new_base, old_packet->ipv6);
+ packet->tcp = offset_ptr(old_base, new_base, old_packet->tcp);
+ packet->udp = offset_ptr(old_base, new_base, old_packet->udp);
+ packet->icmpv4 = offset_ptr(old_base, new_base, old_packet->icmpv4);
+ packet->icmpv6 = offset_ptr(old_base, new_base, old_packet->icmpv6);
+
+ packet->tcp_ts_val = offset_ptr(old_base, new_base,
+ old_packet->tcp_ts_val);
+ packet->tcp_ts_ecr = offset_ptr(old_base, new_base,
+ old_packet->tcp_ts_ecr);
+ /* Copied by value, not rebased like the pointers above. */
+ packet->echoed_header = old_packet->echoed_header;
+}
+
+/* Duplicate old_packet into a freshly allocated packet whose buffer
+ * begins with bytes_headroom unused bytes, followed by a copy of the
+ * old packet's bytes (link layer header and IP datagram). The empty
+ * headroom can later be filled with outer packet headers.
+ * A slow but simple model.
+ */
+static struct packet *packet_copy_with_headroom(struct packet *old_packet,
+						int bytes_headroom)
+{
+	const int used = packet_end(old_packet) - old_packet->buffer;
+	struct packet *copy = NULL;
+
+	/* Sanity-check the amount of data before allocating. */
+	assert(used >= 0);
+	assert(used <= 128*1024);
+
+	copy = packet_new(bytes_headroom + used);
+	memcpy(copy->buffer + bytes_headroom, old_packet->buffer, used);
+
+	packet_duplicate_info(copy, old_packet, bytes_headroom, 0);
+
+	return copy;
+}
+
+/* Return a newly allocated copy of old_packet with no headroom. */
+struct packet *packet_copy(struct packet *old_packet)
+{
+	struct packet *copy = packet_copy_with_headroom(old_packet, 0);
+
+	return copy;
+}
+
+/* Finalize all the headers once we know what's inside inner layers.
+ * Walks the header slots from last to first and, for each used slot
+ * whose type has a "finish" callback, calls it with the header and
+ * "next", the slot visited just before it (the one at the next higher
+ * index).
+ */
+static void packet_finish_encapsulation_headers(struct packet *packet)
+{
+ int i;
+ struct header *header = NULL, *next = NULL;
+
+ /* Proceed from inner to outer. */
+ /* NOTE(review): the loop increment "next = header" also runs after
+ * the "continue" below, so unused HEADER_NONE slots update "next"
+ * too. For the innermost used header, "next" is therefore NULL only
+ * when that header sits in the last slot; otherwise it points at an
+ * unused slot. Confirm the finish callbacks expect this.
+ */
+ for (i = ARRAY_SIZE(packet->headers) - 1; i >= 0; --i, next = header) {
+ struct header_type_info *type_info = NULL;
+
+ header = &packet->headers[i];
+ if (header->type == HEADER_NONE)
+ continue;
+
+ type_info = header_type_info(header->type);
+ if (type_info->finish != NULL)
+ type_info->finish(packet, header, next);
+ }
+}
+
+/* Build and return a new packet made of the outer packet's bytes and
+ * headers followed by the inner packet's. Neither input is modified.
+ * The combined header count must not exceed PACKET_MAX_HEADERS
+ * (asserted).
+ */
+struct packet *packet_encapsulate(struct packet *outer, struct packet *inner)
+{
+ struct packet *packet = NULL;
+ const int outer_headers = packet_header_count(outer);
+ const int inner_headers = packet_header_count(inner);
+
+ assert(outer_headers + inner_headers <= PACKET_MAX_HEADERS);
+
+ /* Copy the inner packet bits and header metadata. */
+ packet = packet_copy_with_headroom(inner, outer->ip_bytes);
+
+ /* Copy over the bits in the outer headers. */
+ memcpy(packet->buffer, outer->buffer, outer->ip_bytes);
+
+ /* Move the inner header metadata to make room for the outer. */
+ memmove(packet->headers + outer_headers, packet->headers + 0,
+ inner_headers * sizeof(struct header));
+
+ /* Copy over the metadata about the outer headers. */
+ packet_copy_headers(packet, outer, 0);
+
+ assert(packet_header_count(packet) == outer_headers + inner_headers);
+
+ packet_finish_encapsulation_headers(packet);
+
+ /* The copy above set ip_bytes to the inner packet's size only. */
+ packet->ip_bytes = outer->ip_bytes + inner->ip_bytes;
+
+ return packet;
+}
+
+/* Look up the header_types entry for a header type. The type must be a
+ * real header type: greater than HEADER_NONE and below HEADER_NUM_TYPES
+ * (asserted).
+ */
+struct header_type_info *header_type_info(enum header_t header_type)
+{
+	struct header_type_info *info = NULL;
+
+	assert(header_type > HEADER_NONE);
+	assert(header_type < HEADER_NUM_TYPES);
+	assert(ARRAY_SIZE(header_types) == HEADER_NUM_TYPES);
+
+	info = header_types + header_type;
+	return info;
+}
+
+/* Aggregate a list of input packets into a single output packet.
+ * The headers and metadata come from the tail packet; the payloads of
+ * the list's packets are concatenated behind those headers, and the TCP
+ * seq and cwr fields are taken from the head packet. Returns a newly
+ * allocated packet.
+ * NOTE(review): payload_size is presumably the sum of the payload
+ * lengths of the packets copied below; the new buffer is sized from it
+ * and nothing here checks that assumption.
+ */
+struct packet *aggregate_packets(const struct packet_list *head,
+ const struct packet_list *tail,
+ int payload_size)
+{
+ int i;
+ /* Copy the headers from the last source packet. */
+ struct packet *first_packet = head->packet;
+ struct packet *last_packet = tail->packet;
+ struct packet *old_packet = last_packet;
+ /* Allocate a new packet that can accommodate the combined payload */
+ int extra_payload = payload_size - packet_payload_len(old_packet);
+ int headers_len = packet_payload(old_packet) - old_packet->buffer;
+ int old_packet_size = packet_end(old_packet) - old_packet->buffer;
+ struct packet *packet = packet_new(old_packet_size + extra_payload);
+
+ u8 *old_base = old_packet->buffer;
+ u8 *new_base = packet->buffer;
+ u8 *iter_base = new_base + headers_len;
+
+ DEBUGP("aggregate_packets with combined payload size of %d bytes\n",
+ payload_size);
+ memcpy(new_base, old_base, headers_len);
+
+ /* Copy the payload from all the source packets. */
+ /* NOTE(review): this walks until the end of the list (next == NULL)
+ * rather than stopping at "tail" -- confirm tail is always the last
+ * node.
+ */
+ do {
+ memcpy(iter_base, packet_payload(head->packet),
+ packet_payload_len(head->packet));
+ iter_base += packet_payload_len(head->packet);
+ head = head->next;
+ } while (head != NULL);
+
+ packet_duplicate_info(packet, old_packet, 0, extra_payload);
+
+ /* Adjust header bytes information to account for the larger payload. */
+ for (i = 0; i < ARRAY_SIZE(packet->headers); ++i) {
+ struct header *new_header = &packet->headers[i];
+
+ if (new_header->type == HEADER_NONE)
+ break;
+ new_header->total_bytes += extra_payload;
+ DEBUGP("%s header starts at %p\n",
+ header_type_info(new_header->type)->name,
+ new_header->h.ptr);
+ /* For TCP header, we must copy the seq number and the cwr flag
+ * from the first packet.
+ */
+ if (new_header->type == HEADER_TCP) {
+ assert(packet->tcp != NULL);
+ assert(first_packet->tcp != NULL);
+ packet->tcp->seq = first_packet->tcp->seq;
+ packet->tcp->cwr = first_packet->tcp->cwr;
+ }
+ }
+ /* Let each header type's finish callback update its header now that
+ * the sizes have changed.
+ */
+ packet_finish_encapsulation_headers(packet);
+
+ return packet;
+}
diff --git a/test/packetdrill/packet.h b/test/packetdrill/packet.h
new file mode 100644
index 0000000..aa41104
--- /dev/null
+++ b/test/packetdrill/packet.h
@@ -0,0 +1,425 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Interface and type declarations for a representation of TCP/IP packets.
+ * Packets are represented in their wire format.
+ */
+
+#ifndef __PACKET_H__
+#define __PACKET_H__
+
+#include "types.h"
+
+#include <sys/time.h>
+#include "assert.h"
+#include "gre.h"
+#include "header.h"
+#include "icmp.h"
+#include "icmpv6.h"
+#include "ip.h"
+#include "ipv6.h"
+#include "tcp.h"
+#include "udp.h"
+#include "unaligned.h"
+
+/* The data offset field is 4 bits, and specifies the length of the TCP header,
+ * including options, in 32-bit words. The largest encodable value is
+ * therefore 15 words of 4 bytes each.
+ */
+#define MAX_TCP_HEADER_BYTES (15*4)
+
+#define MAX_TCP_DATAGRAM_BYTES (64*1024) /* for sanity-checking */
+#define MAX_UDP_DATAGRAM_BYTES (64*1024) /* for sanity-checking */
+
+/* We allow reading pretty big packets, since some interface MTUs can
+ * be pretty big (the Linux loopback MTU, for example, is typically
+ * around 16KB).
+ */
+static const int PACKET_READ_BYTES = 64 * 1024;
+
+/* Maximum number of headers; this is the size of the 'headers' array
+ * in struct packet.
+ */
+#define PACKET_MAX_HEADERS 6
+
+/* Maximum number of bytes of headers. */
+#define PACKET_MAX_HEADER_BYTES 256
+
+/* A TCP, UDP or ICMP packet carried over IPv4 or IPv6, including the IP
+ * header, the layer 4 header, and data. There
+ * may also be a link layer header between the start of 'buffer' and the
+ * outermost IP header (see 'l2_header_bytes'), but we typically ignore
+ * that. The 'buffer_bytes' field
+ * gives the total space in the buffer, which may be bigger than the
+ * actual amount occupied by the packet data.
+ */
+struct packet {
+ u8 *buffer; /* data buffer: full contents of packet */
+ u32 buffer_bytes; /* bytes of space in data buffer */
+ u32 l2_header_bytes; /* bytes in outer hardware/layer-2 header */
+ u32 ip_bytes; /* bytes in outermost IP hdrs/payload */
+ enum direction_t direction; /* direction packet is traveling */
+
+ /* Metadata about all the headers in the packet, including all
+ * layers of encapsulation, from outer to inner, starting from
+ * the outermost IP header at headers[0].
+ */
+ struct header headers[PACKET_MAX_HEADERS];
+
+ /* The following pointers point into the 'buffer' area. Each
+ * pointer may be NULL if there is no header of that type
+ * present in the packet. In each case these are pointers to
+ * the innermost header of that kind, since that is where most
+ * of the interesting TCP/UDP/IP action is.
+ */
+
+ /* Layer 3 */
+ struct ipv4 *ipv4; /* start of IPv4 header, if present */
+ struct ipv6 *ipv6; /* start of IPv6 header, if present */
+
+ /* Layer 4 */
+ struct tcp *tcp; /* start of TCP header, if present */
+ struct udp *udp; /* start of UDP header, if present */
+ struct icmpv4 *icmpv4; /* start of ICMPv4 header, if present */
+ struct icmpv6 *icmpv6; /* start of ICMPv6 header, if present */
+ bool echoed_header; /* icmp payload is an echoed header?
+ This is for TCP/UDP */
+
+
+ s64 time_usecs; /* wall time of receive/send if non-zero */
+
+ u32 flags; /* various meta-flags */
+#define FLAG_WIN_NOCHECK 0x1 /* don't check TCP receive window */
+#define FLAG_OPTIONS_NOCHECK 0x2 /* don't check TCP options */
+
+ enum tos_chk_t tos_chk; /* how to treat the TOS byte of a packet */
+
+ __be32 *tcp_ts_val; /* location of TCP timestamp val, or NULL */
+ __be32 *tcp_ts_ecr; /* location of TCP timestamp ecr, or NULL */
+ int mss; /* NOTE(review): presumably a TCP MSS in bytes; not used in this header -- confirm */
+};
+
+/* A simple list of packets: a singly linked chain of nodes, each holding
+ * one packet pointer, terminated by a node whose 'next' is NULL.
+ */
+struct packet_list {
+ struct packet *packet; /* the packet content */
+ struct packet_list *next; /* link to next element, or NULL if last */
+};
+
+/* Allocate a packet_list and initialize its fields to NULL. */
+extern struct packet_list *packet_list_new(void);
+
+/* Free an entire packet list. */
+extern void packet_list_free(struct packet_list *list);
+
+/* Allocate and initialize a packet. 'buffer_length' is the requested size
+ * of the data buffer in bytes (callers in this code base pass the total
+ * packet size).
+ */
+extern struct packet *packet_new(u32 buffer_length);
+
+/* Free all the memory used by the packet. */
+extern void packet_free(struct packet *packet);
+
+/* Create a packet that is a copy of the contents of the given packet. */
+extern struct packet *packet_copy(struct packet *old_packet);
+
+/* Return the number of headers in the given packet. */
+extern int packet_header_count(const struct packet *packet);
+
+/* Attempt to append a new header to the given packet. Return a
+ * pointer to the new header metadata, or NULL if we can't add the
+ * header.
+ */
+extern struct header *packet_append_header(struct packet *packet,
+ enum header_t header_type,
+ int header_bytes);
+
+/* Return a newly-allocated packet that is a copy of the given inner packet
+ * but with the given outer packet prepended.
+ * NOTE(review): the inputs appear to stay owned by the caller, since
+ * packet_encapsulate_and_free() frees both after this call -- confirm.
+ */
+extern struct packet *packet_encapsulate(struct packet *outer,
+ struct packet *inner);
+
+/* Aggregate a list of packets into a new packet carrying the combined
+ * payload and return the newly allocated packet. The head and tail parameters
+ * point to the first and the last packet, respectively, in the input list.
+ * payload_size is the payload size for the aggregated packet, equal to the
+ * summed payload across all the packets in the list.
+ * The source packets were previously checked to have compatible headers. Copy
+ * the headers from the last source packet, and update the length fields in all
+ * the headers to match the combined payload. For a TCP header, the sequence
+ * number and the cwr flag are taken from the first packet instead.
+ */
+extern struct packet *aggregate_packets(const struct packet_list *head,
+ const struct packet_list *tail,
+ int payload_size);
+
+/* Build the encapsulated packet from 'outer' and 'inner' with
+ * packet_encapsulate(), then free both input packets. Returns the
+ * packet produced by packet_encapsulate().
+ */
+static inline struct packet *packet_encapsulate_and_free(struct packet *outer,
+							 struct packet *inner)
+{
+	struct packet *const combined = packet_encapsulate(outer, inner);
+
+	packet_free(outer);
+	packet_free(inner);
+	return combined;
+}
+
+/* Report which way the given packet is traveling (its 'direction' field). */
+static inline enum direction_t packet_direction(const struct packet *packet)
+{
+	const enum direction_t direction = packet->direction;
+
+	return direction;
+}
+
+/* Convenience accessors for peeking around in the packet... */
+
+/* Map the packet's recorded IP header to an address family: AF_INET when
+ * an IPv4 header pointer is set, otherwise AF_INET6 when an IPv6 header
+ * pointer is set, otherwise AF_UNSPEC.
+ */
+static inline int packet_address_family(const struct packet *packet)
+{
+	int family = AF_UNSPEC;
+
+	if (packet->ipv4 != NULL)
+		family = AF_INET;
+	else if (packet->ipv6 != NULL)
+		family = AF_INET6;
+	return family;
+}
+
+/* Return a pointer to the first byte of the outermost IP header, i.e. the
+ * location recorded in headers[0]. Asserts that the location is set.
+ */
+static inline u8 *packet_start(const struct packet *packet)
+{
+	u8 *start;
+
+	start = packet->headers[0].h.ptr;
+	assert(start != NULL);
+	return start;
+}
+
+/* Return a pointer to the first byte of the innermost IP header: the IPv4
+ * header if one is recorded, else the IPv6 header. Asserts (and yields
+ * NULL) when the packet has neither.
+ */
+static inline u8 *ip_start(struct packet *packet)
+{
+	u8 *header = NULL;
+
+	if (packet->ipv4 != NULL)
+		header = (u8 *)packet->ipv4;
+	else if (packet->ipv6 != NULL)
+		header = (u8 *)packet->ipv6;
+	else
+		assert(!"bad address family");
+	return header;
+}
+
+
+/* Return the length in bytes of the IP header for packets of the
+ * given address family (AF_INET or AF_INET6), assuming no IP options.
+ * Any other address family trips an assertion; 0 is then returned so
+ * that every path through this non-void function yields a value, as the
+ * neighbouring accessors do.
+ */
+static inline int ip_header_min_len(int address_family)
+{
+	if (address_family == AF_INET)
+		return sizeof(struct ipv4);
+	if (address_family == AF_INET6)
+		return sizeof(struct ipv6);
+	assert(!"bad ip_version in config");
+	return 0;
+}
+
+/* Return the layer 4 protocol number named by the packet's IP header:
+ * the IPv4 'protocol' field, or the IPv6 'next_header' field. Asserts
+ * (and yields 0) when neither IP header is recorded.
+ */
+static inline int packet_ip_protocol(const struct packet *packet)
+{
+	int protocol = 0;
+
+	if (packet->ipv4 != NULL)
+		protocol = packet->ipv4->protocol;
+	else if (packet->ipv6 != NULL)
+		protocol = packet->ipv6->next_header;
+	else
+		assert(!"no valid IP header");
+	return protocol;
+}
+
+/* Return the size in bytes of a TCP header without options, or of a UDP
+ * header, depending on 'protocol' (IPPROTO_TCP or IPPROTO_UDP). Any other
+ * protocol asserts and yields 0.
+ */
+static inline int layer4_header_len(int protocol)
+{
+	switch (protocol) {
+	case IPPROTO_TCP:
+		return sizeof(struct tcp);
+	case IPPROTO_UDP:
+		return sizeof(struct udp);
+	default:
+		assert(!"bad protocol");
+		return 0;
+	}
+}
+
+/* Return the TCP header length in bytes, options included: the header's
+ * data offset ('doff') counts 32-bit words, so scale it by sizeof(u32).
+ * Asserts that the packet has a TCP header.
+ */
+static inline int packet_tcp_header_len(const struct packet *packet)
+{
+	assert(packet->tcp);
+	const int words = packet->tcp->doff;
+
+	return words * sizeof(u32);
+}
+
+/* Return the size in bytes of the UDP header (sizeof(struct udp)).
+ * Asserts that the packet has a UDP header.
+ */
+static inline int packet_udp_header_len(const struct packet *packet)
+{
+	const int header_bytes = sizeof(struct udp);
+
+	assert(packet->udp);
+	return header_bytes;
+}
+
+/* Return the size in bytes of the ICMPv4 header (sizeof(struct icmpv4)).
+ * Asserts that the packet has an ICMPv4 header.
+ */
+static inline int packet_icmpv4_header_len(const struct packet *packet)
+{
+	const int header_bytes = sizeof(struct icmpv4);
+
+	assert(packet->icmpv4);
+	return header_bytes;
+}
+
+/* Return the size in bytes of the ICMPv6 header (sizeof(struct icmpv6)).
+ * Asserts that the packet has an ICMPv6 header.
+ */
+static inline int packet_icmpv6_header_len(const struct packet *packet)
+{
+	const int header_bytes = sizeof(struct icmpv6);
+
+	assert(packet->icmpv6);
+	return header_bytes;
+}
+
+/* Return the number of bytes of TCP options: the full TCP header length
+ * minus the size of the fixed TCP header. Asserts that the packet has a
+ * TCP header.
+ */
+static inline int packet_tcp_options_len(const struct packet *packet)
+{
+	assert(packet->tcp);
+	const int header_bytes = packet_tcp_header_len(packet);
+
+	return header_bytes - sizeof(*(packet->tcp));
+}
+
+/* Return a pointer to the first byte after the fixed TCP header, which is
+ * where the TCP options begin. Asserts that the packet has a TCP header.
+ */
+static inline u8 *packet_tcp_options(struct packet *packet)
+{
+	struct tcp *past_fixed_header;
+
+	assert(packet->tcp);
+	past_fixed_header = packet->tcp + 1;
+	return (u8 *) past_fixed_header;
+}
+
+/* Return the TCP timestamp value stored at packet->tcp_ts_val, read with
+ * get_unaligned_be32(). That location is NULL when the packet has no
+ * recorded timestamp, so assert it is set before reading, in line with
+ * the header-pointer asserts in the other accessors.
+ */
+static inline u32 packet_tcp_ts_val(const struct packet *packet)
+{
+	assert(packet->tcp_ts_val != NULL);
+	return get_unaligned_be32(packet->tcp_ts_val);
+}
+
+/* Return the TCP timestamp echo reply stored at packet->tcp_ts_ecr, read
+ * with get_unaligned_be32(). That location is NULL when the packet has no
+ * recorded timestamp, so assert it is set before reading, in line with
+ * the header-pointer asserts in the other accessors.
+ */
+static inline u32 packet_tcp_ts_ecr(const struct packet *packet)
+{
+	assert(packet->tcp_ts_ecr != NULL);
+	return get_unaligned_be32(packet->tcp_ts_ecr);
+}
+
+/* Store 'ts_val' at packet->tcp_ts_val using put_unaligned_be32(). That
+ * location is NULL when the packet has no recorded timestamp, so assert
+ * it is set before writing, in line with the header-pointer asserts in
+ * the other accessors.
+ */
+static inline void packet_set_tcp_ts_val(struct packet *packet, u32 ts_val)
+{
+	assert(packet->tcp_ts_val != NULL);
+	put_unaligned_be32(ts_val, packet->tcp_ts_val);
+}
+
+/* Store 'ts_ecr' at packet->tcp_ts_ecr using put_unaligned_be32(). That
+ * location is NULL when the packet has no recorded timestamp, so assert
+ * it is set before writing, in line with the header-pointer asserts in
+ * the other accessors.
+ */
+static inline void packet_set_tcp_ts_ecr(struct packet *packet, u32 ts_ecr)
+{
+	assert(packet->tcp_ts_ecr != NULL);
+	put_unaligned_be32(ts_ecr, packet->tcp_ts_ecr);
+}
+
+/* Return a pointer to the first byte after the layer 4 header. The header
+ * kinds are tried in the order TCP, UDP, ICMPv4, ICMPv6, and the first one
+ * present decides where the payload starts. Asserts (and yields NULL) when
+ * the packet has none of them.
+ */
+static inline u8 *packet_payload(const struct packet *packet)
+{
+	u8 *payload = NULL;
+
+	if (packet->tcp)
+		payload = ((u8 *) packet->tcp) + packet_tcp_header_len(packet);
+	else if (packet->udp)
+		payload = ((u8 *) packet->udp) + packet_udp_header_len(packet);
+	else if (packet->icmpv4)
+		payload = ((u8 *) packet->icmpv4) +
+			  packet_icmpv4_header_len(packet);
+	else if (packet->icmpv6)
+		payload = ((u8 *) packet->icmpv6) +
+			  packet_icmpv6_header_len(packet);
+	else
+		assert(!"no valid payload; not TCP or UDP or ICMP!?");
+
+	return payload;
+}
+
+/* Return a pointer to the byte just past the end of the packet: the start
+ * of the outermost IP header plus 'ip_bytes'.
+ */
+static inline u8 *packet_end(const struct packet *packet)
+{
+	u8 *const first_byte = packet_start(packet);
+
+	return first_byte + packet->ip_bytes;
+}
+
+/* Return the payload length in bytes: the distance from the start of the
+ * payload (see packet_payload()) to the end of the packet.
+ */
+static inline int packet_payload_len(const struct packet *packet)
+{
+	const u8 *const past_end = packet_end(packet);
+	const u8 *const payload = packet_payload(packet);
+
+	return past_end - payload;
+}
+
+/* Return the location of the IP header echoed by an ICMP message, which is
+ * the first byte after the ICMPv4 (or else ICMPv6) header. Asserts (and
+ * yields NULL) when the packet has no ICMP header.
+ */
+static inline u8 *packet_echoed_ip_header(struct packet *packet)
+{
+	u8 *echoed = NULL;
+
+	if (packet->icmpv4 != NULL)
+		echoed = (u8 *)(packet->icmpv4 + 1);
+	else if (packet->icmpv6 != NULL)
+		echoed = (u8 *)(packet->icmpv6 + 1);
+	else
+		assert(!"no valid icmp header");
+	return echoed;
+}
+
+/* Return the IPv4 header echoed by an ICMP message, located right after
+ * the ICMPv4 header, or NULL when the packet has no ICMPv4 header.
+ */
+static inline struct ipv4 *packet_echoed_ipv4_header(struct packet *packet)
+{
+	if (packet->icmpv4 == NULL)
+		return NULL;
+	return (struct ipv4 *)(packet->icmpv4 + 1);
+}
+
+/* Return the IPv6 header echoed by an ICMP message, located right after
+ * the ICMPv6 header, or NULL when the packet has no ICMPv6 header.
+ */
+static inline struct ipv6 *packet_echoed_ipv6_header(struct packet *packet)
+{
+	if (packet->icmpv6 == NULL)
+		return NULL;
+	return (struct ipv6 *)(packet->icmpv6 + 1);
+}
+
+/* Return the length in bytes of the IP header echoed by an ICMP message:
+ * the fixed IPv4 header size under ICMPv4, the fixed IPv6 header size
+ * under ICMPv6. For now we do not generate any IP options for echoed IP
+ * headers. Asserts (and yields 0) when the packet has no ICMP header.
+ */
+static inline int packet_echoed_ip_header_len(struct packet *packet)
+{
+	int header_bytes = 0;
+
+	if (packet->icmpv4 != NULL)
+		header_bytes = sizeof(struct ipv4);
+	else if (packet->icmpv6 != NULL)
+		header_bytes = sizeof(struct ipv6);
+	else
+		assert(!"no valid icmp header");
+	return header_bytes;
+}
+
+/* Return the layer 4 protocol number found in the IP header echoed inside
+ * an ICMP packet: 'protocol' for an echoed IPv4 header, 'next_header' for
+ * an echoed IPv6 header. Asserts (and yields 0) when the packet has no
+ * ICMP header.
+ */
+static inline int packet_echoed_ip_protocol(struct packet *packet)
+{
+	int protocol = 0;
+
+	if (packet->icmpv4 != NULL) {
+		const struct ipv4 *echoed = packet_echoed_ipv4_header(packet);
+
+		protocol = echoed->protocol;
+	} else if (packet->icmpv6 != NULL) {
+		const struct ipv6 *echoed = packet_echoed_ipv6_header(packet);
+
+		protocol = echoed->next_header;
+	} else {
+		assert(!"no valid icmp header");
+	}
+	return protocol;
+}
+
+/* Return the location of the layer 4 (TCP or UDP) header echoed by an
+ * ICMP message: the echoed IP header plus that header's length.
+ */
+static inline u8 *packet_echoed_layer4_header(struct packet *packet)
+{
+	u8 *const ip_header = packet_echoed_ip_header(packet);
+	const int skip = packet_echoed_ip_header_len(packet);
+
+	return ip_header + skip;
+}
+
+/* Return the TCP header echoed by an ICMP message, or NULL when the
+ * echoed IP header does not name IPPROTO_TCP as its protocol.
+ */
+static inline struct tcp *packet_echoed_tcp_header(struct packet *packet)
+{
+	if (packet_echoed_ip_protocol(packet) != IPPROTO_TCP)
+		return NULL;
+	return (struct tcp *)(packet_echoed_layer4_header(packet));
+}
+
+/* Return the UDP header echoed by an ICMP message, or NULL when the
+ * echoed IP header does not name IPPROTO_UDP as its protocol.
+ */
+static inline struct udp *packet_echoed_udp_header(struct packet *packet)
+{
+	if (packet_echoed_ip_protocol(packet) != IPPROTO_UDP)
+		return NULL;
+	return (struct udp *)(packet_echoed_layer4_header(packet));
+}
+
+/* Return a pointer to the sequence number field of the TCP header echoed
+ * by an ICMP message. Asserts that an echoed TCP header exists and that the
+ * field lies within the first ICMP_ECHO_BYTES bytes of that header.
+ */
+static inline u32 *packet_echoed_tcp_seq(struct packet *packet)
+{
+	struct tcp *echoed_tcp;
+	u32 *seq;
+
+	echoed_tcp = packet_echoed_tcp_header(packet);
+	assert(echoed_tcp);
+
+	seq = &(echoed_tcp->seq);
+	/* The whole seq field must fit inside the echoed prefix of the
+	 * TCP header that was reserved for it.
+	 */
+	assert((char *) (seq + 1) <= (char *) echoed_tcp + ICMP_ECHO_BYTES);
+	return seq;
+}
+
+#endif /* __PACKET_H__ */
diff --git a/test/packetdrill/packet_checksum.c b/test/packetdrill/packet_checksum.c
new file mode 100644
index 0000000..d5164b3
--- /dev/null
+++ b/test/packetdrill/packet_checksum.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Implementation for a module to checksum TCP/IP packets.
+ */
+
+#include "packet_checksum.h"
+
+#include "checksum.h"
+#include "icmp.h"
+#include "icmpv6.h"
+#include "ip.h"
+#include "ipv6.h"
+#include "tcp.h"
+
+/* Fill in the IPv4 header checksum and then the layer 4 checksum (TCP,
+ * UDP or ICMPv4, whichever header the packet records first in that order)
+ * for the packet's IPv4 header. Each checksum field is zeroed before its
+ * checksum is computed and stored. Asserts if no such layer 4 header
+ * is present.
+ */
+static void checksum_ipv4_packet(struct packet *packet)
+{
+	struct ipv4 *ipv4 = packet->ipv4;
+
+	/* Layer 3: the IPv4 header checksum. */
+	ipv4->check = 0;
+	ipv4->check = ipv4_checksum(ipv4, ipv4_header_len(ipv4));
+	assert(packet->ip_bytes >= ntohs(ipv4->tot_len));
+
+	/* Bytes of layer 4 header, options, and payload. */
+	const int l4_bytes = ntohs(ipv4->tot_len) - ipv4_header_len(ipv4);
+	assert(l4_bytes > 0);
+
+	/* Layer 4: TCP and UDP use the pseudo-header checksum helper. */
+	if (packet->tcp != NULL) {
+		packet->tcp->check = 0;
+		packet->tcp->check = tcp_udp_v4_checksum(ipv4->src_ip,
+							 ipv4->dst_ip,
+							 IPPROTO_TCP,
+							 packet->tcp, l4_bytes);
+		return;
+	}
+	if (packet->udp != NULL) {
+		packet->udp->check = 0;
+		packet->udp->check = tcp_udp_v4_checksum(ipv4->src_ip,
+							 ipv4->dst_ip,
+							 IPPROTO_UDP,
+							 packet->udp, l4_bytes);
+		return;
+	}
+	if (packet->icmpv4 != NULL) {
+		/* ICMPv4 is summed with the plain ipv4_checksum() helper. */
+		packet->icmpv4->checksum = 0;
+		packet->icmpv4->checksum = ipv4_checksum(packet->icmpv4,
+							 l4_bytes);
+		return;
+	}
+	assert(!"not TCP or ICMP");
+}
+
+/* Fill in the layer 4 checksum (TCP, UDP or ICMPv6, whichever header the
+ * packet records first in that order) for the packet's IPv6 header. Each
+ * checksum field is zeroed before its checksum is computed and stored.
+ * Asserts if no such layer 4 header is present.
+ */
+static void checksum_ipv6_packet(struct packet *packet)
+{
+	struct ipv6 *ipv6 = packet->ipv6;
+
+	/* IPv6 has no header checksum. */
+	/* For now we do not support IPv6 extension headers. */
+	assert(packet->ip_bytes >= sizeof(*ipv6) + ntohs(ipv6->payload_len));
+
+	/* Bytes of layer 4 header, options, and payload. */
+	const int l4_bytes = ntohs(ipv6->payload_len);
+	assert(l4_bytes > 0);
+
+	if (packet->tcp != NULL) {
+		packet->tcp->check = 0;
+		packet->tcp->check = tcp_udp_v6_checksum(&ipv6->src_ip,
+							 &ipv6->dst_ip,
+							 IPPROTO_TCP,
+							 packet->tcp, l4_bytes);
+		return;
+	}
+	if (packet->udp != NULL) {
+		packet->udp->check = 0;
+		packet->udp->check = tcp_udp_v6_checksum(&ipv6->src_ip,
+							 &ipv6->dst_ip,
+							 IPPROTO_UDP,
+							 packet->udp, l4_bytes);
+		return;
+	}
+	if (packet->icmpv6 != NULL) {
+		/* IPv6 ICMP has a pseudo-header checksum, like TCP. */
+		packet->icmpv6->checksum = 0;
+		packet->icmpv6->checksum =
+			tcp_udp_v6_checksum(&ipv6->src_ip,
+					    &ipv6->dst_ip,
+					    IPPROTO_ICMPV6,
+					    packet->icmpv6, l4_bytes);
+		return;
+	}
+	assert(!"not TCP or ICMP");
+}
+
+/* Fill in the checksums of 'packet' by dispatching on its address family:
+ * AF_INET packets go to checksum_ipv4_packet(), AF_INET6 packets to
+ * checksum_ipv6_packet(). Any other family trips an assertion.
+ *
+ * The helpers return void, so they are called as plain statements; ISO C
+ * does not allow "return expr;" in a function returning void.
+ */
+void checksum_packet(struct packet *packet)
+{
+	const int address_family = packet_address_family(packet);
+
+	if (address_family == AF_INET)
+		checksum_ipv4_packet(packet);
+	else if (address_family == AF_INET6)
+		checksum_ipv6_packet(packet);
+	else
+		assert(!"bad ip version");
+}
diff --git a/test/packetdrill/packet_checksum.h b/test/packetdrill/packet_checksum.h
new file mode 100644
index 0000000..2c87df3
--- /dev/null
+++ b/test/packetdrill/packet_checksum.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Interface for a module to checksum TCP/IP packets.
+ */
+
+#ifndef __PACKET_CHECKSUM_H__
+#define __PACKET_CHECKSUM_H__
+
+#include "packet.h"
+
+/* Fill in layer 3 and layer 4 checksums for the given input 'packet'.
+ * The packet must have an IPv4 or IPv6 header and a TCP, UDP or ICMP
+ * header recorded; the checksum fields of those headers are overwritten
+ * in place. IPv6 has no header checksum, so for IPv6 only the layer 4
+ * checksum is written.
+ */
+extern void checksum_packet(struct packet *packet);
+
+#endif /* __PACKET_CHECKSUM_H__ */
diff --git a/test/packetdrill/packet_parser.c b/test/packetdrill/packet_parser.c
new file mode 100644
index 0000000..f593233
--- /dev/null
+++ b/test/packetdrill/packet_parser.c
@@ -0,0 +1,625 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Implementation for a module to parse TCP/IP packets.
+ */
+
+#include "packet_parser.h"
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <netinet/in.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "assert.h"
+#include "checksum.h"
+#include "ethernet.h"
+#include "gre.h"
+#include "ip.h"
+#include "ip_address.h"
+#include "logging.h"
+#include "packet.h"
+#include "tcp.h"
+
+static int parse_ipv4(struct packet *packet, u8 *header_start, u8 *packet_end,
+ char **error);
+static int parse_ipv6(struct packet *packet, u8 *header_start, u8 *packet_end,
+ char **error);
+static int parse_mpls(struct packet *packet, u8 *header_start, u8 *packet_end,
+ char **error);
+static int parse_layer3_packet_by_proto(struct packet *packet,
+ u16 proto, u8 *header_start,
+ u8 *packet_end, char **error);
+static int parse_layer4(struct packet *packet, u8 *header_start,
+ int layer4_protocol, int layer4_bytes,
+ u8 *packet_end, char **error);
+
+/* Parse a frame that begins with an Ethernet header. Checks that the
+ * header fits before packet_end, records its size in
+ * packet->l2_header_bytes, and passes the bytes after it to the layer 3
+ * parser chosen by the EtherType. Returns PACKET_BAD with an allocated
+ * message in *error when the header does not fit; otherwise returns the
+ * layer 3 parser's result.
+ */
+static int parse_layer2_packet(struct packet *packet,
+			       u8 *header_start, u8 *packet_end,
+			       char **error)
+{
+	struct ether_header *ether = (struct ether_header *)header_start;
+	u8 *const layer3_start = header_start + sizeof(*ether);
+
+	if (layer3_start > packet_end) {
+		asprintf(error, "Ethernet header overflows packet");
+		return PACKET_BAD;
+	}
+	packet->l2_header_bytes = sizeof(*ether);
+
+	return parse_layer3_packet_by_proto(packet, ntohs(ether->ether_type),
+					    layer3_start, packet_end, error);
+}
+
+/* Parse the layer 3 header at header_start according to the EtherType in
+ * 'proto'. IPv4 and IPv6 headers must fit before packet_end and carry the
+ * matching IP version, else PACKET_BAD is returned with an allocated
+ * message in *error. MPLS EtherTypes are passed to parse_mpls(). Any other
+ * EtherType yields PACKET_UNKNOWN_L4.
+ */
+static int parse_layer3_packet_by_proto(struct packet *packet,
+					u16 proto, u8 *header_start,
+					u8 *packet_end, char **error)
+{
+	if (proto == ETHERTYPE_IP) {
+		if (header_start + sizeof(struct ipv4) > packet_end) {
+			asprintf(error, "IPv4 header overflows packet");
+			return PACKET_BAD;
+		}
+
+		/* The IP version number is in the first 4 bits of both
+		 * IPv4 and IPv6 packets.
+		 */
+		const struct ipv4 *ip = (struct ipv4 *)header_start;
+		if (ip->version != 4) {
+			asprintf(error, "Bad IP version (%d) for ETHERTYPE_IP", ip->version);
+			return PACKET_BAD;
+		}
+		return parse_ipv4(packet, header_start, packet_end, error);
+	}
+
+	if (proto == ETHERTYPE_IPV6) {
+		if (header_start + sizeof(struct ipv6) > packet_end) {
+			asprintf(error, "IPv6 header overflows packet");
+			return PACKET_BAD;
+		}
+
+		const struct ipv6 *ip = (struct ipv6 *)header_start;
+		if (ip->version != 6) {
+			asprintf(error, "Bad IP version for ETHERTYPE_IPV6");
+			return PACKET_BAD;
+		}
+		return parse_ipv6(packet, header_start, packet_end, error);
+	}
+
+	if ((proto == ETHERTYPE_MPLS_UC) || (proto == ETHERTYPE_MPLS_MC))
+		return parse_mpls(packet, header_start, packet_end, error);
+
+	return PACKET_UNKNOWN_L4;
+}
+
+/* Parse a packet that begins directly with an IP header, choosing the
+ * IPv4 or IPv6 parser from the version number in the first byte. Returns
+ * PACKET_BAD with an allocated message in *error when fewer than
+ * sizeof(struct ipv4) bytes are available or the version is neither 4
+ * nor 6. Note that packet_end points to the byte beyond the end of packet.
+ */
+static int parse_layer3_packet(struct packet *packet,
+			       u8 *header_start, u8 *packet_end,
+			       char **error)
+{
+	if (header_start + sizeof(struct ipv4) > packet_end) {
+		asprintf(error, "IP header overflows packet");
+		return PACKET_BAD;
+	}
+
+	/* The IP version number is in the first 4 bits of both IPv4 and
+	 * IPv6 packets, so it can be read through an IPv4 header view.
+	 */
+	const struct ipv4 *ip = (struct ipv4 *) header_start;
+
+	switch (ip->version) {
+	case 4:
+		return parse_ipv4(packet, header_start, packet_end, error);
+	case 6:
+		return parse_ipv6(packet, header_start, packet_end, error);
+	default:
+		break;
+	}
+
+	asprintf(error, "Unsupported IP version");
+	return PACKET_BAD;
+}
+
+/* Parse the first in_bytes bytes of packet->buffer, starting at the given
+ * layer (Ethernet or IP). Returns the parser's result when it is not
+ * PACKET_BAD. On PACKET_BAD, the message already stored in *error is
+ * replaced by a new allocated string that appends the packet size and a
+ * hex dump of the packet, and the old message is freed.
+ */
+int parse_packet(struct packet *packet, int in_bytes,
+		 enum packet_layer_t layer, char **error)
+{
+	assert(in_bytes <= packet->buffer_bytes);
+	enum packet_parse_result_t result = PACKET_BAD;
+	u8 *const first_byte = packet->buffer;
+	/* One past the last byte that was read in. */
+	u8 *const past_end = packet->buffer + in_bytes;
+
+	if (layer == PACKET_LAYER_2_ETHERNET)
+		result = parse_layer2_packet(packet, first_byte, past_end,
+					     error);
+	else if (layer == PACKET_LAYER_3_IP)
+		result = parse_layer3_packet(packet, first_byte, past_end,
+					     error);
+	else
+		assert(!"bad layer");
+
+	if (result == PACKET_BAD) {
+		char *dump = NULL;	/* hex dump of bad packet */
+
+		hex_dump(packet->buffer, in_bytes, &dump);
+
+		char *summary = *error;	/* human-readable error summary */
+
+		asprintf(error, "%s: packet of %d bytes:\n%s", summary, in_bytes, dump);
+		free(summary);
+		free(dump);
+		return PACKET_BAD;
+	}
+
+	return result;
+}
+
+/* Parse the IPv4 header at header_start and then the layer 4 header it
+ * carries. Return a packet_parse_result_t: PACKET_BAD, with an allocated
+ * message in *error, when the header is too short, overflows the packet,
+ * is a fragment, fails its checksum, or cannot be recorded; otherwise the
+ * result of parse_layer4().
+ * Note that packet_end points to the byte beyond the end of packet.
+ */
+static int parse_ipv4(struct packet *packet, u8 *header_start, u8 *packet_end,
+		      char **error)
+{
+	u8 *p = header_start;
+	struct ipv4 *ipv4 = (struct ipv4 *) header_start;
+	/* True when packet->ip_bytes has not been set yet. */
+	const bool is_outer = (packet->ip_bytes == 0);
+
+	const int ip_header_bytes = ipv4_header_len(ipv4);
+	assert(ip_header_bytes >= 0);
+	if (ip_header_bytes < sizeof(*ipv4)) {
+		asprintf(error, "IP header too short");
+		return PACKET_BAD;
+	}
+	if (p + ip_header_bytes > packet_end) {
+		asprintf(error, "Full IP header overflows packet");
+		return PACKET_BAD;
+	}
+
+	const int ip_total_bytes = ntohs(ipv4->tot_len);
+	if (p + ip_total_bytes > packet_end) {
+		asprintf(error, "IP payload overflows packet");
+		return PACKET_BAD;
+	}
+	if (ip_header_bytes > ip_total_bytes) {
+		asprintf(error, "IP header bigger than datagram");
+		return PACKET_BAD;
+	}
+
+	/* Fragments are rejected: neither the "more fragments" flag nor
+	 * a fragment offset may be set.
+	 */
+	const u16 frag_off = ntohs(ipv4->frag_off);
+	if (frag_off & IP_MF) {
+		asprintf(error, "More fragments remaining");
+		return PACKET_BAD;
+	}
+	if (frag_off & IP_OFFMASK) {
+		asprintf(error, "Non-zero fragment offset");
+		return PACKET_BAD;
+	}
+
+	const u16 checksum = ipv4_checksum(ipv4, ip_header_bytes);
+	if (checksum != 0) {
+		asprintf(error, "Bad IP checksum");
+		return PACKET_BAD;
+	}
+
+	struct header *ip_header =
+		packet_append_header(packet, HEADER_IPV4, ip_header_bytes);
+	if (ip_header == NULL) {
+		asprintf(error, "Too many nested headers at IPv4 header");
+		return PACKET_BAD;
+	}
+	ip_header->total_bytes = ip_total_bytes;
+
+	/* Step over the IP header to the header inside. */
+	p += ip_header_bytes;
+	assert(p <= packet_end);
+
+	if (DEBUG_LOGGING) {
+		char src_string[ADDR_STR_LEN];
+		char dst_string[ADDR_STR_LEN];
+		struct ip_address src_ip, dst_ip;
+
+		ip_from_ipv4(&ipv4->src_ip, &src_ip);
+		ip_from_ipv4(&ipv4->dst_ip, &dst_ip);
+		DEBUGP("src IP: %s\n", ip_to_string(&src_ip, src_string));
+		DEBUGP("dst IP: %s\n", ip_to_string(&dst_ip, dst_string));
+	}
+
+	/* Examine the L4 header. */
+	const int layer4_bytes = ip_total_bytes - ip_header_bytes;
+	const enum packet_parse_result_t result =
+		parse_layer4(packet, p, ipv4->protocol, layer4_bytes,
+			     packet_end, error);
+
+	/* The first L3 header to reach this point with no primary set
+	 * becomes the primary; nested headers get here before outer ones.
+	 */
+	if (!packet->ipv4 && !packet->ipv6)
+		packet->ipv4 = ipv4;
+	/* If this is the outermost IP header then this is the packet length. */
+	if (is_outer)
+		packet->ip_bytes = ip_total_bytes;
+
+	return result;
+}
+
+/* Parse the fixed IPv6 header at header_start and then the layer 4 header
+ * that follows it. IPv6 extension headers are not parsed: the bytes right
+ * after the fixed header are handed to parse_layer4() with 'next_header'
+ * as the protocol. Return a packet_parse_result_t: PACKET_BAD, with an
+ * allocated message in *error, when the header or payload overflows the
+ * packet or the header cannot be recorded; otherwise the result of
+ * parse_layer4().
+ * Note that packet_end points to the byte beyond the end of packet.
+ */
+static int parse_ipv6(struct packet *packet, u8 *header_start, u8 *packet_end,
+		      char **error)
+{
+	u8 *p = header_start;
+	struct ipv6 *ipv6 = (struct ipv6 *) header_start;
+	/* True when packet->ip_bytes has not been set yet. */
+	const bool is_outer = (packet->ip_bytes == 0);
+
+	/* The fixed header must fit in the sniffed packet. */
+	const int ip_header_bytes = sizeof(*ipv6);
+	if (p + ip_header_bytes > packet_end) {
+		asprintf(error, "IPv6 header overflows packet");
+		return PACKET_BAD;
+	}
+
+	/* So must the payload that the header announces. */
+	const int ip_total_bytes = (ip_header_bytes +
+				    ntohs(ipv6->payload_len));
+	if (p + ip_total_bytes > packet_end) {
+		asprintf(error, "IPv6 payload overflows packet");
+		return PACKET_BAD;
+	}
+	assert(ip_header_bytes <= ip_total_bytes);
+
+	struct header *ip_header =
+		packet_append_header(packet, HEADER_IPV6, ip_header_bytes);
+	if (ip_header == NULL) {
+		asprintf(error, "Too many nested headers at IPv6 header");
+		return PACKET_BAD;
+	}
+	ip_header->total_bytes = ip_total_bytes;
+
+	/* Step over the IP header to the header inside. */
+	p += ip_header_bytes;
+	assert(p <= packet_end);
+
+	if (DEBUG_LOGGING) {
+		char src_string[ADDR_STR_LEN];
+		char dst_string[ADDR_STR_LEN];
+		struct ip_address src_ip, dst_ip;
+
+		ip_from_ipv6(&ipv6->src_ip, &src_ip);
+		ip_from_ipv6(&ipv6->dst_ip, &dst_ip);
+		DEBUGP("src IP: %s\n", ip_to_string(&src_ip, src_string));
+		DEBUGP("dst IP: %s\n", ip_to_string(&dst_ip, dst_string));
+	}
+
+	/* Examine the L4 header. */
+	const int layer4_bytes = ip_total_bytes - ip_header_bytes;
+	const enum packet_parse_result_t result =
+		parse_layer4(packet, p, ipv6->next_header, layer4_bytes,
+			     packet_end, error);
+
+	/* The first L3 header to reach this point with no primary set
+	 * becomes the primary; nested headers get here before outer ones.
+	 */
+	if (!packet->ipv4 && !packet->ipv6)
+		packet->ipv6 = ipv6;
+	/* If this is the outermost IP header then this is the packet length. */
+	if (is_outer)
+		packet->ip_bytes = ip_total_bytes;
+
+	return result;
+}
+
+/* Parse the TCP header at layer4_start, where layer4_bytes is the size of
+ * the TCP header plus payload. Sets packet->tcp and records a HEADER_TCP
+ * entry. Return a packet_parse_result_t: PACKET_OK on success, or
+ * PACKET_BAD with an allocated message in *error when the header is
+ * truncated, its data offset is out of range, or it cannot be recorded.
+ */
+static int parse_tcp(struct packet *packet, u8 *layer4_start, int layer4_bytes,
+		     u8 *packet_end, char **error)
+{
+	u8 *p = layer4_start;
+
+	assert(layer4_bytes >= 0);
+	if (layer4_bytes < sizeof(struct tcp)) {
+		asprintf(error, "Truncated TCP header");
+		return PACKET_BAD;
+	}
+
+	/* packet->tcp must be set before the header length can be read. */
+	packet->tcp = (struct tcp *) p;
+	const int tcp_header_len = packet_tcp_header_len(packet);
+	if (tcp_header_len < sizeof(struct tcp)) {
+		asprintf(error, "TCP data offset too small");
+		return PACKET_BAD;
+	}
+	if (tcp_header_len > layer4_bytes) {
+		asprintf(error, "TCP data offset too big");
+		return PACKET_BAD;
+	}
+
+	struct header *tcp_header =
+		packet_append_header(packet, HEADER_TCP, tcp_header_len);
+	if (tcp_header == NULL) {
+		asprintf(error, "Too many nested headers at TCP header");
+		return PACKET_BAD;
+	}
+	tcp_header->total_bytes = layer4_bytes;
+
+	p += layer4_bytes;
+	assert(p <= packet_end);
+
+	DEBUGP("TCP src port: %d\n", ntohs(packet->tcp->src_port));
+	DEBUGP("TCP dst port: %d\n", ntohs(packet->tcp->dst_port));
+	return PACKET_OK;
+}
+
+/* Parse the UDP header at layer4_start, where layer4_bytes is the size of
+ * the UDP header plus payload. Sets packet->udp and records a HEADER_UDP
+ * entry. The length field in the header must equal layer4_bytes. Return a
+ * packet_parse_result_t: PACKET_OK on success, or PACKET_BAD with an
+ * allocated message in *error otherwise.
+ */
+static int parse_udp(struct packet *packet, u8 *layer4_start, int layer4_bytes,
+		     u8 *packet_end, char **error)
+{
+	u8 *p = layer4_start;
+	const int udp_header_len = sizeof(struct udp);
+
+	assert(layer4_bytes >= 0);
+	if (layer4_bytes < sizeof(struct udp)) {
+		asprintf(error, "Truncated UDP header");
+		return PACKET_BAD;
+	}
+
+	packet->udp = (struct udp *) p;
+	const int udp_len = ntohs(packet->udp->len);
+	if (udp_len < udp_header_len) {
+		asprintf(error, "UDP datagram length too small for UDP header");
+		return PACKET_BAD;
+	}
+	if (udp_len < layer4_bytes) {
+		asprintf(error, "UDP datagram length too small");
+		return PACKET_BAD;
+	}
+	if (udp_len > layer4_bytes) {
+		asprintf(error, "UDP datagram length too big");
+		return PACKET_BAD;
+	}
+
+	struct header *udp_header =
+		packet_append_header(packet, HEADER_UDP, udp_header_len);
+	if (udp_header == NULL) {
+		asprintf(error, "Too many nested headers at UDP header");
+		return PACKET_BAD;
+	}
+	udp_header->total_bytes = layer4_bytes;
+
+	p += layer4_bytes;
+	assert(p <= packet_end);
+
+	DEBUGP("UDP src port: %d\n", ntohs(packet->udp->src_port));
+	DEBUGP("UDP dst port: %d\n", ntohs(packet->udp->dst_port));
+	return PACKET_OK;
+}
+
+/* Parse the ICMPv4 header at layer4_start and record it in 'packet'.
+ * Return a packet_parse_result_t; when that is PACKET_BAD, *error has
+ * been filled in via asprintf.
+ */
+static int parse_icmpv4(struct packet *packet, u8 *layer4_start, int layer4_bytes,
+			u8 *packet_end, char **error)
+{
+	const int header_len = sizeof(struct icmpv4);
+	struct header *hdr = NULL;
+	u8 *cursor = layer4_start;
+
+	assert(layer4_bytes >= 0);
+	if (layer4_bytes < header_len) {
+		asprintf(error, "Truncated ICMPv4 header");
+		return PACKET_BAD;
+	}
+	packet->icmpv4 = (struct icmpv4 *) cursor;
+	hdr = packet_append_header(packet, HEADER_ICMPV4, header_len);
+	if (hdr == NULL) {
+		asprintf(error, "Too many nested headers at ICMP header");
+		return PACKET_BAD;
+	}
+	/* total_bytes spans the whole layer 4 region, not just the header. */
+	hdr->total_bytes = layer4_bytes;
+
+	cursor += layer4_bytes;
+	assert(cursor <= packet_end);
+
+	DEBUGP("ICMPv4 type: %d\n", packet->icmpv4->type);
+	DEBUGP("ICMPv4 code: %d\n", packet->icmpv4->code);
+	return PACKET_OK;
+}
+
+/* Parse the ICMPv6 header at layer4_start and record it in 'packet'.
+ * Return a packet_parse_result_t; when that is PACKET_BAD, *error has
+ * been filled in via asprintf. */
+static int parse_icmpv6(struct packet *packet, u8 *layer4_start, int layer4_bytes,
+			u8 *packet_end, char **error)
+{
+	const int header_len = sizeof(struct icmpv6);
+	struct header *hdr = NULL;
+	u8 *cursor = layer4_start;
+
+	assert(layer4_bytes >= 0);
+	if (layer4_bytes < header_len) {
+		asprintf(error, "Truncated ICMPv6 header");
+		return PACKET_BAD;
+	}
+	packet->icmpv6 = (struct icmpv6 *) cursor;
+	hdr = packet_append_header(packet, HEADER_ICMPV6, header_len);
+	if (hdr == NULL) {
+		asprintf(error, "Too many nested headers at ICMP header");
+		return PACKET_BAD;
+	}
+	/* total_bytes spans the whole layer 4 region, not just the header. */
+	hdr->total_bytes = layer4_bytes;
+
+	cursor += layer4_bytes;
+	assert(cursor <= packet_end);
+
+	DEBUGP("ICMPv6 type: %d\n", packet->icmpv6->type);
+	DEBUGP("ICMPv6 code: %d\n", packet->icmpv6->code);
+	return PACKET_OK;
+}
+
+/* Parse the GRE header at layer4_start, then parse the layer 3 packet it
+ * carries, chosen by the GRE protocol field. Only version 0 without
+ * routing info is accepted. Return a packet_parse_result_t; when that is
+ * PACKET_BAD, *error has been filled in via asprintf. */
+static int parse_gre(struct packet *packet, u8 *layer4_start, int layer4_bytes,
+		     u8 *packet_end, char **error)
+{
+	struct gre *gre = (struct gre *) layer4_start;
+	struct header *hdr = NULL;
+	u8 *inner = layer4_start;
+
+	assert(layer4_bytes >= 0);
+	if (layer4_bytes < GRE_MINLEN) {
+		asprintf(error, "Truncated GRE header");
+		return PACKET_BAD;
+	}
+	if (gre->version != 0) {
+		asprintf(error, "GRE header has unsupported version number");
+		return PACKET_BAD;
+	}
+	if (gre->has_routing) {
+		asprintf(error, "GRE header has unsupported routing info");
+		return PACKET_BAD;
+	}
+	const int header_len = gre_len(gre);
+	if (header_len < GRE_MINLEN) {
+		asprintf(error, "GRE header length too small for GRE header");
+		return PACKET_BAD;
+	}
+	if (header_len > layer4_bytes) {
+		asprintf(error, "GRE header length too big");
+		return PACKET_BAD;
+	}
+
+	assert(inner + layer4_bytes <= packet_end);
+	DEBUGP("GRE header len: %d\n", header_len);
+
+	hdr = packet_append_header(packet, HEADER_GRE, header_len);
+	if (hdr == NULL) {
+		asprintf(error, "Too many nested headers at GRE header");
+		return PACKET_BAD;
+	}
+	hdr->total_bytes = layer4_bytes;
+
+	/* Step past the GRE header to the encapsulated packet. */
+	inner += header_len;
+	assert(inner <= packet_end);
+	return parse_layer3_packet_by_proto(packet, ntohs(gre->proto),
+					    inner, packet_end, error);
+}
+
+/* Parse the MPLS label stack at header_start, then the layer 3 packet
+ * that follows it. Return a packet_parse_result_t; when that is
+ * PACKET_BAD, *error has been filled in via asprintf. */
+int parse_mpls(struct packet *packet, u8 *header_start, u8 *packet_end,
+	       char **error)
+{
+	struct header *mpls_header = NULL;
+	u8 *p = header_start;
+	int mpls_header_bytes = 0;
+	int mpls_total_bytes = packet_end - p;
+	bool is_stack_bottom = false;
+
+	do {
+		struct mpls *mpls_entry = (struct mpls *)(p);
+		/* Compare remaining bytes instead of forming p + sizeof(),
+		 * which may point past the end of the packet buffer. */
+		if (packet_end - p < (int)sizeof(struct mpls)) {
+			asprintf(error, "MPLS stack entry overflows packet");
+			goto error_out;
+		}
+		is_stack_bottom = mpls_entry_stack(mpls_entry);
+		p += sizeof(struct mpls);
+		mpls_header_bytes += sizeof(struct mpls);
+	} while (!is_stack_bottom && p < packet_end);
+	/* NOTE(review): a stack that runs to packet_end without a
+	 * bottom-of-stack entry is not rejected here. */
+	assert(mpls_header_bytes <= mpls_total_bytes);
+	mpls_header = packet_append_header(packet, HEADER_MPLS,
+					   mpls_header_bytes);
+	if (mpls_header == NULL) {
+		asprintf(error, "Too many nested headers at MPLS header");
+		goto error_out;
+	}
+	mpls_header->total_bytes = mpls_total_bytes;
+	/* Move on to the header inside the MPLS label stack. */
+	assert(p <= packet_end);
+	return parse_layer3_packet(packet, p, packet_end, error);
+error_out:
+	return PACKET_BAD;
+}
+
+static int parse_layer4(struct packet *packet, u8 *layer4_start,
+	int layer4_protocol, int layer4_bytes, u8 *packet_end, char **error)
+{
+	switch (layer4_protocol) {	/* hand off by IP protocol number */
+	case IPPROTO_TCP:
+		return parse_tcp(packet, layer4_start, layer4_bytes,
+				 packet_end, error);
+	case IPPROTO_UDP:
+		return parse_udp(packet, layer4_start, layer4_bytes,
+				 packet_end, error);
+	case IPPROTO_ICMP:
+		return parse_icmpv4(packet, layer4_start, layer4_bytes,
+				    packet_end, error);
+	case IPPROTO_ICMPV6:
+		return parse_icmpv6(packet, layer4_start, layer4_bytes,
+				    packet_end, error);
+	case IPPROTO_GRE:
+		return parse_gre(packet, layer4_start, layer4_bytes,
+				 packet_end, error);
+	case IPPROTO_IPIP:	/* IPv4 carried inside IP */
+		return parse_ipv4(packet, layer4_start, packet_end, error);
+	case IPPROTO_IPV6:	/* IPv6 carried inside IP */
+		return parse_ipv6(packet, layer4_start, packet_end, error);
+	}
+	return PACKET_UNKNOWN_L4;	/* no parser for this protocol */
+}
diff --git a/test/packetdrill/packet_parser.h b/test/packetdrill/packet_parser.h
new file mode 100644
index 0000000..8bd6512
--- /dev/null
+++ b/test/packetdrill/packet_parser.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Interface for a module to parse TCP/IP packets.
+ */
+
+#ifndef __PACKET_PARSER_H__
+#define __PACKET_PARSER_H__
+
+#include "packet.h"
+
+/* Which layer of headers starts the packet passed to parse_packet()? */
+enum packet_layer_t {
+ PACKET_LAYER_3_IP = 0, /* no layer 2 headers */
+ PACKET_LAYER_2_ETHERNET, /* layer 2 is Ethernet */
+};
+
+enum packet_parse_result_t { /* outcome reported by the packet parser */
+ PACKET_OK, /* no errors detected */
+ PACKET_BAD, /* illegal header */
+ PACKET_UNKNOWN_L4, /* layer 4 protocol has no parser */
+};
+
+/* Given an input packet of length 'in_bytes' stored in the buffer
+ * whose location is given by the packet's 'buffer' field and whose
+ * full size is given by the 'buffer_bytes' field, parses the packet
+ * and fills in fields such as 'ip_bytes', 'ipv4'/'ipv6', and 'tcp'/
+ * 'udp'/'icmpv4'/'icmpv6'. On success, returns PACKET_OK; on error,
+ * returns an enum packet_parse_result_t error code and fills in
+ * *error with a human-readable, malloc-allocated error message.
+ */
+int parse_packet(struct packet *packet, int in_bytes,
+ enum packet_layer_t layer, char **error);
+
+#endif /* __PACKET_PARSER_H__ */
diff --git a/test/packetdrill/packet_parser_test.c b/test/packetdrill/packet_parser_test.c
new file mode 100644
index 0000000..d0d33d9
--- /dev/null
+++ b/test/packetdrill/packet_parser_test.c
@@ -0,0 +1,484 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Test for parsing IP packets.
+ */
+
+#include "assert.h"
+#include "packet_parser.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+static void test_parse_tcp_ipv4_packet(void)
+{
+ /* A TCP/IPv4 packet, starting at the IP header (no layer 2). */
+ u8 data[] = {
+ /* 192.0.2.1:53055 > 192.168.0.1:8080
+ * . 1:1(0) ack 2202903899 win 257
+ * <sack 2202905347:2202906795,TS val 300 ecr 1623332896>
+ */
+ 0x45, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0x06, 0x39, 0x11, 0xc0, 0x00, 0x02, 0x01,
+ 0xc0, 0xa8, 0x00, 0x01, 0xcf, 0x3f, 0x1f, 0x90,
+ 0x00, 0x00, 0x00, 0x01, 0x83, 0x4d, 0xa5, 0x5b,
+ 0xa0, 0x10, 0x01, 0x01, 0xdb, 0x2d, 0x00, 0x00,
+ 0x05, 0x0a, 0x83, 0x4d, 0xab, 0x03, 0x83, 0x4d,
+ 0xb0, 0xab, 0x08, 0x0a, 0x00, 0x00, 0x01, 0x2c,
+ 0x60, 0xc2, 0x18, 0x20
+ };
+
+ struct packet *packet = packet_new(sizeof(data));
+
+ /* Populate and parse a packet */
+ memcpy(packet->buffer, data, sizeof(data));
+ char *error = NULL;
+ enum packet_parse_result_t result =
+ parse_packet(packet, sizeof(data), PACKET_LAYER_3_IP,
+ &error);
+ assert(result == PACKET_OK);
+ assert(error == NULL);
+ /* The TCP header is expected right after the fixed IPv4 header. */
+ struct ipv4 *expected_ipv4 = (struct ipv4 *)(packet->buffer);
+ struct tcp *expected_tcp = (struct tcp *)(expected_ipv4 + 1);
+ /* Only the IPv4 and TCP header pointers should have been set. */
+ assert(packet->ip_bytes == sizeof(data));
+ assert(packet->ipv4 == expected_ipv4);
+ assert(packet->ipv6 == NULL);
+ assert(packet->tcp == expected_tcp);
+ assert(packet->udp == NULL);
+ assert(packet->icmpv4 == NULL);
+ assert(packet->icmpv6 == NULL);
+ /* Parsing is expected to leave the timestamp and flags at zero. */
+ assert(packet->time_usecs == 0);
+ assert(packet->flags == 0);
+
+ packet_free(packet);
+}
+
+static void test_parse_tcp_ipv6_packet(void)
+{
+ /* A TCP/IPv6 packet, starting at the IP header (no layer 2). */
+ u8 data[] = {
+ /* 2001:db8::1:54242 > fd3d:fa7b:d17d::1:8080
+ * S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+ */
+ 0x60, 0x00, 0x00, 0x00, 0x00, 0x20, 0x06, 0xff,
+ 0x20, 0x01, 0x0d, 0xb8, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+ 0xfd, 0x3d, 0xfa, 0x7b, 0xd1, 0x7d, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+ 0xd3, 0xe2, 0x1f, 0x90, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x80, 0x02, 0x80, 0x18,
+ 0x06, 0x60, 0x00, 0x00, 0x02, 0x04, 0x03, 0xe8,
+ 0x04, 0x02, 0x01, 0x01, 0x01, 0x03, 0x03, 0x07,
+ };
+
+ struct packet *packet = packet_new(sizeof(data));
+
+ /* Populate and parse a packet */
+ memcpy(packet->buffer, data, sizeof(data));
+ char *error = NULL;
+ enum packet_parse_result_t result =
+ parse_packet(packet, sizeof(data), PACKET_LAYER_3_IP,
+ &error);
+ assert(result == PACKET_OK);
+ assert(error == NULL);
+ /* The TCP header is expected right after the fixed IPv6 header. */
+ struct ipv6 *expected_ipv6 = (struct ipv6 *)(packet->buffer);
+ struct tcp *expected_tcp = (struct tcp *)(expected_ipv6 + 1);
+ /* Only the IPv6 and TCP header pointers should have been set. */
+ assert(packet->ip_bytes == sizeof(data));
+ assert(packet->ipv4 == NULL);
+ assert(packet->ipv6 == expected_ipv6);
+ assert(packet->tcp == expected_tcp);
+ assert(packet->udp == NULL);
+ assert(packet->icmpv4 == NULL);
+ assert(packet->icmpv6 == NULL);
+ /* Parsing is expected to leave the timestamp and flags at zero. */
+ assert(packet->time_usecs == 0);
+ assert(packet->flags == 0);
+
+ packet_free(packet);
+}
+
+static void test_parse_udp_ipv4_packet(void)
+{
+ /* A UDP/IPv4 packet, starting at the IP header (no layer 2). */
+ u8 data[] = {
+ /* 192.0.2.1.8080 > 192.168.0.1.57845: UDP, length 4 */
+ 0x45, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0x11, 0x39, 0x22, 0xc0, 0x00, 0x02, 0x01,
+ 0xc0, 0xa8, 0x00, 0x01, 0x1f, 0x90, 0xe1, 0xf5,
+ 0x00, 0x0c, 0x7b, 0xa5, 0x00, 0x00, 0x00, 0x00,
+ };
+
+ struct packet *packet = packet_new(sizeof(data));
+
+ /* Populate and parse a packet */
+ memcpy(packet->buffer, data, sizeof(data));
+ char *error = NULL;
+ enum packet_parse_result_t result =
+ parse_packet(packet, sizeof(data), PACKET_LAYER_3_IP,
+ &error);
+ assert(result == PACKET_OK);
+ assert(error == NULL);
+ /* The UDP header is expected right after the fixed IPv4 header. */
+ struct ipv4 *expected_ipv4 = (struct ipv4 *)(packet->buffer);
+ struct udp *expected_udp = (struct udp *)(expected_ipv4 + 1);
+ /* Only the IPv4 and UDP header pointers should have been set. */
+ assert(packet->ip_bytes == sizeof(data));
+ assert(packet->ipv4 == expected_ipv4);
+ assert(packet->ipv6 == NULL);
+ assert(packet->tcp == NULL);
+ assert(packet->udp == expected_udp);
+ assert(packet->icmpv4 == NULL);
+ assert(packet->icmpv6 == NULL);
+ /* Parsing is expected to leave the timestamp and flags at zero. */
+ assert(packet->time_usecs == 0);
+ assert(packet->flags == 0);
+
+ packet_free(packet);
+}
+
+
+static void test_parse_udp_ipv6_packet(void)
+{
+ /* A UDP/IPv6 packet, starting at the IP header (no layer 2). */
+ u8 data[] = {
+ /* 2001:db8::1.8080 > fd3d:fa7b:d17d::1.51557: UDP, length 4 */
+ 0x60, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x11, 0xff,
+ 0x20, 0x01, 0x0d, 0xb8, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+ 0xfd, 0x3d, 0xfa, 0x7b, 0xd1, 0x7d, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+ 0x1f, 0x90, 0xc9, 0x65, 0x00, 0x0c, 0x1f, 0xee,
+ 0x00, 0x00, 0x00, 0x00,
+ };
+
+ struct packet *packet = packet_new(sizeof(data));
+
+ /* Populate and parse a packet */
+ memcpy(packet->buffer, data, sizeof(data));
+ char *error = NULL;
+ enum packet_parse_result_t result =
+ parse_packet(packet, sizeof(data), PACKET_LAYER_3_IP,
+ &error);
+ assert(result == PACKET_OK);
+ assert(error == NULL);
+ /* The UDP header is expected right after the fixed IPv6 header. */
+ struct ipv6 *expected_ipv6 = (struct ipv6 *)(packet->buffer);
+ struct udp *expected_udp = (struct udp *)(expected_ipv6 + 1);
+ /* Only the IPv6 and UDP header pointers should have been set. */
+ assert(packet->ip_bytes == sizeof(data));
+ assert(packet->ipv4 == NULL);
+ assert(packet->ipv6 == expected_ipv6);
+ assert(packet->tcp == NULL);
+ assert(packet->udp == expected_udp);
+ assert(packet->icmpv4 == NULL);
+ assert(packet->icmpv6 == NULL);
+ /* Parsing is expected to leave the timestamp and flags at zero. */
+ assert(packet->time_usecs == 0);
+ assert(packet->flags == 0);
+
+ packet_free(packet);
+}
+
+static void test_parse_ipv4_gre_ipv4_tcp_packet(void)
+{
+ u8 *p = NULL; /* walks the buffer one header at a time */
+ int i = 0; /* index into packet->headers[] */
+
+ /* An IPv4/GRE/IPv4/TCP packet. */
+ u8 data[] = {
+ /* IP 2.2.2.2 > 1.1.1.1: GREv0, length 48:
+ IP 192.0.2.1.47078 > 192.168.0.1.8080:
+ . 2:6(4) ack 1 win 123 */
+ 0x45, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0x2f, 0xb5, 0x85, 0x02, 0x02, 0x02, 0x02,
+ 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x08, 0x00,
+ 0x45, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0x06, 0x39, 0x21, 0xc0, 0x00, 0x02, 0x01,
+ 0xc0, 0xa8, 0x00, 0x01, 0xb7, 0xe6, 0x1f, 0x90,
+ 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01,
+ 0x50, 0x10, 0x00, 0x7b, 0x55, 0x31, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00
+ };
+
+ struct packet *packet = packet_new(sizeof(data));
+
+ /* Populate and parse a packet */
+ memcpy(packet->buffer, data, sizeof(data));
+ char *error = NULL;
+ enum packet_parse_result_t result =
+ parse_packet(packet, sizeof(data), PACKET_LAYER_3_IP,
+ &error);
+ assert(result == PACKET_OK);
+ assert(error == NULL);
+ /* Check each packet->headers[] entry, from the outside in. */
+ p = packet->buffer;
+ i = 0; /* outermost layer, 0 */
+
+ assert(packet->headers[i].type == HEADER_IPV4);
+ assert(packet->headers[i].h.ptr == p);
+ assert(packet->headers[i].header_bytes == sizeof(struct ipv4));
+ p += packet->headers[i].header_bytes;
+ i++;
+
+ assert(packet->headers[i].type == HEADER_GRE);
+ assert(packet->headers[i].h.ptr == p);
+ assert(packet->headers[i].header_bytes == GRE_MINLEN);
+ p += packet->headers[i].header_bytes;
+ i++;
+
+ struct ipv4 *expected_inner_ipv4 = (struct ipv4 *)p;
+ assert(packet->headers[i].type == HEADER_IPV4);
+ assert(packet->headers[i].h.ptr == p);
+ assert(packet->headers[i].header_bytes == sizeof(struct ipv4));
+ p += packet->headers[i].header_bytes;
+ i++;
+
+ struct tcp *expected_tcp = (struct tcp *)p;
+ assert(packet->headers[i].type == HEADER_TCP);
+ assert(packet->headers[i].h.ptr == p);
+ assert(packet->headers[i].header_bytes == sizeof(struct tcp));
+ p += packet->headers[i].header_bytes;
+ i++;
+ /* No further headers should have been recorded after TCP. */
+ assert(packet->headers[i].type == HEADER_NONE);
+ /* packet->ipv4 is expected to point at the inner IPv4 header. */
+ assert(packet->ip_bytes == sizeof(data));
+ assert(packet->ipv4 == expected_inner_ipv4);
+ assert(packet->ipv6 == NULL);
+ assert(packet->tcp == expected_tcp);
+ assert(packet->udp == NULL);
+ assert(packet->icmpv4 == NULL);
+ assert(packet->icmpv6 == NULL);
+ /* Parsing is expected to leave the timestamp and flags at zero. */
+ assert(packet->time_usecs == 0);
+ assert(packet->flags == 0);
+
+ packet_free(packet);
+}
+
+static void test_parse_ipv4_gre_mpls_ipv4_tcp_packet(void)
+{
+ u8 *p = NULL; /* walks the buffer one header at a time */
+ int i = 0; /* index into packet->headers[] */
+
+ /* An IPv4/GRE/MPLS/IPv4/TCP packet. */
+ u8 data[] = {
+ /* ipv4 192.168.0.1 > 192.0.2.2: gre:
+ mpls
+ (label 0, tc 0, ttl 0)
+ (label 1048575, tc 7, [S], ttl 255):
+ 192.168.0.1:8080 > 192.0.2.1:56268
+ F. 2072102268:2072102268(0) ack 1 win 453
+ <nop,nop,TS val 117573699 ecr 5>
+ */
+
+ /* IPv4: */
+ 0x45, 0x00, 0x00, 0x54, 0x00, 0x00, 0x40, 0x00,
+ 0x40, 0x2f, 0xb7, 0xcf, 0xc0, 0xa8, 0x00, 0x01,
+ 0xc0, 0x00, 0x02, 0x02,
+ /* GRE: */
+ 0x00, 0x00, 0x88, 0x47,
+ /* MPLS: */
+ 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
+ /* IPv4, TCP: */
+ 0x45, 0x00, 0x00, 0x34, 0x86, 0x99, 0x40, 0x00,
+ 0x40, 0x06, 0x31, 0x80, 0xc0, 0xa8, 0x00, 0x01,
+ 0xc0, 0x00, 0x02, 0x01, 0x1f, 0x90, 0xdb, 0xcc,
+ 0x7b, 0x81, 0xc5, 0x7c, 0x00, 0x00, 0x00, 0x01,
+ 0x80, 0x11, 0x01, 0xc5, 0xa6, 0xa6, 0x00, 0x00,
+ 0x01, 0x01, 0x08, 0x0a, 0x07, 0x02, 0x08, 0x43,
+ 0x00, 0x00, 0x00, 0x05
+ };
+
+ struct packet *packet = packet_new(sizeof(data));
+
+ /* Populate and parse a packet */
+ memcpy(packet->buffer, data, sizeof(data));
+ char *error = NULL;
+ enum packet_parse_result_t result =
+ parse_packet(packet, sizeof(data), PACKET_LAYER_3_IP,
+ &error);
+ assert(result == PACKET_OK);
+ assert(error == NULL);
+ /* Check each packet->headers[] entry, from the outside in. */
+ p = packet->buffer;
+ i = 0; /* outermost layer, 0 */
+
+ assert(packet->headers[i].type == HEADER_IPV4);
+ assert(packet->headers[i].h.ptr == p);
+ assert(packet->headers[i].header_bytes == sizeof(struct ipv4));
+ p += packet->headers[i].header_bytes;
+ i++;
+
+ assert(packet->headers[i].type == HEADER_GRE);
+ assert(packet->headers[i].h.ptr == p);
+ assert(packet->headers[i].header_bytes == GRE_MINLEN);
+ p += packet->headers[i].header_bytes;
+ i++;
+ /* Both MPLS stack entries are expected in a single header record. */
+ assert(packet->headers[i].type == HEADER_MPLS);
+ assert(packet->headers[i].h.ptr == p);
+ assert(packet->headers[i].header_bytes == 2*sizeof(struct mpls));
+ p += packet->headers[i].header_bytes;
+ i++;
+
+ struct ipv4 *expected_inner_ipv4 = (struct ipv4 *)p;
+ assert(packet->headers[i].type == HEADER_IPV4);
+ assert(packet->headers[i].h.ptr == p);
+ assert(packet->headers[i].header_bytes == sizeof(struct ipv4));
+ p += packet->headers[i].header_bytes;
+ i++;
+
+ struct tcp *expected_tcp = (struct tcp *)p;
+ assert(packet->headers[i].type == HEADER_TCP);
+ assert(packet->headers[i].h.ptr == p);
+ assert(packet->headers[i].header_bytes ==
+ sizeof(struct tcp) + TCPOLEN_TIMESTAMP + 2); /* 2 for 2 NOPs */
+ p += packet->headers[i].header_bytes;
+ i++;
+ /* No further headers should have been recorded after TCP. */
+ assert(packet->headers[i].type == HEADER_NONE);
+ /* packet->ipv4 is expected to point at the inner IPv4 header. */
+ assert(packet->ip_bytes == sizeof(data));
+ assert(packet->ipv4 == expected_inner_ipv4);
+ assert(packet->ipv6 == NULL);
+ assert(packet->tcp == expected_tcp);
+ assert(packet->udp == NULL);
+ assert(packet->icmpv4 == NULL);
+ assert(packet->icmpv6 == NULL);
+ /* Parsing is expected to leave the timestamp and flags at zero. */
+ assert(packet->time_usecs == 0);
+ assert(packet->flags == 0);
+
+ packet_free(packet);
+}
+
+static void test_parse_icmpv4_packet(void)
+{
+ /* An ICMPv4 packet, starting at the IP header (no layer 2). */
+ u8 data[] = {
+ /* 192.168.1.101:0 > 192.168.1.103:0
+ * icmpv4 echo request, id 10960, seq 1, length 8
+ */
+ 0x45, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x40, 0x00,
+ 0x40, 0x01, 0xb6, 0xc4, 0xc0, 0xa8, 0x01, 0x65,
+ 0xc0, 0xa8, 0x01, 0x67, 0x08, 0x00, 0xcd, 0x2e,
+ 0x2a, 0xd0, 0x00, 0x01,
+ };
+
+ struct packet *packet = packet_new(sizeof(data));
+
+ /* Populate and parse a packet */
+ memcpy(packet->buffer, data, sizeof(data));
+ char *error = NULL;
+ enum packet_parse_result_t result =
+ parse_packet(packet, sizeof(data), PACKET_LAYER_3_IP,
+ &error);
+ assert(result == PACKET_OK);
+ assert(error == NULL);
+ /* The ICMPv4 header is expected right after the fixed IPv4 header. */
+ struct ipv4 *expected_ipv4 = (struct ipv4 *)(packet->buffer);
+ struct icmpv4 *expected_icmpv4 = (struct icmpv4 *)(expected_ipv4 + 1);
+ /* Only the IPv4 and ICMPv4 header pointers should have been set. */
+ assert(packet->ip_bytes == sizeof(data));
+ assert(packet->ipv4 == expected_ipv4);
+ assert(packet->ipv6 == NULL);
+ assert(packet->tcp == NULL);
+ assert(packet->udp == NULL);
+ assert(packet->icmpv4 == expected_icmpv4);
+ assert(packet->icmpv6 == NULL);
+ /* Parsing is expected to leave the timestamp and flags at zero. */
+ assert(packet->time_usecs == 0);
+ assert(packet->flags == 0);
+
+ packet_free(packet);
+}
+
+static void test_parse_icmpv6_packet(void)
+{
+ /* An ICMPv6 packet, starting at the IP header (no layer 2). */
+ u8 data[] = {
+ /* IP6 fd6b:6bbb:34a1::2 > fd6b:6bbb:34a1::1: ICMP6,
+ * echo request, seq 1, length 64
+ */
+ /* IPv6: */
+ 0x60, 0x00, 0x00, 0x00, 0x00, 0x40, 0x3a, 0x40,
+ 0xfd, 0x6b, 0x6b, 0xbb, 0x34, 0xa1, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02,
+ 0xfd, 0x6b, 0x6b, 0xbb, 0x34, 0xa1, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+ /* ICMPv6: Echo Request */
+ 0x80, 0x00, 0xb7, 0x44, 0x74, 0x7f, 0x00, 0x01,
+ 0x08, 0xb7, 0xc9, 0x52, 0x4d, 0x1f, 0x0e, 0x00,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37
+ };
+
+ struct packet *packet = packet_new(sizeof(data));
+
+ /* Populate and parse a packet */
+ memcpy(packet->buffer, data, sizeof(data));
+ char *error = NULL;
+ enum packet_parse_result_t result =
+ parse_packet(packet, sizeof(data), PACKET_LAYER_3_IP,
+ &error);
+ assert(result == PACKET_OK);
+ assert(error == NULL);
+ /* The ICMPv6 header is expected right after the fixed IPv6 header. */
+ struct ipv6 *expected_ipv6 = (struct ipv6 *)(packet->buffer);
+ struct icmpv6 *expected_icmpv6 = (struct icmpv6 *)(expected_ipv6 + 1);
+ /* Only the IPv6 and ICMPv6 header pointers should have been set. */
+ assert(packet->ip_bytes == sizeof(data));
+ assert(packet->ipv4 == NULL);
+ assert(packet->ipv6 == expected_ipv6);
+ assert(packet->tcp == NULL);
+ assert(packet->udp == NULL);
+ assert(packet->icmpv4 == NULL);
+ assert(packet->icmpv6 == expected_icmpv6);
+ /* Parsing is expected to leave the timestamp and flags at zero. */
+ assert(packet->time_usecs == 0);
+ assert(packet->flags == 0);
+
+ packet_free(packet);
+}
+
+int main(void)
+{ /* Run every parser test in turn. */
+ test_parse_tcp_ipv4_packet();
+ test_parse_tcp_ipv6_packet();
+ test_parse_udp_ipv4_packet();
+ test_parse_udp_ipv6_packet();
+ test_parse_ipv4_gre_ipv4_tcp_packet();
+ test_parse_ipv4_gre_mpls_ipv4_tcp_packet();
+ test_parse_icmpv4_packet();
+ test_parse_icmpv6_packet();
+ /* Reaching this point means none of the tests' asserts fired. */
+ return 0;
+}
diff --git a/test/packetdrill/packet_socket.h b/test/packetdrill/packet_socket.h
new file mode 100644
index 0000000..a2defd3
--- /dev/null
+++ b/test/packetdrill/packet_socket.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Platform-independent API to read and write raw packets.
+ *
+ * We allocate and configure things much like tcpdump. We do this so
+ * we can get timestamps on the outbound packets the kernel sends, to
+ * verify the correct timing (tun devices do not take timestamps).
+ */
+
+#ifndef __PACKET_SOCKET_H__
+#define __PACKET_SOCKET_H__
+
+#include "types.h"
+
+#include "ethernet.h"
+#include "ip_address.h"
+#include "packet.h"
+
+struct packet_socket;
+/* Opaque handle: each backend .c file supplies its own definition. */
+/* Allocate and initialize a packet socket for the named device. */
+extern struct packet_socket *packet_socket_new(const char *device_name);
+
+/* Free all the memory used by the packet socket. */
+extern void packet_socket_free(struct packet_socket *packet_socket);
+
+/* Add a filter so we only sniff packets we want. */
+extern void packet_socket_set_filter(
+ struct packet_socket *psock,
+ const struct ether_addr *client_ether_addr,
+ const struct ip_address *client_live_ip);
+
+/* Send the given packet using writev. Return STATUS_OK on success,
+ * or STATUS_ERR if writev returns an error.
+ */
+extern int packet_socket_writev(struct packet_socket *psock,
+ const struct iovec *iov, int iovcnt);
+
+/* Do a blocking sniff of the next packet going over the given device
+ * in the given direction, fill in the given packet with the sniffed
+ * packet info, and return the number of bytes in the packet in
+ * *in_bytes. If we successfully read a matching packet, return
+ * STATUS_OK; else return STATUS_ERR (in which case the caller can
+ * retry).
+ */
+extern int packet_socket_receive(struct packet_socket *psock,
+ enum direction_t direction,
+ struct packet *packet, int *in_bytes);
+
+#endif /* __PACKET_SOCKET_H__ */
diff --git a/test/packetdrill/packet_socket_linux.c b/test/packetdrill/packet_socket_linux.c
new file mode 100644
index 0000000..a1f49e2
--- /dev/null
+++ b/test/packetdrill/packet_socket_linux.c
@@ -0,0 +1,280 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * API to read and write raw packets implemented using Linux packet socket.
+ */
+
+#include "packet_socket.h"
+
+#include <errno.h>
+#include <net/if.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/uio.h>
+#include <unistd.h>
+
+#ifdef linux
+
+#include <netpacket/packet.h>
+#include <linux/filter.h>
+
+#include "assert.h"
+#include "ethernet.h"
+#include "logging.h"
+
+/* Number of bytes (2 MiB) to buffer in the packet socket used for sniffing. */
+static const int PACKET_SOCKET_RCVBUF_BYTES = 2*1024*1024;
+
+struct packet_socket { /* state for the Linux packet-socket backend */
+ int packet_fd; /* socket for sending, sniffing timestamped packets */
+ char *name; /* malloc-allocated copy of interface name */
+ int index; /* interface index from if_nametoindex */
+};
+
+/* Set the receive buffer for a socket to the given size in bytes. */
+static void set_receive_buffer_size(int fd, int bytes)
+{
+ if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &bytes, sizeof(bytes)) < 0)
+ die_perror("setsockopt SOL_SOCKET SO_RCVBUF"); /* failure is fatal */
+}
+
+/* Bind packet socket 'fd' to one interface, for all protocols (ETH_P_ALL). */
+static void bind_to_interface(int fd, int interface_index)
+{
+	struct sockaddr_ll addr;
+
+	memset(&addr, 0, sizeof(addr));
+	addr.sll_protocol = htons(ETH_P_ALL);
+	addr.sll_ifindex = interface_index;
+	addr.sll_family = AF_PACKET;
+	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
+		die_perror("bind packet socket");	/* failure is fatal */
+}
+
+/* Allocate and configure a packet socket just like the one tcpdump
+ * uses. We do this so we can get timestamps on the outbound packets
+ * the kernel sends, to verify the correct timing (tun devices do not
+ * take timestamps). To reduce CPU load and filtering complexity, we
+ * bind the socket to a single device so we only receive packets for
+ * that device.
+ */
+static void packet_socket_setup(struct packet_socket *psock)
+{
+ struct timeval tv; /* scratch for the warm-up SIOCGSTAMP call below */
+
+ psock->packet_fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
+ if (psock->packet_fd < 0)
+ die_perror("socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL))");
+ /* Resolve the interface name saved in psock->name to its index. */
+ psock->index = if_nametoindex(psock->name);
+ if (psock->index == 0)
+ die_perror("if_nametoindex");
+ DEBUGP("device index: %s -> %d\n", psock->name, psock->index);
+
+ bind_to_interface(psock->packet_fd, psock->index);
+
+ set_receive_buffer_size(psock->packet_fd, PACKET_SOCKET_RCVBUF_BYTES);
+
+ /* Pay the non-trivial latency cost to enable timestamps now, before
+ * the test starts, to avoid significant delays in the middle of tests.
+ */
+ ioctl(psock->packet_fd, SIOCGSTAMP, &tv); /* result and tv are unused */
+}
+
+/* Add a filter so we only sniff packets we want: those whose Ethernet
+ * source is client_ether_addr and whose IP source is client_live_ip. */
+void packet_socket_set_filter(struct packet_socket *psock,
+			      const struct ether_addr *client_ether_addr,
+			      const struct ip_address *client_live_ip)
+{
+	const u8 *client_ether = client_ether_addr->ether_addr_octet;
+	struct sock_fprog bpfcode;
+	struct sock_filter bpf_ipv4_src[] = {
+		/* this filter works for ethernet interfaces: */
+		/* tcpdump -p -n -s 0 -i lo -dd
+		 * "ether src 11:22:33:44:55:66 and ip src 1.2.3.4"
+		 */
+		{ 0x20, 0, 0, 0x00000008 },
+		{ 0x15, 0, 7, 0x33445566 },	/* ether: 33:44:55:66 */
+		{ 0x28, 0, 0, 0x00000006 },
+		{ 0x15, 0, 5, 0x00001122 },	/* ether: 11:22 */
+		{ 0x28, 0, 0, 0x0000000c },
+		{ 0x15, 0, 3, 0x00000800 },
+		{ 0x20, 0, 0, 0x0000001a },
+		{ 0x15, 0, 1, 0x01020304 },	/* IPv4: 1.2.3.4 */
+		{ 0x6, 0, 0, 0x0000ffff },
+		{ 0x6, 0, 0, 0x00000000 },
+	};
+	struct sock_filter bpf_ipv6_src[] = {
+		/* this filter works for ethernet interfaces: */
+		/* tcpdump -p -n -s 0 -i lo -dd
+		 * "ether src 11:22:33:44:55:66 and ip6 src 1:2:3:4:5:6:7:8" */
+		{ 0x20, 0, 0, 0x00000008 },
+		{ 0x15, 0, 13, 0x33445566 },	/* ether: 33:44:55:66 */
+		{ 0x28, 0, 0, 0x00000006 },
+		{ 0x15, 0, 11, 0x00001122 },	/* ether: 11:22 */
+		{ 0x28, 0, 0, 0x0000000c },
+		{ 0x15, 0, 9, 0x000086dd },
+		{ 0x20, 0, 0, 0x00000016 },
+		{ 0x15, 0, 7, 0x00010002 },	/* IPv6: 1:2 */
+		{ 0x20, 0, 0, 0x0000001a },
+		{ 0x15, 0, 5, 0x00030004 },	/* IPv6: 3:4 */
+		{ 0x20, 0, 0, 0x0000001e },
+		{ 0x15, 0, 3, 0x00050006 },	/* IPv6: 5:6 */
+		{ 0x20, 0, 0, 0x00000022 },
+		{ 0x15, 0, 1, 0x00070008 },	/* IPv6: 7:8 */
+		{ 0x6, 0, 0, 0x0000ffff },
+		{ 0x6, 0, 0, 0x00000000 },
+	};
+
+	if (client_live_ip->address_family == AF_INET) {
+		/* Fill in the client-side IPv4 address to look for. */
+		bpf_ipv4_src[7].k = ntohl(client_live_ip->ip.v4.s_addr);
+		bpfcode.len = ARRAY_SIZE(bpf_ipv4_src);
+		bpfcode.filter = bpf_ipv4_src;
+	} else if (client_live_ip->address_family == AF_INET6) {
+		/* Fill in the client-side IPv6 address to look for. */
+		bpf_ipv6_src[7].k = ntohl(client_live_ip->ip.v6.s6_addr32[0]);
+		bpf_ipv6_src[9].k = ntohl(client_live_ip->ip.v6.s6_addr32[1]);
+		bpf_ipv6_src[11].k = ntohl(client_live_ip->ip.v6.s6_addr32[2]);
+		bpf_ipv6_src[13].k = ntohl(client_live_ip->ip.v6.s6_addr32[3]);
+		bpfcode.len = ARRAY_SIZE(bpf_ipv6_src);
+		bpfcode.filter = bpf_ipv6_src;
+	} else {
+		/* NOTE(review): bpfcode is left unset on this path. */
+		assert(!"bad address family");
+	}
+
+	/* Fill in the client-side ethernet address to look for. Widen each
+	 * octet to u32 first so that shifting by 24 cannot overflow int. */
+	bpfcode.filter[1].k = (((u32)client_ether[2] << 24) |
+			       ((u32)client_ether[3] << 16) |
+			       ((u32)client_ether[4] << 8) |
+			       ((u32)client_ether[5]));
+	bpfcode.filter[3].k = (((u32)client_ether[0] << 8) |
+			       ((u32)client_ether[1]));
+
+	if (DEBUG_LOGGING) {
+		int i;
+		DEBUGP("filter constants:\n");
+		for (i = 0; i < bpfcode.len; ++i)
+			DEBUGP("0x%x\n", bpfcode.filter[i].k);
+	}
+
+	/* Attach the filter. */
+	if (setsockopt(psock->packet_fd, SOL_SOCKET, SO_ATTACH_FILTER,
+		       &bpfcode, sizeof(bpfcode)) < 0) {
+		die_perror("setsockopt SOL_SOCKET, SO_ATTACH_FILTER");
+	}
+}
+
+struct packet_socket *packet_socket_new(const char *device_name)
+{
+	struct packet_socket *psock = calloc(1, sizeof(*psock));	/* zeroed */
+	char *name = strdup(device_name);	/* private copy, freed with psock */
+	if (psock == NULL || name == NULL)	/* don't dereference NULL on OOM */
+		die_perror("packet_socket_new");	/* expected not to return */
+	psock->name = name;
+	psock->packet_fd = -1;			/* no socket opened yet */
+	packet_socket_setup(psock);		/* opens and binds the socket */
+	return psock;
+}
+
+/* Close the socket, release the name, then scrub and free the struct. */
+void packet_socket_free(struct packet_socket *psock)
+{
+	const int fd = psock->packet_fd;
+
+	if (fd >= 0)
+		close(fd);
+	free(psock->name);	/* free(NULL) is a no-op */
+	memset(psock, 0, sizeof(*psock)); /* paranoia to catch bugs*/
+	free(psock);
+}
+
+int packet_socket_writev(struct packet_socket *psock,
+			 const struct iovec *iov, int iovcnt)
+{
+	/* Only a failed call is an error; a short write is not detected. */
+	if (writev(psock->packet_fd, iov, iovcnt) >= 0)
+		return STATUS_OK;
+	perror("writev");
+	return STATUS_ERR;
+}
+
+int packet_socket_receive(struct packet_socket *psock,
+ enum direction_t direction,
+ struct packet *packet, int *in_bytes)
+{
+ struct sockaddr_ll from; /* filled in by recvfrom() below */
+ memset(&from, 0, sizeof(from));
+ socklen_t from_len = sizeof(from);
+
+ /* Read the packet out of our kernel packet socket buffer. */
+ *in_bytes = recvfrom(psock->packet_fd,
+ packet->buffer, packet->buffer_bytes, 0,
+ (struct sockaddr *)&from, &from_len);
+ assert(*in_bytes <= packet->buffer_bytes);
+ if (*in_bytes < 0) {
+ if (errno == EINTR) {
+ DEBUGP("EINTR\n");
+ return STATUS_ERR; /* interrupted; the caller can retry */
+ } else {
+ die_perror("packet socket recvfrom()");
+ }
+ }
+
+ /* We only want packets our kernel is sending out. */
+ if (direction == DIRECTION_OUTBOUND &&
+ from.sll_pkttype != PACKET_OUTGOING) {
+ DEBUGP("not outbound\n");
+ return STATUS_ERR;
+ }
+ if (direction == DIRECTION_INBOUND &&
+ from.sll_pkttype != PACKET_HOST) {
+ DEBUGP("not inbound\n");
+ return STATUS_ERR;
+ }
+
+ /* We only want packets on our tun device. The kernel
+ * can put packets for other devices in our receive
+ * buffer before we bind the packet socket to the tun
+ * device.
+ */
+ if (from.sll_ifindex != psock->index) {
+ DEBUGP("not correct index\n");
+ return STATUS_ERR;
+ }
+
+ /* Get the time at which the kernel sniffed the packet. */
+ struct timeval tv;
+ if (ioctl(psock->packet_fd, SIOCGSTAMP, &tv) < 0)
+ die_perror("SIOCGSTAMP");
+ packet->time_usecs = timeval_to_usecs(&tv);
+ DEBUGP("sniffed packet sent at %u.%u = %lld\n",
+ (u32)tv.tv_sec, (u32)tv.tv_usec,
+ packet->time_usecs);
+ /* A packet matching direction and interface was read and timestamped. */
+ return STATUS_OK;
+}
+
+#endif /* linux */
diff --git a/test/packetdrill/packet_socket_pcap.c b/test/packetdrill/packet_socket_pcap.c
new file mode 100644
index 0000000..bedef71
--- /dev/null
+++ b/test/packetdrill/packet_socket_pcap.c
@@ -0,0 +1,290 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * API to read and write raw packets implemented using pcap.
+ */
+
+#include "packet_socket.h"
+
+#include <errno.h>
+#include <net/if.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/uio.h>
+#include <unistd.h>
+
+#ifdef USE_LIBPCAP
+
+#if defined(__FreeBSD__)
+#include <pcap/pcap.h>
+#elif defined(__OpenBSD__) || defined(__NetBSD__)
+#include <pcap.h>
+#endif
+
+#include "assert.h"
+#include "ethernet.h"
+#include "logging.h"
+
+/* State for one pcap-backed packet socket: the interface name plus
+ * the libpcap handle that this file uses both to inject packets
+ * (pcap_inject) and to sniff them (pcap_next_ex).
+ */
+struct packet_socket {
+ char *name; /* malloc-allocated copy of interface name */
+
+ pcap_t *pcap; /* handle for sending, sniffing timestamped packets */
+ char pcap_error[PCAP_ERRBUF_SIZE]; /* for libpcap errors */
+ int pcap_offset; /* offset of packet data in pcap buffer */
+};
+
+#if defined(__OpenBSD__)
+#include <net/bpf.h>
+/* Return the time held in the bpf_timeval 'tv' as a single count of
+ * microseconds.
+ */
+static inline s64 bpf_timeval_to_usecs(const struct bpf_timeval *tv)
+{
+	const s64 whole_secs = (s64)tv->tv_sec;
+	const s64 frac_usecs = (s64)tv->tv_usec;
+
+	return whole_secs * 1000000LL + frac_usecs;
+}
+#endif /* defined(__OpenBSD__) */
+
+/* Report a libpcap failure prefixed by 'message' and then exit with a
+ * failure status code.
+ *
+ * When 'pcap' is a valid handle the text comes from pcap_perror().
+ * A NULL handle (e.g. after a failed pcap_create()) has no error
+ * buffer to read, so it must not be handed to pcap_perror(); in that
+ * case only 'message' is reported through die().
+ */
+extern void die_pcap_perror(pcap_t *pcap, char *message)
+{
+	if (pcap != NULL)
+		pcap_perror(pcap, message);
+	else
+		die("%s: no pcap handle available\n", message);
+
+	exit(EXIT_FAILURE);
+}
+
+/* Open and activate a pcap handle on the interface psock->name,
+ * switch the underlying BPF descriptor to immediate mode, and record
+ * in psock->pcap_offset where packet data starts for the interface's
+ * data link type. Every failure is fatal.
+ */
+static void packet_socket_setup(struct packet_socket *psock)
+{
+	int data_link = -1, bpf_fd = -1, val = -1;
+
+	DEBUGP("calling pcap_create() with %s\n", psock->name);
+	psock->pcap = pcap_create(psock->name, psock->pcap_error);
+	/* On failure there is no handle to query; pcap_create() puts
+	 * its error text in the buffer we passed, so report that.
+	 */
+	if (psock->pcap == NULL)
+		die("pcap_create: %s\n", psock->pcap_error);
+
+	if (pcap_set_snaplen(psock->pcap, PACKET_READ_BYTES) != 0)
+		die_pcap_perror(psock->pcap, "pcap_set_snaplen");
+
+	if (pcap_activate(psock->pcap) != 0)
+		die_pcap_perror(psock->pcap,
+				"pcap_activate "
+				"(OpenBSD: another process (tcpdump?) "
+				"using bpf0?)");
+
+	bpf_fd = pcap_get_selectable_fd(psock->pcap);
+	if (bpf_fd < 0)
+		die_pcap_perror(psock->pcap, "pcap_get_selectable_fd");
+
+	/* By default libpcap with BPF waits until a read buffer fills
+	 * up before returning any packets. We use BIOCIMMEDIATE to
+	 * force the BPF device to return the first packet
+	 * immediately.
+	 */
+	val = 1;
+	if (ioctl(bpf_fd, BIOCIMMEDIATE, &val) < 0)
+		die_perror("ioctl BIOCIMMEDIATE on bpf fd");
+
+	/* Find data link type. */
+	data_link = pcap_datalink(psock->pcap);
+	DEBUGP("data_link: %d\n", data_link);
+
+	/* Based on the data_link type, calculate the offset of the
+	 * packet data in the buffer.
+	 */
+	switch (data_link) {
+	case DLT_EN10MB:
+		psock->pcap_offset = 0;
+		break;
+	case DLT_LOOP:
+	case DLT_NULL:
+		psock->pcap_offset = 4;
+		break;
+	case DLT_SLIP:
+	case DLT_RAW:
+		psock->pcap_offset = 0;
+		break;
+	default:
+		die("Unknown data_link type %d\n", data_link);
+		break;
+	}
+}
+
+/* Add a filter so we only sniff packets we want: frames whose
+ * ethernet source is 'client_ether_addr' and whose IP (or IPv6)
+ * source is 'client_live_ip'. Any failure to build, compile or
+ * install the filter is fatal.
+ */
+void packet_socket_set_filter(struct packet_socket *psock,
+			      const struct ether_addr *client_ether_addr,
+			      const struct ip_address *client_live_ip)
+{
+	const u8 *client_ether = client_ether_addr->ether_addr_octet;
+	struct bpf_program bpf_code;
+	char *filter_str = NULL;
+	char client_live_ip_string[ADDR_STR_LEN];
+
+	ip_to_string(client_live_ip, client_live_ip_string);
+
+	/* asprintf() leaves filter_str undefined when it fails, so
+	 * the result must be checked before the string is used.
+	 */
+	if (asprintf(&filter_str,
+		     "ether src %02x:%02x:%02x:%02x:%02x:%02x and %s src %s",
+		     client_ether[0],
+		     client_ether[1],
+		     client_ether[2],
+		     client_ether[3],
+		     client_ether[4],
+		     client_ether[5],
+		     client_live_ip->address_family == AF_INET6 ? "ip6" : "ip",
+		     client_live_ip_string) < 0)
+		die("asprintf failed while building BPF filter\n");
+
+	DEBUGP("setting BPF filter: %s\n", filter_str);
+
+	if (pcap_compile(psock->pcap, &bpf_code, filter_str, 1, 0) != 0)
+		die_pcap_perror(psock->pcap, "pcap_compile");
+
+	if (pcap_setfilter(psock->pcap, &bpf_code) != 0)
+		die_pcap_perror(psock->pcap, "pcap_setfilter");
+
+	pcap_freecode(&bpf_code);
+	free(filter_str);
+}
+
+/* Allocate a packet socket for the interface 'device_name' and open
+ * its pcap handle. The returned object is released with
+ * packet_socket_free(). Allocation failures are fatal rather than
+ * being left to surface later as a NULL dereference.
+ */
+struct packet_socket *packet_socket_new(const char *device_name)
+{
+	struct packet_socket *psock = calloc(1, sizeof(*psock));
+
+	if (psock == NULL)
+		die_perror("calloc");
+
+	psock->name = strdup(device_name);
+	if (psock->name == NULL)
+		die_perror("strdup");
+
+	packet_socket_setup(psock);
+
+	return psock;
+}
+
+/* Release everything owned by 'psock': the interface name copy, the
+ * pcap handle, and finally the struct itself.
+ */
+void packet_socket_free(struct packet_socket *psock)
+{
+	/* free(NULL) is a no-op, so an unset name needs no guard. */
+	free(psock->name);
+	pcap_close(psock->pcap);
+
+	/* Paranoia to catch bugs: scrub the struct before freeing. */
+	memset(psock, 0, sizeof(*psock));
+	free(psock);
+}
+
+/* Inject the packet described by the 'iovcnt' entries of 'iov'.
+ * pcap_inject() only accepts one contiguous buffer, so the pieces are
+ * first copied into a temporary linear buffer. Returns STATUS_OK;
+ * allocation or injection failures are fatal.
+ */
+int packet_socket_writev(struct packet_socket *psock,
+			 const struct iovec *iov, int iovcnt)
+{
+	/* Copy the ethernet header and IP datagram into a single buffer,
+	 * since that's all the pcap API supports. TODO: optimize this.
+	 */
+
+	u8 *buf = NULL, *p = NULL;
+	int len = 0, i = 0;
+
+	/* Calculate how much space we need. */
+	for (i = 0; i < iovcnt; ++i)
+		len += iov[i].iov_len;
+
+	/* Ask for at least one byte so that a NULL result always
+	 * means failure, even for an empty packet.
+	 */
+	buf = malloc(len > 0 ? len : 1);
+	if (buf == NULL)
+		die_perror("malloc");
+
+	/* Copy into the linear buffer. */
+	p = buf;
+	for (i = 0; i < iovcnt; ++i) {
+		memcpy(p, iov[i].iov_base, iov[i].iov_len);
+		p += iov[i].iov_len;
+	}
+
+	DEBUGP("calling pcap_inject with %d bytes\n", len);
+
+	if (pcap_inject(psock->pcap, buf, len) != len)
+		die_pcap_perror(psock->pcap, "pcap_inject");
+
+	free(buf);
+	return STATUS_OK;
+}
+
+/* Fetch the next packet from the pcap handle. On success the packet
+ * bytes that follow the first psock->pcap_offset bytes are copied
+ * into packet->buffer, their count is stored in *in_bytes, the
+ * capture timestamp is stored in packet->time_usecs, and STATUS_OK
+ * is returned. Returns STATUS_ERR when pcap has no packet yet. Other
+ * pcap errors, and truncated captures, are fatal.
+ */
+int packet_socket_receive(struct packet_socket *psock,
+			  enum direction_t direction,
+			  struct packet *packet, int *in_bytes)
+{
+	struct pcap_pkthdr *pkt_header = NULL;
+	const u8 *pkt_data = NULL;
+	int rc = 0;
+
+	DEBUGP("calling pcap_next_ex()\n");
+
+	/* Something about the way we're doing BIOCIMMEDIATE
+	 * causes libpcap to return 0 if there's no packet
+	 * yet, which forces the caller to keep retrying until
+	 * there's a packet available. If, on the other hand,
+	 * we hack libpcap itself to enable its internal
+	 * BIOCIMMEDIATE code path that it currently only uses
+	 * for AIX, then we don't have to spin.
+	 * TODO(ncardwell): fix this.
+	 */
+	do {
+		rc = pcap_next_ex(psock->pcap, &pkt_header, &pkt_data);
+		if (rc == 0)
+			return STATUS_ERR;	/* no packet yet */
+		if (rc == -1)
+			die_pcap_perror(psock->pcap, "pcap_next_ex");
+		else if (rc == -2)
+			die("pcap_next_ex: EOF in save file?!\n");
+		else if (rc != 1)
+			die("pcap_next_ex: status: %d\n", rc);
+	} while (rc != 1);	/* rc == 1: got a packet */
+
+	DEBUGP("time: %u . %u\n",
+	       (u32)pkt_header->ts.tv_sec,
+	       (u32)pkt_header->ts.tv_usec);
+
+#if defined(__FreeBSD__) || defined(__NetBSD__)
+	packet->time_usecs = timeval_to_usecs(&pkt_header->ts);
+#elif defined(__OpenBSD__)
+	packet->time_usecs = bpf_timeval_to_usecs(&pkt_header->ts);
+#else
+	packet->time_usecs = implement_me("implement me for your platform");
+#endif  /* defined(__OpenBSD__) */
+
+	DEBUGP("time_usecs= %llu\n", packet->time_usecs);
+
+	DEBUGP("pcap_next_ex: caplen:%u len:%u offset:%d\n",
+	       pkt_header->caplen, pkt_header->len, psock->pcap_offset);
+
+	if (DEBUG_LOGGING) {
+		/* Log a hex dump of the packet as pcap delivered it. */
+		char *hex = NULL;
+
+		hex_dump(pkt_data, pkt_header->caplen, &hex);
+		DEBUGP("pkt from pcap:\n%s\n", hex);
+		free(hex);
+	}
+
+	/* A capture shorter than the packet means pcap truncated it. */
+	if (pkt_header->caplen != pkt_header->len) {
+		die("libpcap unable to capture full packet: "
+		    "caplen %u != len %u\n",
+		    pkt_header->caplen, pkt_header->len);
+	}
+	assert(pkt_header->len <= packet->buffer_bytes);
+
+	/* Skip the first pcap_offset bytes and hand back the rest. */
+	assert(pkt_header->len > psock->pcap_offset);
+	*in_bytes = pkt_header->len - psock->pcap_offset;
+	memcpy(packet->buffer, pkt_data + psock->pcap_offset, *in_bytes);
+
+	return STATUS_OK;
+}
+
+#endif /* USE_LIBPCAP */
diff --git a/test/packetdrill/packet_to_string.c b/test/packetdrill/packet_to_string.c
new file mode 100644
index 0000000..1fd90b2
--- /dev/null
+++ b/test/packetdrill/packet_to_string.c
@@ -0,0 +1,303 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Implementation for generating human-readable representations of IP
+ * packets.
+ */
+
+#include "packet_to_string.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include "socket.h"
+#include "tcp_options_to_string.h"
+
+/* Write "src_ip:src_port > dst_ip:dst_port" for 'packet' to 's',
+ * using the address/port tuple reported by get_packet_tuple().
+ */
+static void endpoints_to_string(FILE *s, const struct packet *packet)
+{
+	struct tuple flow;
+	char src_text[ADDR_STR_LEN];
+	char dst_text[ADDR_STR_LEN];
+
+	get_packet_tuple(packet, &flow);
+
+	fprintf(s, "%s:%u > %s:%u",
+		ip_to_string(&flow.src.ip, src_text), ntohs(flow.src.port),
+		ip_to_string(&flow.dst.ip, dst_text), ntohs(flow.dst.port));
+}
+
+/* Write a newline followed by a hex dump of the packet's bytes, from
+ * packet->buffer up to packet_end(packet), to 's'.
+ */
+static void packet_buffer_to_string(FILE *s, struct packet *packet)
+{
+	char *dump = NULL;
+
+	hex_dump(packet->buffer, packet_end(packet) - packet->buffer, &dump);
+	fprintf(s, "\n%s", dump);
+	free(dump);
+}
+
+/* Write "ipv4 SRC > DST: " for the IPv4 header at index 'layer' of
+ * packet->headers. 'format' and 'error' are unused. Always returns
+ * STATUS_OK.
+ */
+static int ipv4_header_to_string(FILE *s, struct packet *packet, int layer,
+				 enum dump_format_t format, char **error)
+{
+	const struct ipv4 *hdr = packet->headers[layer].h.ipv4;
+	struct ip_address from, to;
+	char from_text[ADDR_STR_LEN];
+	char to_text[ADDR_STR_LEN];
+
+	ip_from_ipv4(&hdr->src_ip, &from);
+	ip_from_ipv4(&hdr->dst_ip, &to);
+
+	fprintf(s, "ipv4 %s > %s: ",
+		ip_to_string(&from, from_text),
+		ip_to_string(&to, to_text));
+
+	return STATUS_OK;
+}
+
+/* Write "ipv6 SRC > DST: " for the IPv6 header at index 'layer' of
+ * packet->headers. 'format' and 'error' are unused. Always returns
+ * STATUS_OK.
+ */
+static int ipv6_header_to_string(FILE *s, struct packet *packet, int layer,
+				 enum dump_format_t format, char **error)
+{
+	const struct ipv6 *hdr = packet->headers[layer].h.ipv6;
+	struct ip_address from, to;
+	char from_text[ADDR_STR_LEN];
+	char to_text[ADDR_STR_LEN];
+
+	ip_from_ipv6(&hdr->src_ip, &from);
+	ip_from_ipv6(&hdr->dst_ip, &to);
+
+	fprintf(s, "ipv6 %s > %s: ",
+		ip_to_string(&from, from_text),
+		ip_to_string(&to, to_text));
+
+	return STATUS_OK;
+}
+
+/* Write the GRE header at index 'layer' of packet->headers: flags and
+ * protocol first, then whichever of checksum/offset, key and sequence
+ * number the header's has_* bits mark as present, then ": ".
+ * 'format' and 'error' are unused. Always returns STATUS_OK.
+ */
+static int gre_header_to_string(FILE *s, struct packet *packet, int layer,
+				enum dump_format_t format, char **error)
+{
+	const struct gre *hdr = packet->headers[layer].h.gre;
+	int word = 0;	/* be32[] index of the next optional field */
+
+	fprintf(s, "gre flags 0x%x proto 0x%04x",
+		ntohs(hdr->flags),
+		ntohs(hdr->proto));
+
+	/* Checksum and offset are read as the two halves of the
+	 * first word, so either flag uses up one be32[] slot.
+	 */
+	if (hdr->has_checksum || hdr->has_routing) {
+		fprintf(s, " sum 0x%x off 0x%x",
+			ntohs(hdr->be16[0]),
+			ntohs(hdr->be16[1]));
+		word = 1;
+	}
+
+	if (hdr->has_key) {
+		fprintf(s, " key 0x%x", ntohl(hdr->be32[word]));
+		word++;
+	}
+
+	if (hdr->has_seq)
+		fprintf(s, " seq 0x%x", ntohl(hdr->be32[word]));
+
+	fprintf(s, ": ");
+	return STATUS_OK;
+}
+
+/* Write the MPLS label stack at index 'layer' of packet->headers as
+ * "mpls (label L, tc T,[ [S],] ttl N) ...: ", one parenthesised group
+ * per stack entry. 'format' and 'error' are unused. Always returns
+ * STATUS_OK.
+ */
+static int mpls_header_to_string(FILE *s, struct packet *packet, int layer,
+				 enum dump_format_t format, char **error)
+{
+	struct header *stack = &packet->headers[layer];
+	/* The header is a run of fixed-size label stack entries. */
+	int entry_count = stack->header_bytes / sizeof(struct mpls);
+	int idx;
+
+	fprintf(s, "mpls");
+
+	for (idx = 0; idx < entry_count; ++idx) {
+		const struct mpls *entry = &stack->h.mpls[idx];
+
+		fprintf(s, " (label %u, tc %u,%s ttl %u)",
+			mpls_entry_label(entry),
+			mpls_entry_tc(entry),
+			mpls_entry_stack(entry) ? " [S]," : "",
+			mpls_entry_ttl(entry));
+	}
+
+	fprintf(s, ": ");
+	return STATUS_OK;
+}
+
+/* Print a string representation of the TCP packet:
+ * direction opt_ip_info flags seq ack window tcp_options
+ *
+ * The endpoints are printed only for DUMP_FULL and DUMP_VERBOSE, and
+ * the hex dump only for DUMP_VERBOSE. Returns STATUS_OK, or
+ * STATUS_ERR if tcp_options_to_string() fails; in the failure case
+ * the text before the options has already been written to 's'.
+ */
+static int tcp_packet_to_string(FILE *s, struct packet *packet,
+ enum dump_format_t format, char **error)
+{
+ int result = STATUS_OK; /* return value */
+
+ if ((format == DUMP_FULL) || (format == DUMP_VERBOSE)) {
+ endpoints_to_string(s, packet);
+ fputc(' ', s);
+ }
+
+
+ /* We print flags in the same order as tcpdump 4.1.1. */
+ if (packet->tcp->fin)
+ fputc('F', s);
+ if (packet->tcp->syn)
+ fputc('S', s);
+ if (packet->tcp->rst)
+ fputc('R', s);
+ if (packet->tcp->psh)
+ fputc('P', s);
+ if (packet->tcp->ack)
+ fputc('.', s);
+ if (packet->tcp->urg)
+ fputc('U', s);
+ if (packet->tcp->ece)
+ fputc('E', s); /* ECN *E*cho sent (ECN) */
+ if (packet->tcp->cwr)
+ fputc('W', s); /* Congestion *W*indow reduced (ECN) */
+
+ /* start_seq:end_seq(payload_length) */
+ fprintf(s, " %u:%u(%u) ",
+ ntohl(packet->tcp->seq),
+ ntohl(packet->tcp->seq) + packet_payload_len(packet),
+ packet_payload_len(packet));
+
+ /* The ack number is only printed when the ACK flag is set. */
+ if (packet->tcp->ack)
+ fprintf(s, "ack %u ", ntohl(packet->tcp->ack_seq));
+
+ /* The window is omitted for packets flagged FLAG_WIN_NOCHECK. */
+ if (!(packet->flags & FLAG_WIN_NOCHECK))
+ fprintf(s, "win %u ", ntohs(packet->tcp->window));
+
+ if (packet_tcp_options_len(packet) > 0) {
+ char *tcp_options = NULL;
+ if (tcp_options_to_string(packet, &tcp_options, error))
+ result = STATUS_ERR;
+ else
+ fprintf(s, "<%s>", tcp_options);
+ free(tcp_options);
+ }
+
+ if (format == DUMP_VERBOSE)
+ packet_buffer_to_string(s, packet);
+
+ return result;
+}
+
+/* Print a string representation of the UDP packet: optional
+ * endpoints (DUMP_FULL and DUMP_VERBOSE), "udp (payload_length)", and
+ * a hex dump for DUMP_VERBOSE. 'error' is unused. Always returns
+ * STATUS_OK.
+ */
+static int udp_packet_to_string(FILE *s, struct packet *packet,
+				enum dump_format_t format, char **error)
+{
+	const int verbose = (format == DUMP_VERBOSE);
+
+	if (verbose || (format == DUMP_FULL)) {
+		endpoints_to_string(s, packet);
+		fputc(' ', s);
+	}
+
+	fprintf(s, "udp (%u)", packet_payload_len(packet));
+
+	if (verbose)
+		packet_buffer_to_string(s, packet);
+
+	return STATUS_OK;
+}
+
+static int icmpv4_packet_to_string(FILE *s, struct packet *packet,
+ enum dump_format_t format, char **error)
+{
+ fprintf(s, "icmpv4");
+ /* TODO(ncardwell): print type, code; use tables from icmp_packet.c */
+ return STATUS_OK;
+}
+
+static int icmpv6_packet_to_string(FILE *s, struct packet *packet,
+ enum dump_format_t format, char **error)
+{
+ fprintf(s, "icmpv6");
+ /* TODO(ncardwell): print type, code; use tables from icmp_packet.c */
+ return STATUS_OK;
+}
+
+/* Signature shared by the per-header-type printers below. */
+typedef int (*header_to_string_func)(FILE *s, struct packet *packet, int layer,
+				     enum dump_format_t format, char **error);
+
+/* Print the encapsulation header at index 'layer' of packet->headers
+ * by dispatching on its type. Asserts that the type is in range and
+ * has a printer. Returns whatever the selected printer returns.
+ */
+static int encap_header_to_string(FILE *s, struct packet *packet, int layer,
+				  enum dump_format_t format, char **error)
+{
+	/* Types without an entry stay NULL and are caught below. */
+	static const header_to_string_func printers[HEADER_NUM_TYPES] = {
+		[HEADER_IPV4]	= ipv4_header_to_string,
+		[HEADER_IPV6]	= ipv6_header_to_string,
+		[HEADER_GRE]	= gre_header_to_string,
+		[HEADER_MPLS]	= mpls_header_to_string,
+	};
+	const enum header_t type = packet->headers[layer].type;
+	header_to_string_func printer;
+
+	assert(type > HEADER_NONE);
+	assert(type < HEADER_NUM_TYPES);
+
+	printer = printers[type];
+	assert(printer != NULL);
+
+	return printer(s, packet, layer, format, error);
+}
+
+
+/* Build a human-readable description of 'packet' in a newly allocated
+ * string returned through *ascii_string. Encapsulation headers that
+ * precede the last two headers are printed first, followed by the
+ * TCP, UDP or ICMP summary selected by which of packet->tcp, udp,
+ * icmpv4, icmpv6 is set.
+ *
+ * Returns STATUS_OK on success. Returns STATUS_ERR if the output
+ * stream cannot be created (in which case *error, when 'error' is
+ * non-NULL, receives a strdup()ed message) or if one of the printers
+ * fails.
+ */
+int packet_to_string(struct packet *packet,
+		     enum dump_format_t format,
+		     char **ascii_string, char **error)
+{
+	assert(packet != NULL);
+	int result = STATUS_ERR;	/* return value */
+	size_t size = 0;
+	FILE *s = open_memstream(ascii_string, &size);  /* output string */
+	int i;
+	int header_count;
+
+	/* open_memstream() returns NULL on failure; writing to a NULL
+	 * stream below would crash, so bail out with an error instead.
+	 */
+	if (s == NULL) {
+		if (error != NULL)
+			*error = strdup("open_memstream failed");
+		return STATUS_ERR;
+	}
+
+	header_count = packet_header_count(packet);
+
+	/* Print any encapsulation headers preceding layer 3 and 4 headers. */
+	for (i = 0; i < header_count - 2; ++i) {
+		if (packet->headers[i].type == HEADER_NONE)
+			break;
+		if (encap_header_to_string(s, packet, i, format, error))
+			goto out;
+	}
+
+	if ((packet->ipv4 == NULL) && (packet->ipv6 == NULL)) {
+		fprintf(s, "[NO IP HEADER]");
+	} else {
+		if (packet->tcp != NULL) {
+			if (tcp_packet_to_string(s, packet, format, error))
+				goto out;
+		} else if (packet->udp != NULL) {
+			if (udp_packet_to_string(s, packet, format, error))
+				goto out;
+		} else if (packet->icmpv4 != NULL) {
+			if (icmpv4_packet_to_string(s, packet, format, error))
+				goto out;
+		} else if (packet->icmpv6 != NULL) {
+			if (icmpv6_packet_to_string(s, packet, format, error))
+				goto out;
+		} else {
+			fprintf(s, "[NO TCP OR ICMP HEADER]");
+		}
+	}
+
+	result = STATUS_OK;
+
+out:
+	/* Closing the stream is done on every path, success or not. */
+	fclose(s);
+	return result;
+}
diff --git a/test/packetdrill/packet_to_string.h b/test/packetdrill/packet_to_string.h
new file mode 100644
index 0000000..462a4f9
--- /dev/null
+++ b/test/packetdrill/packet_to_string.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Interface for generating human-readable representations of IP packets.
+ */
+
+#ifndef __PACKET_TO_STRING_H__
+#define __PACKET_TO_STRING_H__
+
+#include "packet.h"
+
+/* Level of detail requested from packet_to_string(). */
+enum dump_format_t {
+ DUMP_SHORT, /* brief format used in scripts */
+ DUMP_FULL, /* add local and remote address and port */
+ DUMP_VERBOSE, /* add hex dump */
+};
+
+/* Returns in *ascii_string a human-readable representation of the
+ * packet 'packet', with the level of detail selected by 'format'.
+ * Returns STATUS_OK on success; on failure returns
+ * STATUS_ERR and sets error message.
+ * On success the string is heap-allocated and the caller releases it
+ * with free().
+ */
+extern int packet_to_string(struct packet *packet,
+ enum dump_format_t format,
+ char **ascii_string, char **error);
+
+#endif /* __PACKET_TO_STRING_H__ */
diff --git a/test/packetdrill/packet_to_string_test.c b/test/packetdrill/packet_to_string_test.c
new file mode 100644
index 0000000..814cddb
--- /dev/null
+++ b/test/packetdrill/packet_to_string_test.c
@@ -0,0 +1,301 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Test for generating human-readable representations of IP packets.
+ */
+
+#include "packet_to_string.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include "assert.h"
+#include "packet_parser.h"
+
+/* Parse a fixed IPv4/GRE/IPv4/TCP packet and check the exact text
+ * that packet_to_string() produces for each of the three dump
+ * formats.
+ */
+static void test_tcp_ipv4_packet_to_string(void)
+{
+ /* An IPv4/GRE/IPv4/TCP packet. */
+ u8 data[] = {
+ /* IPv4: */
+ 0x45, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0x2f, 0xb5, 0x75, 0x02, 0x02, 0x02, 0x02,
+ 0x01, 0x01, 0x01, 0x01,
+ /* GRE: */
+ 0x00, 0x00, 0x08, 0x00,
+ /* IPv4, TCP: */
+ 0x45, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0x06, 0x39, 0x11, 0xc0, 0x00, 0x02, 0x01,
+ 0xc0, 0xa8, 0x00, 0x01, 0xcf, 0x3f, 0x1f, 0x90,
+ 0x00, 0x00, 0x00, 0x01, 0x83, 0x4d, 0xa5, 0x5b,
+ 0xa0, 0x10, 0x01, 0x01, 0xdb, 0x2d, 0x00, 0x00,
+ 0x05, 0x0a, 0x83, 0x4d, 0xab, 0x03, 0x83, 0x4d,
+ 0xb0, 0xab, 0x08, 0x0a, 0x00, 0x00, 0x01, 0x2c,
+ 0x60, 0xc2, 0x18, 0x20
+ };
+
+ struct packet *packet = packet_new(sizeof(data));
+
+ /* Populate and parse a packet */
+ memcpy(packet->buffer, data, sizeof(data));
+ char *error = NULL;
+ enum packet_parse_result_t result =
+ parse_packet(packet, sizeof(data), PACKET_LAYER_3_IP,
+ &error);
+ assert(result == PACKET_OK);
+ assert(error == NULL);
+
+ int status = 0;
+ char *dump = NULL, *expected = NULL;
+
+ /* Test a DUMP_SHORT dump */
+ status = packet_to_string(packet, DUMP_SHORT, &dump, &error);
+ assert(status == STATUS_OK);
+ assert(error == NULL);
+ printf("dump = '%s'\n", dump);
+ expected =
+ "ipv4 2.2.2.2 > 1.1.1.1: gre flags 0x0 proto 0x0800: "
+ ". 1:1(0) ack 2202903899 win 257 "
+ "<sack 2202905347:2202906795,TS val 300 ecr 1623332896>";
+ assert(strcmp(dump, expected) == 0);
+ free(dump);
+
+ /* Test a DUMP_FULL dump */
+ status = packet_to_string(packet, DUMP_FULL, &dump, &error);
+ assert(status == STATUS_OK);
+ assert(error == NULL);
+ printf("dump = '%s'\n", dump);
+ expected =
+ "ipv4 2.2.2.2 > 1.1.1.1: gre flags 0x0 proto 0x0800: "
+ "192.0.2.1:53055 > 192.168.0.1:8080 "
+ ". 1:1(0) ack 2202903899 win 257 "
+ "<sack 2202905347:2202906795,TS val 300 ecr 1623332896>";
+ assert(strcmp(dump, expected) == 0);
+ free(dump);
+
+ /* Test a DUMP_VERBOSE dump */
+ status = packet_to_string(packet, DUMP_VERBOSE, &dump, &error);
+ assert(status == STATUS_OK);
+ assert(error == NULL);
+ printf("dump = '%s'\n", dump);
+ expected =
+ "ipv4 2.2.2.2 > 1.1.1.1: gre flags 0x0 proto 0x0800: "
+ "192.0.2.1:53055 > 192.168.0.1:8080 "
+ ". 1:1(0) ack 2202903899 win 257 "
+ "<sack 2202905347:2202906795,TS val 300 ecr 1623332896>"
+ "\n"
+ "0x0000: 45 00 00 54 00 00 00 00 ff 2f b5 75 02 02 02 02 " "\n"
+ "0x0010: 01 01 01 01 00 00 08 00 45 00 00 3c 00 00 00 00 " "\n"
+ "0x0020: ff 06 39 11 c0 00 02 01 c0 a8 00 01 cf 3f 1f 90 " "\n"
+ "0x0030: 00 00 00 01 83 4d a5 5b a0 10 01 01 db 2d 00 00 " "\n"
+ "0x0040: 05 0a 83 4d ab 03 83 4d b0 ab 08 0a 00 00 01 2c " "\n"
+ "0x0050: 60 c2 18 20 " "\n";
+ assert(strcmp(dump, expected) == 0);
+ free(dump);
+
+ packet_free(packet);
+}
+
+/* Parse a fixed IPv6/GRE/IPv6/TCP packet and check the exact text
+ * that packet_to_string() produces for each of the three dump
+ * formats.
+ */
+static void test_tcp_ipv6_packet_to_string(void)
+{
+ /* An IPv6/GRE/IPv6/TCP packet. */
+ u8 data[] = {
+ /* IPv6: */
+ 0x60, 0x00, 0x00, 0x00, 0x00, 0x4c, 0x2f, 0xff,
+ 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x22, 0x22,
+ 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x11,
+ /* GRE: */
+ 0x00, 0x00, 0x86, 0xdd,
+ /* IPv6, TCP: */
+ 0x60, 0x00, 0x00, 0x00, 0x00, 0x20, 0x06, 0xff,
+ 0x20, 0x01, 0x0d, 0xb8, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+ 0xfd, 0x3d, 0xfa, 0x7b, 0xd1, 0x7d, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+ 0xd3, 0xe2, 0x1f, 0x90, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x80, 0x02, 0x80, 0x18,
+ 0x06, 0x60, 0x00, 0x00, 0x02, 0x04, 0x03, 0xe8,
+ 0x04, 0x02, 0x01, 0x01, 0x01, 0x03, 0x03, 0x07,
+ };
+
+ struct packet *packet = packet_new(sizeof(data));
+
+ /* Populate and parse a packet */
+ memcpy(packet->buffer, data, sizeof(data));
+ char *error = NULL;
+ enum packet_parse_result_t result =
+ parse_packet(packet, sizeof(data), PACKET_LAYER_3_IP,
+ &error);
+ assert(result == PACKET_OK);
+ assert(error == NULL);
+
+ int status = 0;
+ char *dump = NULL, *expected = NULL;
+
+ /* Test a DUMP_SHORT dump */
+ status = packet_to_string(packet, DUMP_SHORT, &dump, &error);
+ assert(status == STATUS_OK);
+ assert(error == NULL);
+ printf("dump = '%s'\n", dump);
+ expected =
+ "ipv6 2::2222 > 1::1111: gre flags 0x0 proto 0x86dd: "
+ "S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>";
+ assert(strcmp(dump, expected) == 0);
+ free(dump);
+
+ /* Test a DUMP_FULL dump */
+ status = packet_to_string(packet, DUMP_FULL, &dump, &error);
+ assert(status == STATUS_OK);
+ assert(error == NULL);
+ printf("dump = '%s'\n", dump);
+ expected =
+ "ipv6 2::2222 > 1::1111: gre flags 0x0 proto 0x86dd: "
+ "2001:db8::1:54242 > fd3d:fa7b:d17d::1:8080 "
+ "S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>";
+ assert(strcmp(dump, expected) == 0);
+ free(dump);
+
+ /* Test a DUMP_VERBOSE dump */
+ status = packet_to_string(packet, DUMP_VERBOSE, &dump, &error);
+ assert(status == STATUS_OK);
+ assert(error == NULL);
+ printf("dump = '%s'\n", dump);
+ expected =
+ "ipv6 2::2222 > 1::1111: gre flags 0x0 proto 0x86dd: "
+ "2001:db8::1:54242 > fd3d:fa7b:d17d::1:8080 "
+ "S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>\n"
+ "0x0000: 60 00 00 00 00 4c 2f ff 00 02 00 00 00 00 00 00 " "\n"
+ "0x0010: 00 00 00 00 00 00 22 22 00 01 00 00 00 00 00 00 " "\n"
+ "0x0020: 00 00 00 00 00 00 11 11 00 00 86 dd 60 00 00 00 " "\n"
+ "0x0030: 00 20 06 ff 20 01 0d b8 00 00 00 00 00 00 00 00 " "\n"
+ "0x0040: 00 00 00 01 fd 3d fa 7b d1 7d 00 00 00 00 00 00 " "\n"
+ "0x0050: 00 00 00 01 d3 e2 1f 90 00 00 00 00 00 00 00 00 " "\n"
+ "0x0060: 80 02 80 18 06 60 00 00 02 04 03 e8 04 02 01 01 " "\n"
+ "0x0070: 01 03 03 07 " "\n";
+ assert(strcmp(dump, expected) == 0);
+ free(dump);
+
+ packet_free(packet);
+}
+
+/* Parse a fixed IPv4/GRE/MPLS/IPv4/TCP packet and check the exact
+ * DUMP_FULL text that packet_to_string() produces, including both
+ * MPLS label stack entries.
+ */
+static void test_gre_mpls_tcp_ipv4_packet_to_string(void)
+{
+	/* An IPv4/GRE/MPLS/IPv4/TCP packet. */
+	u8 data[] = {
+		/* IPv4: */
+		0x45, 0x00, 0x00, 0x54, 0x00, 0x00, 0x40, 0x00,
+		0x40, 0x2f, 0xb7, 0xcf, 0xc0, 0xa8, 0x00, 0x01,
+		0xc0, 0x00, 0x02, 0x02,
+		/* GRE: */
+		0x00, 0x00, 0x88, 0x47,
+		/* MPLS: */
+		0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
+		/* IPv4, TCP: */
+		0x45, 0x00, 0x00, 0x34, 0x86, 0x99, 0x40, 0x00,
+		0x40, 0x06, 0x31, 0x80, 0xc0, 0xa8, 0x00, 0x01,
+		0xc0, 0x00, 0x02, 0x01, 0x1f, 0x90, 0xdb, 0xcc,
+		0x7b, 0x81, 0xc5, 0x7c, 0x00, 0x00, 0x00, 0x01,
+		0x80, 0x11, 0x01, 0xc5, 0xa6, 0xa6, 0x00, 0x00,
+		0x01, 0x01, 0x08, 0x0a, 0x07, 0x02, 0x08, 0x43,
+		0x00, 0x00, 0x00, 0x05
+	};
+
+	struct packet *packet = packet_new(sizeof(data));
+
+	/* Populate and parse a packet */
+	memcpy(packet->buffer, data, sizeof(data));
+	char *error = NULL;
+	enum packet_parse_result_t result =
+		parse_packet(packet, sizeof(data), PACKET_LAYER_3_IP,
+			     &error);
+	assert(result == PACKET_OK);
+	assert(error == NULL);
+
+	int status = 0;
+	char *dump = NULL, *expected = NULL;
+
+	/* Test a DUMP_FULL dump */
+	status = packet_to_string(packet, DUMP_FULL, &dump, &error);
+	assert(status == STATUS_OK);
+	assert(error == NULL);
+	printf("dump = '%s'\n", dump);
+	expected =
+		"ipv4 192.168.0.1 > 192.0.2.2: gre flags 0x0 proto 0x8847: "
+		"mpls (label 0, tc 0, ttl 0) "
+		"(label 1048575, tc 7, [S], ttl 255): "
+		"192.168.0.1:8080 > 192.0.2.1:56268 "
+		"F. 2072102268:2072102268(0) ack 1 win 453 "
+		"<nop,nop,TS val 117573699 ecr 5>";
+	assert(strcmp(dump, expected) == 0);
+	free(dump);
+
+	/* Release the packet, as the sibling tests do. */
+	packet_free(packet);
+}
+
+/* Parse a fixed IPv4/TCP SYN carrying a TCP MD5 option and check the
+ * exact DUMP_SHORT text, including the hex rendering of the 16-byte
+ * digest.
+ */
+static void test_tcp_md5_option_to_string(void)
+{
+ /* An IPv4/TCP packet. */
+ u8 data[] = {
+ /* IPv4, TCP: */
+ 0x45, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0x06, 0x89, 0x56, 0xc0, 0x00, 0x02, 0x01,
+ 0xc0, 0xa8, 0xaf, 0xbb, 0x8a, 0x6f, 0x1f, 0x90,
+ 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01,
+ 0xa0, 0x02, 0x01, 0x00, 0x36, 0x14, 0x00, 0x00,
+ 0x13, 0x12, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+ 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
+ 0x0e, 0x0f, 0x01, 0x01
+ };
+
+ struct packet *packet = packet_new(sizeof(data));
+
+ /* Populate and parse a packet */
+ memcpy(packet->buffer, data, sizeof(data));
+ char *error = NULL;
+ enum packet_parse_result_t result =
+ parse_packet(packet, sizeof(data), PACKET_LAYER_3_IP,
+ &error);
+ assert(result == PACKET_OK);
+ assert(error == NULL);
+
+ int status = 0;
+ char *dump = NULL, *expected = NULL;
+
+ /* Test a DUMP_SHORT dump */
+ status = packet_to_string(packet, DUMP_SHORT, &dump, &error);
+ assert(status == STATUS_OK);
+ assert(error == NULL);
+ printf("dump = '%s'\n", dump);
+ expected =
+ "S 1:1(0) win 256 "
+ "<md5 000102030405060708090a0b0c0d0e0f,nop,nop>";
+ assert(strcmp(dump, expected) == 0);
+ free(dump);
+
+ packet_free(packet);
+}
+
+/* Run every packet_to_string test case in turn. Each case reports
+ * failure through assert(), so reaching the return means all passed.
+ */
+int main(void)
+{
+ test_tcp_ipv4_packet_to_string();
+ test_tcp_ipv6_packet_to_string();
+ test_gre_mpls_tcp_ipv4_packet_to_string();
+ test_tcp_md5_option_to_string();
+ return 0;
+}
diff --git a/test/packetdrill/packetdrill.c b/test/packetdrill/packetdrill.c
new file mode 100644
index 0000000..4afa038
--- /dev/null
+++ b/test/packetdrill/packetdrill.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * This is the main() for the packetdrill TCP testing tool.
+ */
+
+#include "types.h"
+
+#include <arpa/inet.h>
+#include <getopt.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include "assert.h"
+#include "config.h"
+#include "parse.h"
+#include "run.h"
+#include "script.h"
+#include "system.h"
+#include "wire_server.h"
+
+/* Run each command in the comma-separated list config->init_scripts,
+ * in order, through safe_system(). Does nothing when no init scripts
+ * are configured. A command that fails, or a failure to copy the
+ * list, is reported through die().
+ */
+static void run_init_scripts(struct config *config)
+{
+	char *cp1, *cp2, *scripts, *error = NULL;
+
+	if (config->init_scripts == NULL)
+		return;
+
+	/* Work on a private copy: the commas are overwritten with NULs
+	 * to split the list in place.
+	 */
+	cp1 = scripts = strdup(config->init_scripts);
+	if (scripts == NULL)
+		die("%s: out of memory copying init scripts\n",
+		    config->script_path);
+
+	while (*cp1 != 0) {
+		cp2 = strchr(cp1, ',');
+		if (cp2 != NULL)
+			*cp2 = 0;
+		if (safe_system(cp1, &error)) {
+			die("%s: error executing init script '%s': %s\n",
+			    config->script_path, cp1, error);
+		}
+		if (cp2 == NULL)
+			break;	/* that was the last command */
+		else
+			cp1 = cp2 + 1;
+	}
+	free(scripts);
+}
+
+/* Entry point: parse the command line, then either serve as a wire
+ * server or parse and run each script named on the command line.
+ */
+int main(int argc, char *argv[])
+{
+	struct config config;
+	char **arg;
+
+	set_default_config(&config);
+
+	/* Get command line options and list of test scripts. */
+	arg = parse_command_line_options(argc, argv, &config);
+
+	/* If we're running as a server, just listen for connections forever. */
+	if (config.is_wire_server) {
+		if (*arg != NULL) {
+			fprintf(stderr,
+				"error: do not pass script paths to "
+				"the wire server on command line\n");
+			show_usage();
+			exit(EXIT_FAILURE);
+		}
+
+		run_wire_server(&config);
+		return 0;
+	}
+
+	/* Ensure that there is at least one script path, to avoid
+	 * confusion between the lack of output caused by "all tests
+	 * passing" and "no tests listed on command line".
+	 */
+	if (*arg == NULL) {
+		fprintf(stderr, "error: missing script path\n");
+		show_usage();
+		exit(EXIT_FAILURE);
+	}
+
+	/* Parse and run each script on the command line. */
+	while (*arg != NULL) {
+		struct script script;
+		const char *script_path = *arg++;
+
+		if (parse_script_and_set_config(argc, argv, &config, &script,
+						script_path, NULL))
+			exit(EXIT_FAILURE);
+
+		/* If --dry_run, then don't actually execute the script. */
+		if (!config.dry_run) {
+			run_init_scripts(&config);
+			run_script(&config, &script);
+		}
+	}
+
+	return 0;
+}
diff --git a/test/packetdrill/packetdrill.h b/test/packetdrill/packetdrill.h
new file mode 100644
index 0000000..2fb6b58
--- /dev/null
+++ b/test/packetdrill/packetdrill.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright 2015 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: xiaoj@google.com (Xiao Jia)
+ *
+ * Interface for packetdrill.
+ *
+ * To be tested against as a shared object (*.so) file, implement this
+ * interface, export a function "packetdrill_interface_init", and
+ * initialize the interface struct passed in with your own functions.
+ */
+
+#ifndef __PACKETDRILL_H__
+#define __PACKETDRILL_H__
+
+#include <poll.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/epoll.h>
+#include <unistd.h>
+
+struct packetdrill_interface { /* callback table filled in by "packetdrill_interface_init" */
+ void *userdata; /* opaque; handed back as the first argument of every callback */
+ void (*free)(void *userdata); /* NOTE(review): presumably releases userdata -- confirm */
+ int (*socket)(void *userdata, int domain, int type, int protocol);
+ int (*bind)(void *userdata, int sockfd, const struct sockaddr *addr,
+ socklen_t addrlen);
+ int (*listen)(void *userdata, int sockfd, int backlog);
+ int (*accept)(void *userdata, int sockfd, struct sockaddr *addr,
+ socklen_t *addrlen);
+ int (*connect)(void *userdata, int sockfd, const struct sockaddr *addr,
+ socklen_t addrlen);
+ ssize_t (*read)(void *userdata, int fd, void *buf, size_t count);
+ ssize_t (*readv)(void *userdata, int fd, const struct iovec *iov,
+ int iovcnt);
+ ssize_t (*recv)(void *userdata, int sockfd, void *buf, size_t len,
+ int flags);
+ ssize_t (*recvfrom)(void *userdata, int sockfd, void *buf, size_t len,
+ int flags, struct sockaddr *src_addr,
+ socklen_t *addrlen);
+ ssize_t (*recvmsg)(void *userdata, int sockfd, struct msghdr *msg,
+ int flags);
+ ssize_t (*write)(void *userdata, int fd, const void *buf, size_t count);
+ ssize_t (*writev)(void *userdata, int fd, const struct iovec *iov,
+ int iovcnt);
+ ssize_t (*send)(void *userdata, int sockfd, const void *buf, size_t len,
+ int flags);
+ ssize_t (*sendto)(void *userdata, int sockfd, const void *buf,
+ size_t len, int flags,
+ const struct sockaddr *dest_addr, socklen_t addrlen);
+ ssize_t (*sendmsg)(void *userdata, int sockfd, const struct msghdr *msg,
+ int flags);
+ int (*fcntl)(void *userdata, int fd, int cmd, ...);
+ int (*ioctl)(void *userdata, int fd, unsigned long request, ...);
+ int (*close)(void *userdata, int fd);
+ int (*shutdown)(void *userdata, int sockfd, int how);
+ int (*getsockopt)(void *userdata, int sockfd, int level, int optname,
+ void *optval, socklen_t *optlen);
+ int (*setsockopt)(void *userdata, int sockfd, int level, int optname,
+ const void *optval, socklen_t optlen);
+ int (*poll)(void *userdata, struct pollfd *fds, nfds_t nfds,
+ int timeout);
+ /* Send @count bytes of data starting from @buf to the TCP stack.
+ * Return 0 on success or -1 on error. */
+ int (*netdev_send)(void *userdata, const void *buf, size_t count);
+ /* Sniff the next packet leaving the TCP stack.
+ * Put packet data in @buf. @count is passed in as the buffer size.
+ * The actual number of bytes received should be put in @count.
+ * Set @count to 0 if received nothing.
+ * Set @time_usecs to the receive timestamp.
+ * Return 0 on success or -1 on error. */
+ int (*netdev_receive)(void *userdata, void *buf, size_t *count,
+ long long *time_usecs); /* @count is in/out: buffer size in, bytes received out */
+ int (*usleep)(void *userdata, useconds_t usec);
+ int (*gettimeofday)(void *userdata, struct timeval *tv,
+ struct timezone *tz);
+ int (*epoll_create)(void *userdata, int size);
+ int (*epoll_ctl)(void *userdata, int epfd, int op, int fd,
+ struct epoll_event *event);
+ int (*epoll_wait)(void *userdata, int epfd, struct epoll_event *events,
+ int maxevents, int timeout);
+ int (*pipe)(void *userdata, int pipefd[2]);
+ int (*splice)(void *userdata, int fd_in, loff_t *off_in, int fd_out,
+ loff_t *off_out, size_t len, unsigned int flags);
+};
+
+typedef void (*packetdrill_interface_init_t)(const char *flags,
+ struct packetdrill_interface *); /* type of the exported "packetdrill_interface_init" function */
+
+#endif /* __PACKETDRILL_H__ */
diff --git a/test/packetdrill/parse.h b/test/packetdrill/parse.h
new file mode 100644
index 0000000..3ac8eae
--- /dev/null
+++ b/test/packetdrill/parse.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Interface for a module to parse test scripts.
+ */
+
+#ifndef __PARSER_H__
+#define __PARSER_H__
+
+#include "types.h"
+
+#include "assert.h"
+#include "config.h"
+#include "script.h"
+
+/* Copy the script contents into our single linear buffer. */
+extern void copy_script(const char *script_buffer,
+ struct script *script);
+
+/* Read the script file into a single linear buffer. */
+extern void read_script(const char *script_path,
+ struct script *script);
+
+/* The public, top-level call to parse a test script. It first parses the
+ * internal linear script buffer and then fills in the
+ * 'script' object with the internal representation of the
+ * script. Uses the given 'config' object to look up configuration
+ * info needed during parsing (such as whether packets are IPv4 or
+ * IPv6). Passes the given 'callback_invocation' when calling back to
+ * parse_and_finalize_config() after parsing all in-script
+ * options.
+ *
+ * Returns STATUS_OK on success; on failure returns STATUS_ERR. The
+ * implementation for this function is in the bison parser file
+ * parser.y.
+ */
+extern int parse_script(struct config *config,
+ struct script *script,
+ struct invocation *callback_invocation);
+
+/* Config for lexing and parsing. */
+extern struct config *in_config;
+
+#endif /* __PARSER_H__ */
diff --git a/test/packetdrill/parser.y b/test/packetdrill/parser.y
new file mode 100644
index 0000000..70219bd
--- /dev/null
+++ b/test/packetdrill/parser.y
@@ -0,0 +1,1739 @@
+%{
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * This is the parser for the packetdrill script language. It is
+ * processed by the bison parser generator.
+ *
+ * For full documentation see: http://www.gnu.org/software/bison/manual/
+ *
+ * Here is a quick and dirty tutorial on bison:
+ *
+ * A bison parser specification is basically a BNF grammar for the
+ * language you are parsing. Each rule specifies a nonterminal symbol
+ * on the left-hand side and a sequence of terminal symbols (lexical
+ * tokens) and/or nonterminal symbols on the right-hand side that can
+ * "reduce" to the symbol on the left hand side. When the parser sees
+ * the sequence of symbols on the right where it "wants" to see a
+ * nonterminal on the left, the rule fires, executing the semantic
+ * action code in curly {} braces as it reduces the right hand side to
+ * the left hand side.
+ *
+ * The semantic action code for a rule produces an output, which it
+ * can reference using the $$ token. The set of possible types
+ * returned in output expressions is given in the %union section of
+ * the .y file. The specific type of the output for a terminal or
+ * nonterminal symbol (corresponding to a field in the %union) is
+ * given by the %type directive in the .y file. The action code can
+ * access the outputs of the symbols on the right hand side by using
+ * the notation $1 for the first symbol, $2 for the second symbol, and
+ * so on.
+ *
+ * The lexer (generated by flex from lexer.l) feeds a stream of
+ * terminal symbols up to this parser. Parser semantic actions can
+ * access the lexer output for a terminal symbol with the same
+ * notation they use for nonterminals.
+ *
+ * Here's an example rule with its semantic action in {} braces:
+ *
+ * tcp_option
+ * ...
+ * | MSS INTEGER {
+ * $$ = tcp_option_new(...);
+ * ...
+ * $$->data.mss.bytes = htons($2);
+ * }
+ *
+ * This rule basically says:
+ *
+ * When the parser wants to see a tcp_option, if it sees an MSS from
+ * the lexer followed by an INTEGER from the lexer then run the
+ * action code that (a) stores in the output $$ a pointer to a
+ * struct tcp_option object, and then (b) stores in that object the
+ * value of the INTEGER token (accessed with $2).
+ *
+ */
+
+/* The first part of the .y file consists of C code that bison copies
+ * directly into the top of the .c file it generates.
+ */
+
+#include "types.h"
+
+#include <arpa/inet.h>
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <netinet/in.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include "gre_packet.h"
+#include "ip.h"
+#include "ip_packet.h"
+#include "icmp_packet.h"
+#include "logging.h"
+#include "mpls.h"
+#include "mpls_packet.h"
+#include "tcp_packet.h"
+#include "udp_packet.h"
+#include "parse.h"
+#include "script.h"
+#include "tcp.h"
+#include "tcp_options.h"
+
+/* This include of the bison-generated .h file must go last so that we
+ * can first include all of the declarations on which it depends.
+ */
+#include "parser.h"
+
+/* Change this YYDEBUG to 1 to get verbose debug output for parsing: */
+#define YYDEBUG 0
+#if YYDEBUG
+extern int yydebug;
+#endif
+
+extern FILE *yyin;
+extern int yylineno;
+extern char *yytext;
+extern int yylex(void);
+extern int yyparse(void);
+extern int yywrap(void);
+extern const char *cleanup_cmd;
+
+/* This mutex guards all parser global variables declared in this file. */
+pthread_mutex_t parser_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/* The input to the parser: the path name of the script file to parse. */
+static const char* current_script_path = NULL;
+
+/* The starting line number of the input script statement that we're
+ * currently parsing. This may be different than yylineno if bison had
+ * to look ahead and lexically scan a token on the following line to
+ * decide that the current statement is done.
+ */
+static int current_script_line = -1;
+
+/*
+ * We use this object to look up configuration info needed during
+ * parsing (such as whether packets are IPv4 or IPv6).
+ */
+struct config *in_config = NULL;
+
+/* The output of the parser: an output script containing
+ * 1) a linked list of options
+ * 2) a linked list of events
+ */
+static struct script *out_script = NULL;
+
+/* The test invocation to pass back to parse_and_finalize_config(). */
+struct invocation *invocation;
+
+/* Replace script->buffer with a heap copy of the NUL-terminated script_buffer. */
+void copy_script(const char *script_buffer, struct script *script)
+{
+ DEBUGP("copy_script\n");
+
+ free(script->buffer); /* drop any previously held script text */
+ script->length = strlen(script_buffer); /* length excludes the trailing NUL */
+ script->buffer = strdup(script_buffer);
+ assert(script->buffer != NULL);
+
+ DEBUGP("copy_script: %d bytes\n", script->length);
+}
+
+/* Read the script file into script->buffer; does nothing if a buffer is already set. */
+void read_script(const char *script_path, struct script *script)
+{
+ int size = 0; /* buffer size to allocate; never shrinks across retries */
+
+ DEBUGP("read_script(%s)\n", script_path);
+
+ while (script->buffer == NULL) {
+ struct stat script_info;
+ int fd = -1;
+
+ /* Allocate a buffer big enough for the whole file. */
+ if (stat(script_path, &script_info) != 0)
+ die("parse error: stat() of script file '%s': %s\n",
+ script_path, strerror(errno));
+
+ /* Pick a buffer size larger than the file, so we'll
+ * know if the file grew.
+ */
+ size = max((int)script_info.st_size, size) + 1;
+
+ script->buffer = malloc(size);
+ assert(script->buffer != NULL);
+
+ /* Read the file into our buffer. */
+ fd = open(script_path, O_RDONLY);
+ if (fd < 0)
+ die("parse error opening script file '%s': %s\n",
+ script_path, strerror(errno));
+
+ script->length = read(fd, script->buffer, size); /* one read(); anything short of 'size' is taken as the whole file */
+ if (script->length < 0)
+ die("parse error reading script file '%s': %s\n",
+ script_path, strerror(errno));
+
+ /* If we filled the buffer, then probably another
+ * process wrote more to the file since our stat call,
+ * so we should try again.
+ */
+ if (script->length == size) {
+ free(script->buffer);
+ script->buffer = NULL; /* NULL buffer makes the enclosing loop run again */
+ script->length = 0;
+ }
+
+ if (close(fd))
+ die_perror("close");
+ }
+ DEBUGP("read_script: %d bytes\n", script->length);
+}
+
+
+/* The public entry point for the script parser. Parses the script
+ * text already held in script->buffer (script->length bytes) and fills
+ * in the script object with the parsed representation.
+ */
+int parse_script(struct config *config,
+ struct script *script,
+ struct invocation *callback_invocation)
+{
+ /* This bison-generated parser is not multi-thread safe, so we
+ * have a lock to prevent more than one thread using the
+ * parser at the same time. This is useful in the wire server
+ * context, where in general we may have more than one test
+ * thread running at the same time.
+ */
+ if (pthread_mutex_lock(&parser_mutex) != 0)
+ die_perror("pthread_mutex_lock");
+
+#if YYDEBUG
+ yydebug = 1;
+#endif
+
+ /* Now parse the script from our buffer. */
+ yyin = fmemopen(script->buffer, script->length, "r");
+ if (yyin == NULL)
+ die_perror("fmemopen: parse error opening script buffer");
+
+ current_script_path = config->script_path; /* used only to label error messages */
+ in_config = config;
+ out_script = script;
+ invocation = callback_invocation; /* handed to parse_and_finalize_config() by the grammar */
+
+ /* We have to reset the line number here since the wire server
+ * can do more than one yyparse().
+ */
+ yylineno = 1;
+
+ int result = yyparse(); /* invoke bison-generated parser */
+ current_script_path = NULL;
+
+ if (fclose(yyin))
+ die_perror("fclose: error closing script buffer");
+
+ /* Unlock parser. */
+ if (pthread_mutex_unlock(&parser_mutex) != 0)
+ die_perror("pthread_mutex_unlock");
+
+ return result ? STATUS_ERR : STATUS_OK; /* any nonzero yyparse() result is reported as STATUS_ERR */
+}
+
+/* Bison emits code to call this method when there's a parse-time error.
+ * We print the script path, the lexer's line and token, and the message.
+ */
+static void yyerror(const char *message)
+{
+ fprintf(stderr, "%s:%d: parse error at '%s': %s\n",
+ current_script_path, yylineno, yytext, message);
+}
+
+/* After we finish parsing each line of a script, we analyze the
+ * semantics of the line. If we encounter an error then we print the
+ * error message to stderr and exit with an error.
+ */
+static void semantic_error(const char* message)
+{
+ assert(current_script_line >= 0); /* -1 is the initial "no statement line yet" value */
+ die("%s:%d: semantic error: %s\n",
+ current_script_path, current_script_line, message);
+}
+
+/* This standard callback is invoked by flex when it encounters
+ * the end of a file. We return 1 to tell flex to return EOF.
+ */
+int yywrap(void)
+{
+ return 1;
+}
+
+/* Allocate a zeroed expression and tag it with the given type. */
+static struct expression *new_expression(enum expression_t type)
+{
+	struct expression *expr = calloc(1, sizeof(*expr));
+	expr->type = type;
+	return expr;
+}
+
+/* Build an EXPR_INTEGER expression holding the literal 'num', together
+ * with the format string to keep alongside it.
+ */
+static struct expression *new_integer_expression(s64 num, const char *format)
+{
+	struct expression *literal = new_expression(EXPR_INTEGER);
+	literal->format = format;
+	literal->value.num = num;
+	return literal;
+}
+
+/* Create and initialize a new one-element expression_list. */
+static struct expression_list *new_expression_list(
+	struct expression *expression)
+{
+	struct expression_list *node = calloc(1, sizeof(*node));
+
+	node->next = NULL;
+	node->expression = expression;
+	return node;
+}
+
+/* Append the expression to the list as a new last node. */
+static void expression_list_append(struct expression_list *list,
+				   struct expression *expression)
+{
+	struct expression_list *tail = list;
+	while (tail->next != NULL)
+		tail = tail->next;
+	tail->next = new_expression_list(expression);
+}
+
+/* Allocate an option_list node that stores the given name and value pointers. */
+static struct option_list *new_option(char *name, char *value)
+{
+	struct option_list *node = calloc(1, sizeof(*node));
+	node->value = value;
+	node->name = name;
+	return node;
+}
+
+/* Allocate a zeroed event of the given type with no time range set. */
+static struct event *new_event(enum event_t type)
+{
+	struct event *ev = calloc(1, sizeof(*ev));
+	ev->offset_usecs = NO_TIME_RANGE;
+	ev->time_usecs_end = NO_TIME_RANGE;
+	ev->type = type;
+	return ev;
+}
+
+/* Parse the two hex digits at 'hex' into *byte; STATUS_ERR if not hex. */
+static int parse_hex_byte(const char *hex, u8 *byte)
+{
+	/* <ctype.h> is only defined for unsigned char values (or EOF). */
+	if (!isxdigit((unsigned char)hex[0]) || !isxdigit((unsigned char)hex[1]))
+		return STATUS_ERR;	/* need two hex digits per byte */
+	char buf[] = { hex[0], hex[1], '\0' };
+	char *buf_end = NULL;
+	u32 byte_value = strtoul(buf, &buf_end, 16);
+	assert(byte_value <= 0xff && buf_end == buf + 2);
+	*byte = byte_value;
+	return STATUS_OK;
+}
+
+/* Decode the hex string 'hex', two digits per byte, into 'buf', which
+ * holds 'buf_len' bytes, and store the number of bytes produced in
+ * *out_len. The input may be arbitrarily long (e.g. a very long TCP
+ * Fast Open cookie); input that does not fit in 'buf' is an error.
+ */
+static int parse_hex_string(const char *hex, u8 *buf, int buf_len,
+			    int *out_len)
+{
+	int used = 0;	/* bytes written to buf so far */
+
+	for (; hex[0] != '\0'; hex += 2, used++) {
+		/* Ran out of output space. */
+		if (used >= buf_len)
+			return STATUS_ERR;
+		/* Bad character (or an odd number of digits). */
+		if (parse_hex_byte(hex, buf + used))
+			return STATUS_ERR;
+	}
+
+	*out_len = used;
+	assert(*out_len <= buf_len);
+	return STATUS_OK;
+}
+
+/* Build a TCP Fast Open option (experimental variant if 'exp') from a hex
+ * cookie string. On failure returns NULL with a message in *error. */
+static struct tcp_option *new_tcp_fast_open_option(const char *cookie_string,
+						   char **error, bool exp)
+{
+	int cookie_string_len = strlen(cookie_string);
+	if (cookie_string_len & 1) {
+		asprintf(error,
+			 "TCP fast open cookie has an odd number of digits");
+		return NULL;
+	}
+	int cookie_bytes = cookie_string_len / 2;  /* 2 hex chars per byte */
+	int max_bytes = exp ? MAX_TCP_FAST_OPEN_EXP_COOKIE_BYTES :
+			      MAX_TCP_FAST_OPEN_COOKIE_BYTES;
+	if (cookie_bytes > max_bytes) {
+		asprintf(error, "TCP fast open cookie of %d bytes "
+			 "exceeds maximum cookie length of %d bytes",
+			 cookie_bytes, max_bytes);
+		return NULL;
+	}
+	u8 option_bytes = cookie_bytes + (exp ? TCPOLEN_EXP_FASTOPEN_BASE :
+						TCPOLEN_FASTOPEN_BASE);
+	struct tcp_option *option =
+		tcp_option_new(exp ? TCPOPT_EXP : TCPOPT_FASTOPEN, option_bytes);
+	if (exp)
+		option->data.fast_open_exp.magic = htons(TCPOPT_FASTOPEN_MAGIC);
+
+	int parsed_bytes = 0;
+	/* Parse cookie. This should be an ASCII hex string
+	 * representing an even number of bytes (4-16 bytes). But we
+	 * do not enforce this, since we want to allow test cases that
+	 * supply invalid cookies.
+	 */
+	if (parse_hex_string(cookie_string,
+			     exp ? option->data.fast_open_exp.cookie :
+				   option->data.fast_open.cookie,
+			     exp ? sizeof(option->data.fast_open_exp.cookie):
+				   sizeof(option->data.fast_open.cookie),
+			     &parsed_bytes)) {
+		free(option);
+		asprintf(error,
+			 "TCP fast open cookie '%s' is not a valid hex string",
+			 cookie_string);
+		return NULL;
+	}
+	assert(parsed_bytes == cookie_bytes);
+	return option;
+}
+
+/* Build a TCP MD5 signature option from a hex digest string. On failure
+ * returns NULL with a message in *error. */
+static struct tcp_option *new_md5_option(const char *digest_string,
+					 char **error)
+{
+	int digest_string_len = strlen(digest_string);
+	int digest_bytes = digest_string_len / 2;  /* 2 hex chars per byte */
+
+	if (digest_bytes > TCP_MD5_DIGEST_LEN) {
+		asprintf(error, "TCP MD5 digest longer than 16 bytes");
+		return NULL;
+	}
+
+	/* The option length follows the digest actually supplied: a full
+	 * digest is 16 bytes, but shorter ones are accepted so that test
+	 * cases can send invalid digests.
+	 */
+	struct tcp_option *option =
+		tcp_option_new(TCPOPT_MD5SIG, TCPOLEN_MD5_BASE + digest_bytes);
+	int parsed_bytes = 0;
+	if (parse_hex_string(digest_string, option->data.md5.digest,
+			     sizeof(option->data.md5.digest),
+			     &parsed_bytes)) {
+		free(option);
+		asprintf(error, "TCP MD5 digest is not a valid hex string");
+		return NULL;
+	}
+	assert(parsed_bytes <= digest_bytes);
+	return option;
+}
+
+/* Append the GRE header held in 'expr' to 'packet', then free 'expr'. */
+static struct packet *append_gre(struct packet *packet, struct expression *expr)
+{
+	char *error = NULL;
+	if (gre_header_append(packet, &expr->value.gre, &error))
+		semantic_error(error);
+	free(expr);
+	return packet;
+}
+
+%}
+
+%locations
+%expect 3 /* we expect shift/reduce conflicts */
+/* The %union section specifies the set of possible types for values
+ * for all nonterminal and terminal symbols in the grammar.
+ */
+%union {
+ s64 integer;
+ double floating;
+ char *string;
+ char *reserved;
+ s64 time_usecs;
+ enum direction_t direction;
+ enum ip_ecn_t ip_ecn;
+ struct tos_spec tos_spec;
+ struct ip_info ip_info;
+ struct mpls_stack *mpls_stack;
+ struct mpls mpls_stack_entry;
+ u16 port;
+ s32 window;
+ u16 urg_ptr;
+ u32 sequence_number;
+ struct {
+ int protocol; /* IPPROTO_TCP or IPPROTO_UDP */
+ u32 start_sequence;
+ u16 payload_bytes;
+ } tcp_sequence_info;
+ struct option_list *option;
+ struct event *event;
+ struct packet *packet;
+ struct syscall_spec *syscall;
+ struct command_spec *command;
+ struct code_spec *code;
+ struct tcp_option *tcp_option;
+ struct tcp_options *tcp_options;
+ struct expression *expression;
+ struct expression_list *expression_list;
+ struct errno_spec *errno_info;
+ struct {
+ u16 src_port;
+ u16 dst_port;
+ } port_info;
+}
+
+/* The specific type of the output for a symbol is given by the %type
+ * directive. By convention terminal symbols returned from the lexer
+ * have ALL_CAPS names, and nonterminal symbols have lower_case names.
+ */
+%token ELLIPSIS
+%token <reserved> SA_FAMILY SIN_PORT SIN_ADDR _HTONS_ INET_ADDR INET6_ADDR
+%token <reserved> MSG_NAME MSG_IOV MSG_FLAGS MSG_CONTROL
+%token <reserved> CMSG_LEVEL CMSG_TYPE CMSG_DATA
+%token <reserved> FD EVENTS REVENTS ONOFF LINGER
+%token <reserved> U32 U64 PTR
+%token <reserved> ACK ECR EOL MSS NOP SACK SACKOK TIMESTAMP VAL WIN WSCALE
+%token <reserved> URG MD5 FAST_OPEN FAST_OPEN_EXP
+%token <reserved> TOS FLAGS FLOWLABEL
+%token <reserved> ECT0 ECT1 CE ECT01 NO_ECN
+%token <reserved> IPV4 IPV6 ICMP UDP RAW GRE MTU ID
+%token <reserved> MPLS LABEL TC TTL
+%token <reserved> OPTION
+%token <reserved> SUM OFF KEY SEQ
+%token <reserved> NONE CHECKSUM SEQUENCE PRESENT
+%token <reserved> EE_ERRNO EE_CODE EE_DATA EE_INFO EE_ORIGIN EE_TYPE
+%token <reserved> SCM_SEC SCM_NSEC
+%token <floating> FLOAT
+%token <integer> INTEGER HEX_INTEGER
+%token <string> WORD STRING BACK_QUOTED CODE IPV4_ADDR IPV6_ADDR
+%type <direction> direction
+%type <ip_info> ip_info opt_ip_info
+%type <tos_spec> tos_spec
+%type <ip_ecn> ip_ecn
+%type <option> option options opt_options
+%type <event> event events event_time action
+%type <time_usecs> time opt_end_time
+%type <packet> packet_spec tcp_packet_spec udp_packet_spec icmp_packet_spec
+%type <packet> packet_prefix
+%type <syscall> syscall_spec
+%type <command> command_spec
+%type <code> code_spec
+%type <mpls_stack> mpls_stack
+%type <mpls_stack_entry> mpls_stack_entry
+%type <integer> opt_mpls_stack_bottom
+%type <integer> opt_icmp_mtu
+%type <integer> gre_flags_list gre_flags gre_flag
+%type <integer> gre_sum gre_off gre_key gre_seq
+%type <integer> opt_icmp_echo_id
+%type <integer> flow_label
+%type <string> icmp_type opt_icmp_code flags
+%type <string> opt_tcp_fast_open_cookie hex_blob
+%type <string> opt_note note word_list
+%type <string> option_flag option_value script
+%type <string> opt_comma opt_equals
+%type <window> opt_window
+%type <urg_ptr> opt_urg_ptr
+%type <sequence_number> opt_ack
+%type <tcp_sequence_info> seq opt_icmp_echoed
+%type <tcp_options> opt_tcp_options tcp_option_list
+%type <tcp_option> tcp_option sack_block_list sack_block
+%type <string> function_name
+%type <expression_list> expression_list function_arguments
+%type <expression> expression binary_expression array sub_expr_list
+%type <expression> any_int decimal_integer hex_integer
+%type <expression> inaddr in6addr sockaddr msghdr iovec pollfd opt_revents linger
+%type <expression> opt_cmsg cmsg_expr
+%type <expression> scm_timestamping_expr
+%type <expression> sock_extended_err_expr
+%type <expression> mpls_stack_expression
+%type <expression> gre_header_expression
+%type <expression> epollev
+%type <errno_info> opt_errno
+%type <port_info> opt_port_info
+
+%% /* The grammar follows. */
+
+script
+: opt_options opt_init_command events opt_cleanup_command {
+ $$ = NULL; /* The parser output is in out_script */
+}
+;
+
+opt_options
+: {
+ $$ = NULL;
+ parse_and_finalize_config(invocation); /* no in-script options: call back right away */
+}
+| options {
+ $$ = $1;
+ parse_and_finalize_config(invocation); /* call back once all in-script options are parsed */
+}
+;
+
+options
+: option {
+ out_script->option_list = $1;
+ $$ = $1; /* return the tail so we can append to it */
+}
+| options option {
+ $1->next = $2;
+ $$ = $2; /* return the tail so we can append to it */
+}
+;
+
+option
+: option_flag '=' option_value {
+ $$ = new_option($1, $3);
+}
+| option_flag {
+ $$ = new_option($1, NULL);
+}
+
+option_flag
+: OPTION { $$ = $1; }
+;
+
+option_value
+: INTEGER { $$ = strdup(yytext); }
+| WORD { $$ = $1; }
+| STRING { $$ = $1; }
+| IPV4_ADDR { $$ = $1; }
+| IPV6_ADDR { $$ = $1; }
+| IPV4 { $$ = strdup("ipv4"); }
+| IPV6 { $$ = strdup("ipv6"); }
+| WORD '=' WORD {
+ /* For consistency, allow syntax like: --define=PROTO=IPPROTO_TCP */
+ char *lhs = $1, *rhs = $3;
+
+ asprintf(&($$), "%s=%s", lhs, rhs);
+ free(lhs);
+ free(rhs);
+}
+| WORD '=' STRING {
+ /* For consistency, allow syntax like: --define=CC="reno" */
+ char *lhs = $1, *rhs = $3;
+
+ asprintf(&($$), "%s=\"%s\"", lhs, rhs);
+ free(lhs);
+ free(rhs);
+}
+| WORD '=' BACK_QUOTED {
+ /* For consistency, allow syntax like: --define=SCRIPT=`cleanup` */
+ char *lhs = $1, *rhs = $3;
+
+ asprintf(&($$), "%s=`%s`", lhs, rhs);
+ free(lhs);
+ free(rhs);
+}
+;
+
+opt_init_command
+: { }
+| init_command { }
+;
+
+init_command
+: command_spec { out_script->init_command = $1; }
+;
+
+events
+: event {
+ out_script->event_list = $1; /* save pointer to event list as output
+ * of parser */
+ $$ = $1; /* return the tail so that we can append to it */
+}
+| events event {
+ $1->next = $2; /* link new event to the end of the existing list */
+ $$ = $2; /* return the tail so that we can append to it */
+}
+;
+
+event
+: event_time action {
+ $$ = $2; /* the action is the real event; $1 only carries its timing */
+ $$->line_number = $1->line_number; /* use timestamp's line */
+ $$->time_usecs = $1->time_usecs;
+ $$->time_usecs_end = $1->time_usecs_end;
+ $$->time_type = $1->time_type;
+
+ if ($$->time_usecs_end != NO_TIME_RANGE) {
+ if ($$->time_usecs_end < $$->time_usecs)
+ semantic_error("time range is backwards");
+ }
+ if ($$->time_type == ANY_TIME && ($$->type != PACKET_EVENT ||
+ packet_direction($$->event.packet) != DIRECTION_OUTBOUND)) {
+ yylineno = $$->line_number;
+ semantic_error("event time <star> can only be used with "
+ "outbound packets");
+ } else if (($$->time_type == ABSOLUTE_RANGE_TIME ||
+ $$->time_type == RELATIVE_RANGE_TIME) &&
+ ($$->type != PACKET_EVENT ||
+ packet_direction($$->event.packet) != DIRECTION_OUTBOUND)) {
+ yylineno = $$->line_number;
+ semantic_error("event time range can only be used with "
+ "outbound packets");
+ }
+ free($1); /* the placeholder event built by event_time is no longer needed */
+}
+;
+
+event_time
+: '+' time {
+ $$ = new_event(INVALID_EVENT); /* placeholder: the 'event' rule copies its timing, then frees it */
+ $$->line_number = @2.first_line;
+ $$->time_usecs = $2;
+ $$->time_type = RELATIVE_TIME;
+}
+| time {
+ $$ = new_event(INVALID_EVENT);
+ $$->line_number = @1.first_line;
+ $$->time_usecs = $1;
+ $$->time_type = ABSOLUTE_TIME;
+}
+| '*' {
+ $$ = new_event(INVALID_EVENT);
+ $$->line_number = @1.first_line;
+ $$->time_type = ANY_TIME; /* time_usecs is left at its zeroed default from calloc() */
+}
+| time '~' time {
+ $$ = new_event(INVALID_EVENT);
+ $$->line_number = @1.first_line;
+ $$->time_type = ABSOLUTE_RANGE_TIME;
+ $$->time_usecs = $1;
+ $$->time_usecs_end = $3;
+}
+| '+' time '~' '+' time {
+ $$ = new_event(INVALID_EVENT);
+ $$->line_number = @1.first_line;
+ $$->time_type = RELATIVE_RANGE_TIME;
+ $$->time_usecs = $2;
+ $$->time_usecs_end = $5;
+}
+;
+
+time
+: FLOAT {
+ if ($1 < 0) {
+ semantic_error("negative time");
+ }
+ $$ = (s64)($1 * 1.0e6); /* convert float secs to s64 microseconds */
+}
+| INTEGER {
+ if ($1 < 0) {
+ semantic_error("negative time");
+ }
+ $$ = (s64)($1 * 1000000); /* convert int secs to s64 microseconds; NOTE(review): multiply is not checked for overflow */
+}
+;
+
+action
+: packet_spec { $$ = new_event(PACKET_EVENT); $$->event.packet = $1; }
+| syscall_spec { $$ = new_event(SYSCALL_EVENT); $$->event.syscall = $1; }
+| command_spec { $$ = new_event(COMMAND_EVENT); $$->event.command = $1; }
+| code_spec { $$ = new_event(CODE_EVENT); $$->event.code = $1; }
+;
+
+packet_spec
+: tcp_packet_spec { $$ = $1; }
+| udp_packet_spec { $$ = $1; }
+| icmp_packet_spec { $$ = $1; }
+;
+
+tcp_packet_spec
+: packet_prefix opt_ip_info opt_port_info flags seq opt_ack opt_window opt_urg_ptr opt_tcp_options {
+ char *error = NULL;
+ struct packet *outer = $1, *inner = NULL; /* outer: encapsulation prefix; inner: the TCP packet built here */
+ enum direction_t direction = outer->direction;
+
+ if (($2.tos.check == TOS_CHECK_ECN) && ($2.tos.value == ECN_ECT01) &&
+ (direction != DIRECTION_OUTBOUND)) {
+ semantic_error("[ect01] can only be used with outbound packets");
+ }
+
+ if (($9 == NULL) && (direction != DIRECTION_OUTBOUND)) { /* NULL options stand for the "<...>" form */
+ yylineno = @7.first_line; /* NOTE(review): location of $7 (opt_window), not $9 -- confirm intended */
+ semantic_error("<...> for TCP options can only be used with "
+ "outbound packets");
+ }
+
+ inner = new_tcp_packet(in_config->wire_protocol,
+ direction, $2, $3.src_port, $3.dst_port, $4,
+ $5.start_sequence, $5.payload_bytes,
+ $6, $7, $8, $9, &error);
+ free($4);
+ free($9);
+ if (inner == NULL) {
+ assert(error != NULL);
+ semantic_error(error);
+ free(error); /* not reached if semantic_error() exits, as its comment says */
+ }
+
+ $$ = packet_encapsulate_and_free(outer, inner);
+}
+;
+
+udp_packet_spec
+: packet_prefix opt_ip_info UDP opt_port_info '(' INTEGER ')' {
+ char *error = NULL;
+ struct packet *outer = $1, *inner = NULL;
+ enum direction_t direction = outer->direction;
+
+ if ($2.tos.check == TOS_CHECK_ECN) {
+ semantic_error("ECN can only be used with TCP packets");
+ }
+
+ if (!is_valid_u16($6)) {
+ semantic_error("UDP payload size out of range");
+ }
+
+ inner = new_udp_packet(in_config->wire_protocol, direction, $2,
+ $6, $4.src_port, $4.dst_port, &error);
+ if (inner == NULL) {
+ assert(error != NULL);
+ semantic_error(error);
+ free(error);
+ }
+
+ $$ = packet_encapsulate_and_free(outer, inner);
+}
+;
+
+icmp_packet_spec
+: packet_prefix opt_ip_info ICMP icmp_type opt_icmp_code opt_icmp_mtu
+  opt_icmp_echo_id opt_icmp_echoed {
+	/* Build an ICMP packet from the type/code words, the optional MTU
+	 * and echo id, and the description of the echoed packet, then wrap
+	 * it in the encapsulation headers from packet_prefix.
+	 * The type and code strings are released here.
+	 */
+	char *error = NULL;
+	struct packet *outer = $1, *inner = NULL;
+	enum direction_t direction = outer->direction;
+
+	if (($2.tos.check == TOS_CHECK_ECN) && ($2.tos.value == ECN_ECT01) &&
+	    (direction != DIRECTION_OUTBOUND)) {
+		semantic_error("[ect01] can only be used with outbound packets");
+	}
+
+	inner = new_icmp_packet(in_config->wire_protocol, direction, $4, $5,
+				$8.protocol, $8.start_sequence,
+				$8.payload_bytes, $2, $6, $7, &error);
+	free($4);
+	free($5);	/* may be NULL when no code was given */
+	if (inner == NULL) {
+		/* Same guard as the TCP and UDP rules: never hand a NULL
+		 * message to semantic_error().
+		 */
+		assert(error != NULL);
+		semantic_error(error);
+		free(error);
+	}
+
+	$$ = packet_encapsulate_and_free(outer, inner);
+}
+;
+
+
+packet_prefix
+: direction {
+	/* Start a new packet that only knows its direction so far. */
+	struct packet *fresh = packet_new(PACKET_MAX_HEADER_BYTES);
+
+	fresh->direction = $1;
+	$$ = fresh;
+}
+| packet_prefix IPV4 opt_ip_info IPV4_ADDR '>' IPV4_ADDR ':' {
+	/* Append an IPv4 encapsulation header; the address strings are
+	 * released once the header has been appended.
+	 */
+	struct packet *pkt = $1;
+	char *src = $4;
+	char *dst = $6;
+	u8 tos = $3.tos.value;
+	u8 ttl = $3.ttl;
+	char *err_msg = NULL;
+
+	if (ipv4_header_append(pkt, src, dst, tos, ttl, &err_msg))
+		semantic_error(err_msg);
+	free(src);
+	free(dst);
+	$$ = pkt;
+}
+| packet_prefix IPV6 opt_ip_info IPV6_ADDR '>' IPV6_ADDR ':' {
+	/* Append an IPv6 encapsulation header; the ttl field of the IP info
+	 * is used as the hop limit.
+	 */
+	struct packet *pkt = $1;
+	char *src = $4;
+	char *dst = $6;
+	u8 tos = $3.tos.value;
+	u8 hop_limit = $3.ttl;
+	char *err_msg = NULL;
+
+	if (ipv6_header_append(pkt, src, dst, tos, hop_limit, &err_msg))
+		semantic_error(err_msg);
+	free(src);
+	free(dst);
+	$$ = pkt;
+}
+| packet_prefix GRE ':' {
+	/* GRE header with no explicit fields. */
+	$$ = append_gre($1, new_expression(EXPR_GRE));
+}
+| packet_prefix GRE opt_comma gre_header_expression ':' {
+	/* GRE header with explicitly specified fields. */
+	$$ = append_gre($1, $4);
+}
+| packet_prefix MPLS mpls_stack ':' {
+	/* Append an MPLS label stack; the stack object is released after
+	 * mpls_header_append() has been called.
+	 */
+	struct packet *pkt = $1;
+	struct mpls_stack *labels = $3;
+	char *err_msg = NULL;
+
+	if (mpls_header_append(pkt, labels, &err_msg))
+		semantic_error(err_msg);
+	free(labels);
+	$$ = pkt;
+}
+;
+
+gre_header_expression
+: gre_flags_list opt_comma
+  gre_sum opt_comma
+  gre_off opt_comma
+  gre_key opt_comma
+  gre_seq {
+	/* Collect the five GRE fields into one EXPR_GRE expression, storing
+	 * each value in network byte order.
+	 */
+	struct expression *gre = new_expression(EXPR_GRE);
+
+	gre->value.gre.flags = htons($1);
+	gre->value.gre.be16[0] = htons($3);	/* sum */
+	gre->value.gre.be16[1] = htons($5);	/* off */
+	gre->value.gre.be32[1] = htonl($7);	/* key */
+	gre->value.gre.be32[2] = htonl($9);	/* seq */
+	$$ = gre;
+}
+;
+
+gre_flags_list /* GRE flags given either as a bracketed keyword list or as a single integer. */
+: FLAGS '[' gre_flags ']' { $$ = $3; }
+| FLAGS any_int { $$ = $2->value.num; } /* NOTE(review): the any_int expression object is not released here; confirm whether that leaks */
+;
+
+gre_flags /* One or more comma-separated GRE flags, OR-ed together. */
+: gre_flag { $$ = $1; }
+| gre_flag ',' gre_flags { $$ = $1 | $3; }
+;
+
+gre_flag /* Maps one flag phrase to its GRE_FLAG_* value; NONE contributes 0. */
+: NONE { $$ = 0; }
+| CHECKSUM PRESENT { $$ = GRE_FLAG_C; }
+| KEY PRESENT { $$ = GRE_FLAG_K; }
+| SEQUENCE PRESENT { $$ = GRE_FLAG_S; }
+;
+
+gre_sum /* "sum <int>": yields the integer's numeric value. */
+: SUM any_int { $$ = $2->value.num; } /* NOTE(review): the any_int expression is not released here; confirm */
+;
+
+gre_off /* "off <int>": yields the integer's numeric value. */
+: OFF any_int { $$ = $2->value.num; } /* NOTE(review): the any_int expression is not released here; confirm */
+;
+
+gre_key /* "key [=] <int>": yields the integer's numeric value. */
+: KEY opt_equals any_int { $$ = $3->value.num; } /* NOTE(review): the any_int expression is not released here; confirm */
+;
+
+gre_seq /* "seq <int>": yields the integer's numeric value. */
+: SEQ any_int { $$ = $2->value.num; } /* NOTE(review): the any_int expression is not released here; confirm */
+;
+
+opt_comma /* Optional ',' separator; carries no value. */
+: { $$ = NULL; }
+| ',' { $$ = NULL; }
+;
+
+opt_equals /* Optional '=' separator; carries no value. */
+: { $$ = NULL; }
+| '=' { $$ = NULL; }
+;
+
+mpls_stack
+: {
+	/* An empty label stack is the starting point. */
+	$$ = mpls_stack_new();
+}
+| mpls_stack mpls_stack_entry {
+	/* Push one more label entry onto the stack built so far. */
+	struct mpls_stack *stack = $1;
+
+	if (mpls_stack_append(stack, $2))
+		semantic_error("too many MPLS labels");
+	$$ = stack;
+}
+;
+
+mpls_stack_entry
+:
+'(' LABEL INTEGER ',' TC INTEGER ',' opt_mpls_stack_bottom TTL INTEGER ')' {
+	/* Turn the label, traffic class, bottom-of-stack flag and ttl into
+	 * one MPLS stack entry, returned by value.
+	 */
+	struct mpls entry;
+	char *err_msg = NULL;
+	s64 label_value = $3;
+	s64 tc = $6;
+	bool bottom = $8;
+	s64 ttl_value = $10;
+
+	if (new_mpls_stack_entry(label_value, tc, bottom, ttl_value,
+				 &entry, &err_msg))
+		semantic_error(err_msg);
+	$$ = entry;
+}
+;
+
+opt_mpls_stack_bottom
+: { $$ = 0; }
+| '[' WORD ']' ',' {
+	/* Only the literal word "S" is accepted as the bottom marker. */
+	const int is_s = (strcmp($2, "S") == 0);
+
+	if (!is_s)
+		semantic_error("expected [S] for MPLS label stack bottom");
+	free($2);
+	$$ = 1;
+}
+;
+
+icmp_type /* ICMP type given as a word; the string is passed through unchanged. */
+: WORD { $$ = $1; }
+;
+
+opt_icmp_code /* Optional ICMP code word; NULL when absent. */
+: { $$ = NULL; }
+| WORD { $$ = $1; }
+;
+
+/* This specifies the relevant details about the packet echoed by ICMP:
+ * its protocol, its starting sequence number and its payload size.
+ * Every alternative sets all three fields, because icmp_packet_spec
+ * reads each of them unconditionally.
+ */
+opt_icmp_echoed
+: {
+	/* Nothing specified: an empty TCP segment. */
+	$$.start_sequence = 0;
+	$$.payload_bytes = 0;
+	$$.protocol = IPPROTO_TCP;
+}
+| '[' UDP '(' INTEGER ')' ']' {
+	$$.start_sequence = 0;
+	$$.payload_bytes = $4;
+	$$.protocol = IPPROTO_UDP;
+}
+| '[' seq ']' {
+	/* The seq rule already fills in all three fields for TCP. */
+	$$ = $2;
+}
+| '[' RAW '(' INTEGER ')' ']' {
+	/* Sequence numbers have no meaning for a raw payload, but the
+	 * field must still hold a defined value.
+	 */
+	$$.start_sequence = 0;
+	$$.payload_bytes = $4;
+	$$.protocol = IPPROTO_RAW;
+}
+;
+
+opt_icmp_mtu /* Optional "mtu <int>"; -1 when absent. */
+: { $$ = -1; }
+| MTU INTEGER { $$ = $2; }
+;
+
+opt_icmp_echo_id /* Optional "id <int>"; 0 when absent. */
+: { $$ = 0; }
+| ID INTEGER { $$ = $2; }
+;
+
+opt_port_info
+: {
+	/* No ports given: both are reported as 0. */
+	$$.src_port = 0;
+	$$.dst_port = 0;
+}
+| INTEGER '>' INTEGER {
+	/* "src > dst": each port must fit in 16 bits. */
+	if (!is_valid_u16($1))
+		semantic_error("src port out of range");
+	if (!is_valid_u16($3))
+		semantic_error("dst port out of range");
+
+	$$.src_port = $1;
+	$$.dst_port = $3;
+}
+;
+
+direction /* '<' is inbound, '>' is outbound; also records the current script line. */
+: '<' { $$ = DIRECTION_INBOUND; current_script_line = yylineno; }
+| '>' { $$ = DIRECTION_OUTBOUND; current_script_line = yylineno; }
+;
+
+tos_spec
+: ip_ecn {
+	/* An ECN keyword: only the ECN bits are to be checked. */
+	$$.check = TOS_CHECK_ECN;
+	$$.value = $1;
+}
+| TOS HEX_INTEGER {
+	/* An explicit TOS byte given in hex. */
+	s64 tos_byte = $2;
+
+	if (!is_valid_u8(tos_byte))
+		semantic_error("tos out of range for 8 bits");
+
+	$$.check = TOS_CHECK_TOS;
+	$$.value = tos_byte;
+}
+;
+
+ip_ecn /* Maps each ECN keyword token to the corresponding ECN_* constant. */
+: NO_ECN { $$ = ECN_NONE; }
+| ECT0 { $$ = ECN_ECT0; }
+| ECT1 { $$ = ECN_ECT1; }
+| ECT01 { $$ = ECN_ECT01; }
+| CE { $$ = ECN_CE; }
+;
+
+/* TCP flags as a heap-allocated string: a word of flag letters, ".",
+ * a word followed by ".", or "-" for no flags at all.
+ */
+flags
+: WORD { $$ = $1; }
+| '.' { $$ = strdup("."); }
+| WORD '.' {
+	char *with_dot = NULL;
+
+	/* asprintf() leaves its output pointer undefined on failure, so
+	 * check the result instead of using the pointer blindly.
+	 */
+	if (asprintf(&with_dot, "%s.", $1) < 0) {
+		with_dot = NULL;
+		semantic_error("out of memory while building TCP flags");
+	}
+	free($1);
+	$$ = with_dot;
+}
+| '-' { $$ = strdup(""); }	/* no TCP flags set in segment */
+;
+
+flow_label
+: FLOWLABEL HEX_INTEGER {
+	/* A flow label given in hex; it must fit in 20 bits. */
+	s64 label = $2;
+
+	if (!is_valid_u20(label))
+		semantic_error("flowlabel out of range for 20 bits");
+	$$ = label;
+}
+;
+
+/* IP-level details attached to a packet: a TOS/ECN check, a flow label,
+ * a TTL, or a TOS/ECN check followed by a flow label.  Fields that are
+ * not mentioned are set to 0 (TOS_CHECK_NONE for the check).
+ */
+ip_info
+: tos_spec {
+	$$.tos.check = $1.check;
+	$$.tos.value = $1.value;
+	$$.flow_label = 0;
+	$$.ttl = 0;
+}
+| flow_label {
+	$$.tos.check = TOS_CHECK_NONE;
+	$$.tos.value = 0;
+	$$.flow_label = $1;
+	$$.ttl = 0;
+}
+| TTL INTEGER {
+	/* packet_prefix narrows this value to u8 for the header, so reject
+	 * anything that would be silently truncated there.
+	 */
+	if (!is_valid_u8($2))
+		semantic_error("ttl out of range for 8 bits");
+
+	$$.tos.check = TOS_CHECK_NONE;
+	$$.tos.value = 0;
+	$$.flow_label = 0;
+	$$.ttl = $2;
+}
+| tos_spec ',' flow_label {
+	$$.tos.check = $1.check;
+	$$.tos.value = $1.value;
+	$$.flow_label = $3;
+	$$.ttl = 0;
+}
+;
+
+opt_ip_info /* Optional IP info in parentheses or square brackets; all-zero defaults when absent. */
+: {
+ $$.tos.check = TOS_CHECK_NONE;
+ $$.tos.value = 0;
+ $$.flow_label = 0;
+ $$.ttl = 0;
+}
+| '(' ip_info ')' { $$ = $2; }
+| '[' ip_info ']' { $$ = $2; }
+;
+
+seq /* TCP "start:end(payload)" triple; yields the start sequence, payload size and IPPROTO_TCP. */
+: INTEGER ':' INTEGER '(' INTEGER ')' {
+ if (!is_valid_u32($1)) {
+ semantic_error("TCP start sequence number out of range");
+ }
+ if (!is_valid_u32($3)) {
+ semantic_error("TCP end sequence number out of range");
+ }
+ if (!is_valid_u16($5)) {
+ semantic_error("TCP payload size out of range");
+ }
+ if ($3 != ($1 +$5)) { /* NOTE(review): no modulo-2^32 arithmetic here, so a range that wraps past the u32 limit looks like it would be rejected; confirm intent */
+ semantic_error("inconsistent TCP sequence numbers and "
+ "payload size");
+ }
+ $$.start_sequence = $1;
+ $$.payload_bytes = $5;
+ $$.protocol = IPPROTO_TCP;
+}
+;
+
+opt_ack
+: { $$ = 0; }
+| ACK INTEGER {
+	/* "ack <int>": the acknowledgment number must fit in 32 bits. */
+	if (!is_valid_u32($2))
+		semantic_error("TCP ack sequence number out of range");
+	$$ = $2;
+}
+;
+
+opt_window
+: { $$ = -1; }
+| WIN INTEGER {
+	/* "win <int>": the window must fit in 16 bits; -1 means absent. */
+	if (!is_valid_u16($2))
+		semantic_error("TCP window value out of range");
+	$$ = $2;
+}
+;
+
+opt_urg_ptr
+: { $$ = 0; }
+| URG INTEGER {
+	/* "urg <int>": the urgent pointer must fit in 16 bits. */
+	if (!is_valid_u16($2))
+		semantic_error("urg_ptr value out of range");
+	$$ = $2;
+}
+;
+
+opt_tcp_options /* Absent options give an empty list; "<...>" gives NULL, which tcp_packet_spec allows only for outbound packets. */
+: { $$ = tcp_options_new(); }
+| '<' tcp_option_list '>' { $$ = $2; }
+| '<' ELLIPSIS '>' { $$ = NULL; /* FLAG_OPTIONS_NOCHECK */ }
+;
+
+tcp_option_list
+: tcp_option {
+	/* The first option creates the list it is appended to. */
+	$$ = tcp_options_new();
+	if (tcp_options_append($$, $1))
+		semantic_error("TCP option list too long");
+}
+| tcp_option_list ',' tcp_option {
+	/* Later options are appended to the list built so far. */
+	if (tcp_options_append($1, $3))
+		semantic_error("TCP option list too long");
+	$$ = $1;
+}
+;
+
+opt_tcp_fast_open_cookie /* Optional cookie; a freshly allocated empty string when absent. */
+: { $$ = strdup(""); }
+| hex_blob { $$ = $1; }
+;
+
+hex_blob /* A blob of hex digits, returned as a heap string. */
+: WORD { $$ = $1; }
+| INTEGER { $$ = strdup(yytext); } /* NOTE(review): copies the scanner's current text, not the token value; presumably yytext still holds this INTEGER's digits — confirm */
+;
+
+tcp_option /* One TCP option; each alternative allocates the option and fills in its payload. */
+: NOP { $$ = tcp_option_new(TCPOPT_NOP, 1); }
+| EOL { $$ = tcp_option_new(TCPOPT_EOL, 1); }
+| MSS INTEGER {
+ $$ = tcp_option_new(TCPOPT_MAXSEG, TCPOLEN_MAXSEG);
+ if (!is_valid_u16($2)) {
+ semantic_error("mss value out of range");
+ }
+ $$->data.mss.bytes = htons($2); /* stored in network byte order */
+}
+| WSCALE INTEGER {
+ $$ = tcp_option_new(TCPOPT_WINDOW, TCPOLEN_WINDOW);
+ if (!is_valid_u8($2)) {
+ semantic_error("window scale shift count out of range");
+ }
+ $$->data.window_scale.shift_count = $2;
+}
+| SACKOK {
+ $$ = tcp_option_new(TCPOPT_SACK_PERMITTED,
+ TCPOLEN_SACK_PERMITTED);
+}
+| SACK sack_block_list {
+ $$ = $2; /* the block list is already a complete SACK option */
+}
+| MD5 hex_blob {
+ char *error = NULL;
+ $$ = new_md5_option($2, &error);
+ free($2); /* the blob string is released after the option is built */
+ if ($$ == NULL) {
+ assert(error != NULL);
+ semantic_error(error);
+ free(error);
+ }
+}
+| TIMESTAMP VAL INTEGER ECR INTEGER {
+ u32 val, ecr;
+ $$ = tcp_option_new(TCPOPT_TIMESTAMP, TCPOLEN_TIMESTAMP);
+ if (!is_valid_u32($3)) {
+ semantic_error("ts val out of range");
+ }
+ if (!is_valid_u32($5)) {
+ semantic_error("ecr val out of range");
+ }
+ val = $3;
+ ecr = $5;
+ $$->data.time_stamp.val = htonl(val); /* both stored in network byte order */
+ $$->data.time_stamp.ecr = htonl(ecr);
+}
+| FAST_OPEN opt_tcp_fast_open_cookie {
+ char *error = NULL;
+ $$ = new_tcp_fast_open_option($2, &error, false); /* differs from the FAST_OPEN_EXP alternative only in this last argument */
+ free($2);
+ if ($$ == NULL) {
+ assert(error != NULL);
+ semantic_error(error);
+ free(error);
+ }
+}
+| FAST_OPEN_EXP opt_tcp_fast_open_cookie {
+ char *error = NULL;
+ $$ = new_tcp_fast_open_option($2, &error, true);
+ free($2);
+ if ($$ == NULL) {
+ assert(error != NULL);
+ semantic_error(error);
+ free(error);
+ }
+}
+;
+
+sack_block_list /* One or more SACK blocks merged into a single SACK option. */
+: sack_block { $$ = $1; }
+| sack_block_list sack_block {
+ const int list_block_bytes = $1->length - 2; /* option length minus the 2 bytes that sack_block adds on top of the block itself */
+ assert(list_block_bytes > 0);
+ assert((list_block_bytes % sizeof(struct sack_block)) == 0);
+ const int num_blocks = list_block_bytes / sizeof(struct sack_block);
+ /* Append this SACK block to the end of the array of blocks. */
+ memcpy($1->data.sack.block + num_blocks, $2->data.sack.block,
+ sizeof(struct sack_block)); /* NOTE(review): no check that num_blocks stays within the capacity of data.sack.block — confirm the bound is enforced elsewhere */
+ $1->length += sizeof(struct sack_block);
+ free($2); /* the single-block option is no longer needed once copied */
+ $$ = $1;
+}
+;
+
+sack_block /* "left:right" pair, returned as a SACK option holding exactly one block. */
+: INTEGER ':' INTEGER {
+ $$ = tcp_option_new(TCPOPT_SACK, 2 + sizeof(struct sack_block));
+ if (!is_valid_u32($1)) {
+ semantic_error("TCP SACK left sequence number out of range");
+ }
+ if (!is_valid_u32($3)) {
+ semantic_error("TCP SACK right sequence number out of range");
+ }
+ $$->data.sack.block[0].left = htonl($1); /* edges stored in network byte order */
+ $$->data.sack.block[0].right = htonl($3);
+}
+;
+
+syscall_spec
+: opt_end_time function_name function_arguments '='
+  expression opt_errno opt_note {
+	/* Gather the parts of one system call line into a zero-initialized
+	 * syscall_spec record.
+	 */
+	struct syscall_spec *spec = calloc(1, sizeof(*spec));
+
+	spec->end_usecs = $1;
+	spec->name = $2;
+	spec->arguments = $3;
+	spec->result = $5;
+	spec->error = $6;
+	spec->note = $7;
+	$$ = spec;
+}
+;
+
+opt_end_time /* Optional "... <time>" end time; SYSCALL_NON_BLOCKING when absent. */
+: { $$ = SYSCALL_NON_BLOCKING; }
+| ELLIPSIS time { $$ = $2; }
+;
+
+function_name /* The system call name; also records the current script line. */
+: WORD { $$ = $1; current_script_line = yylineno; }
+;
+
+function_arguments /* Parenthesized argument list; NULL for "()". */
+: '(' ')' { $$ = NULL; }
+| '(' expression_list ')' { $$ = $2; }
+;
+
+expression_list /* Comma-separated expressions, accumulated into one list. */
+: expression { $$ = new_expression_list($1); }
+| expression_list ',' expression { $$ = $1; expression_list_append($1, $3); }
+;
+
+/* Any value that may appear as a system call argument or result.  Most
+ * alternatives simply pass through the expression built by a sub-rule.
+ */
+expression
+: ELLIPSIS { $$ = new_expression(EXPR_ELLIPSIS); }
+| any_int { $$ = $1; }
+| WORD {
+	struct expression *word = new_expression(EXPR_WORD);
+
+	word->value.string = $1;
+	$$ = word;
+}
+| STRING {
+	struct expression *str = new_expression(EXPR_STRING);
+
+	str->value.string = $1;
+	str->format = "\"%s\"";
+	$$ = str;
+}
+| STRING ELLIPSIS {
+	struct expression *str = new_expression(EXPR_STRING);
+
+	str->value.string = $1;
+	str->format = "\"%s\"...";
+	$$ = str;
+}
+| binary_expression { $$ = $1; }
+| array { $$ = $1; }
+| inaddr { $$ = $1; }
+| in6addr { $$ = $1; }
+| sockaddr { $$ = $1; }
+| msghdr { $$ = $1; }
+| iovec { $$ = $1; }
+| pollfd { $$ = $1; }
+| linger { $$ = $1; }
+| mpls_stack_expression { $$ = $1; }
+| cmsg_expr { $$ = $1; }
+| scm_timestamping_expr { $$ = $1; }
+| sub_expr_list { $$ = $1; }
+| sock_extended_err_expr { $$ = $1; }
+| '{' gre_header_expression '}' { $$ = $2; }
+| epollev { $$ = $1; }
+;
+
+any_int /* An integer expression written in either decimal or hex. */
+: decimal_integer { $$ = $1; }
+| hex_integer { $$ = $1; }
+;
+
+decimal_integer /* Integer expression created with the "%ld" format string. */
+: INTEGER {
+ $$ = new_integer_expression($1, "%ld");
+}
+;
+
+hex_integer /* Integer expression created with the "%#lx" format string. */
+: HEX_INTEGER {
+ $$ = new_integer_expression($1, "%#lx");
+}
+;
+
+binary_expression
+: expression '|' expression {	/* bitwise OR */
+	struct expression *result = new_expression(EXPR_BINARY);
+	struct binary_expression *node = malloc(sizeof(*node));
+
+	node->op = strdup("|");
+	node->lhs = $1;
+	node->rhs = $3;
+	result->value.binary = node;
+	$$ = result;
+}
+| WORD '=' expression {	/* symbol = value */
+	struct expression *result = new_expression(EXPR_BINARY);
+	struct binary_expression *node = malloc(sizeof(*node));
+
+	node->op = strdup("=");
+	/* The symbol on the left becomes a word expression of its own. */
+	node->lhs = new_expression(EXPR_WORD);
+	node->lhs->value.string = $1;
+	node->rhs = $3;
+	result->value.binary = node;
+	$$ = result;
+}
+;
+
+array
+: '[' ']' {
+	/* An empty array is a list expression with a NULL list. */
+	struct expression *list = new_expression(EXPR_LIST);
+
+	list->value.list = NULL;
+	$$ = list;
+}
+| '[' expression_list ']' {
+	struct expression *list = new_expression(EXPR_LIST);
+
+	list->value.list = $2;
+	$$ = list;
+}
+;
+
+/* inet_addr("a.b.c.d"): an integer expression holding the address as
+ * returned by inet_addr(), printed with the "%#lx" format.
+ */
+inaddr
+: INET_ADDR '(' STRING ')' {
+	__be32 ip_address = inet_addr($3);
+
+	/* The string is only needed for the conversion above; other rules
+	 * that keep a STRING store the pointer, so it is ours to release.
+	 */
+	free($3);
+	$$ = new_integer_expression(ip_address, "%#lx");
+}
+;
+
+/* inet6_addr("..."): an EXPR_IN6_ADDR expression holding the parsed
+ * IPv6 address.
+ */
+in6addr
+: INET6_ADDR '(' STRING ')' {
+	struct in6_addr ipv6_address;
+	const int parsed = inet_pton(AF_INET6, $3, &ipv6_address);
+
+	/* The string is only needed for the conversion above. */
+	free($3);
+	if (parsed != 1)
+		semantic_error("cannot parse in6_addr");
+
+	$$ = new_expression(EXPR_IN6_ADDR);
+	$$->value.address_ipv6 = ipv6_address;
+}
+;
+
+/* A socket address literal.  The sa_family word selects between an IPv4
+ * and an IPv6 socket address; the port is stored in network byte order
+ * and the address string is parsed with inet_pton().  Any other family
+ * is reported as an error and yields NULL rather than an unset value.
+ */
+sockaddr
+: '{' SA_FAMILY '=' WORD ','
+  SIN_PORT '=' _HTONS_ '(' INTEGER ')' ','
+  SIN_ADDR '=' INET_ADDR '(' STRING ')' '}' {
+	$$ = NULL;
+
+	/* htons() would silently truncate a port wider than 16 bits. */
+	if (!is_valid_u16($10))
+		semantic_error("sin_port out of range");
+
+	if (strcmp($4, "AF_INET") == 0) {
+		struct sockaddr_in *ipv4 = calloc(1, sizeof(*ipv4));
+
+		ipv4->sin_family = AF_INET;
+		ipv4->sin_port = htons($10);
+		if (inet_pton(AF_INET, $17, &ipv4->sin_addr) == 1) {
+			$$ = new_expression(EXPR_SOCKET_ADDRESS_IPV4);
+			$$->value.socket_address_ipv4 = ipv4;
+		} else {
+			free(ipv4);
+			semantic_error("invalid IPv4 address");
+		}
+	} else if (strcmp($4, "AF_INET6") == 0) {
+		struct sockaddr_in6 *ipv6 = calloc(1, sizeof(*ipv6));
+
+		ipv6->sin6_family = AF_INET6;
+		ipv6->sin6_port = htons($10);
+		if (inet_pton(AF_INET6, $17, &ipv6->sin6_addr) == 1) {
+			$$ = new_expression(EXPR_SOCKET_ADDRESS_IPV6);
+			$$->value.socket_address_ipv6 = ipv6;
+		} else {
+			free(ipv6);
+			semantic_error("invalid IPv6 address");
+		}
+	} else {
+		semantic_error("unsupported sa_family in sockaddr");
+	}
+
+	/* Neither the family word nor the address string is stored. */
+	free($4);
+	free($17);
+}
+;
+
+msghdr
+: '{' MSG_NAME '(' ELLIPSIS ')' '=' ELLIPSIS ','
+  MSG_IOV '(' decimal_integer ')' '=' array ','
+  MSG_FLAGS '=' expression
+  opt_cmsg '}' {
+	/* Build a msghdr expression.  The name and its length are always
+	 * ellipsis expressions; the remaining fields come from the script.
+	 */
+	struct msghdr_expr *hdr = calloc(1, sizeof(*hdr));
+	struct expression *result = new_expression(EXPR_MSGHDR);
+
+	result->value.msghdr = hdr;
+	hdr->msg_name = new_expression(EXPR_ELLIPSIS);
+	hdr->msg_namelen = new_expression(EXPR_ELLIPSIS);
+	hdr->msg_iov = $14;
+	hdr->msg_iovlen = $11;
+	hdr->msg_control = $19;
+	hdr->msg_flags = $18;
+	$$ = result;
+}
+;
+
+opt_cmsg /* Optional ", msg_control = [...]"; a new list expression when absent. */
+: { $$ = new_expression(EXPR_LIST); }
+| ',' MSG_CONTROL '=' array { $$ = $4; }
+;
+
+cmsg_expr
+: '{' CMSG_LEVEL '=' expression ','
+  CMSG_TYPE '=' expression ','
+  CMSG_DATA '=' expression '}' {
+	/* One control message: its level, type and data expressions. */
+	struct cmsg_expr *cmsg = calloc(1, sizeof(*cmsg));
+	struct expression *result = new_expression(EXPR_CMSG);
+
+	result->value.cmsg = cmsg;
+	cmsg->cmsg_level = $4;
+	cmsg->cmsg_type = $8;
+	cmsg->cmsg_data = $12;
+	$$ = result;
+}
+;
+
+scm_timestamping_expr
+: '{' SCM_SEC '=' INTEGER ','
+  SCM_NSEC '=' INTEGER '}' {
+	/* Only the first timestamp slot is filled in; the rest of the
+	 * record stays zeroed by calloc().
+	 */
+	struct scm_timestamping_expr *stamps = calloc(1, sizeof(*stamps));
+
+	stamps->ts[0].tv_sec = $4;
+	stamps->ts[0].tv_nsec = $8;
+
+	$$ = new_expression(EXPR_SCM_TIMESTAMPING);
+	$$->value.scm_timestamping = stamps;
+}
+;
+
+sub_expr_list /* A brace-enclosed expression list, wrapped as an EXPR_LIST expression. */
+: '{' expression_list '}' {
+ $$ = new_expression(EXPR_LIST);
+ $$->value.list = $2;
+}
+;
+
+sock_extended_err_expr
+: '{' EE_ERRNO '=' expression ','
+  EE_ORIGIN '=' expression ','
+  EE_TYPE '=' expression ','
+  EE_CODE '=' expression ','
+  EE_INFO '=' expression ','
+  EE_DATA '=' expression '}' {
+	/* Six named fields, each given as an arbitrary expression. */
+	struct sock_extended_err_expr *ee = calloc(1, sizeof(*ee));
+
+	ee->ee_errno = $4;
+	ee->ee_origin = $8;
+	ee->ee_type = $12;
+	ee->ee_code = $16;
+	ee->ee_info = $20;
+	ee->ee_data = $24;
+
+	$$ = new_expression(EXPR_SOCK_EXTENDED_ERR);
+	$$->value.sock_extended_err = ee;
+}
+;
+
+iovec
+: '{' ELLIPSIS ',' decimal_integer '}' {
+	/* The base is always an ellipsis; only the length is specified. */
+	struct iovec_expr *iov = calloc(1, sizeof(*iov));
+	struct expression *result = new_expression(EXPR_IOVEC);
+
+	result->value.iovec = iov;
+	iov->iov_base = new_expression(EXPR_ELLIPSIS);
+	iov->iov_len = $4;
+	$$ = result;
+}
+;
+
+pollfd
+: '{' FD '=' expression ',' EVENTS '=' expression opt_revents '}' {
+	/* A pollfd literal: fd, requested events and optional revents. */
+	struct pollfd_expr *pfd = calloc(1, sizeof(*pfd));
+	struct expression *result = new_expression(EXPR_POLLFD);
+
+	result->value.pollfd = pfd;
+	pfd->fd = $4;
+	pfd->events = $8;
+	pfd->revents = $9;
+	$$ = result;
+}
+;
+
+/* An epoll event literal: the events mask plus exactly one of the data
+ * members fd, ptr, u32 or u64.
+ */
+epollev
+: '{' EVENTS '=' expression ',' FD '=' expression '}' {
+	struct epollev_expr *ev = calloc(1, sizeof(*ev));
+	struct expression *result = new_expression(EXPR_EPOLLEV);
+
+	result->value.epollev = ev;
+	ev->events = $4;
+	ev->fd = $8;
+	$$ = result;
+}
+| '{' EVENTS '=' expression ',' PTR '=' expression '}' {
+	struct epollev_expr *ev = calloc(1, sizeof(*ev));
+	struct expression *result = new_expression(EXPR_EPOLLEV);
+
+	result->value.epollev = ev;
+	ev->events = $4;
+	ev->ptr = $8;
+	$$ = result;
+}
+| '{' EVENTS '=' expression ',' U32 '=' expression '}' {
+	struct epollev_expr *ev = calloc(1, sizeof(*ev));
+	struct expression *result = new_expression(EXPR_EPOLLEV);
+
+	result->value.epollev = ev;
+	ev->events = $4;
+	ev->u32 = $8;
+	$$ = result;
+}
+| '{' EVENTS '=' expression ',' U64 '=' expression '}' {
+	struct epollev_expr *ev = calloc(1, sizeof(*ev));
+	struct expression *result = new_expression(EXPR_EPOLLEV);
+
+	result->value.epollev = ev;
+	ev->events = $4;
+	ev->u64 = $8;
+	$$ = result;
+}
+;
+
+opt_revents /* Optional ", revents = <expr>"; the integer expression 0 when absent. */
+: { $$ = new_integer_expression(0, "%ld"); }
+| ',' REVENTS '=' expression { $$ = $4; }
+;
+
+linger /* A linger literal: the two integers are stored directly in the expression. */
+: '{' ONOFF '=' INTEGER ',' LINGER '=' INTEGER '}' {
+ $$ = new_expression(EXPR_LINGER);
+ $$->value.linger.l_onoff = $4;
+ $$->value.linger.l_linger = $8;
+}
+;
+
+mpls_stack_expression /* A brace-enclosed MPLS label stack used as an expression. */
+:
+'{' mpls_stack '}' {
+ $$ = new_expression(EXPR_MPLS_STACK);
+ $$->value.mpls_stack = $2; /* the expression keeps the stack pointer */
+}
+;
+
+opt_errno
+: { $$ = NULL; }
+| WORD note {
+	/* The errno macro name followed by its parenthesized text. */
+	struct errno_spec *spec = malloc(sizeof(*spec));
+
+	spec->errno_macro = $1;
+	spec->strerror = $2;
+	$$ = spec;
+}
+;
+
+opt_note /* Optional parenthesized note; NULL when absent. */
+: { $$ = NULL; }
+| note { $$ = $1; }
+;
+
+note /* Words in parentheses, returned as the string built by word_list. */
+: '(' word_list ')' { $$ = $2; }
+;
+
+/* One or more words joined by single spaces into one heap-allocated
+ * string.  The FLAGS keyword is accepted as the first word and becomes
+ * the text "flags".
+ */
+word_list
+: WORD { $$ = $1; }
+| FLAGS { $$ = strdup("flags"); }
+| word_list WORD {
+	char *joined = NULL;
+
+	/* asprintf() leaves its output pointer undefined on failure, so
+	 * check the result instead of using the pointer blindly.
+	 */
+	if (asprintf(&joined, "%s %s", $1, $2) < 0) {
+		joined = NULL;
+		semantic_error("out of memory while building note text");
+	}
+	free($1);
+	free($2);
+	$$ = joined;
+}
+;
+
+command_spec
+: BACK_QUOTED {
+	/* A back-quoted shell command; also records the script line. */
+	struct command_spec *cmd = malloc(sizeof(*cmd));
+
+	cmd->command_line = $1;
+	current_script_line = yylineno;
+	$$ = cmd;
+}
+;
+
+code_spec
+: CODE {
+	/* A code snippet; every field other than the text stays zeroed. */
+	struct code_spec *code = calloc(1, sizeof(*code));
+
+	code->text = $1;
+	current_script_line = yylineno;
+	$$ = code;
+}
+;
+
+opt_cleanup_command /* Optional cleanup command; produces no value itself. */
+: { }
+| cleanup_command { }
+;
+
+cleanup_command /* Records the command on the output script and in cleanup_cmd. */
+: command_spec {
+ out_script->cleanup_command = $1;
+ cleanup_cmd = out_script->cleanup_command->command_line; /* same string pointer as stored on the script, not a copy */
+}
+;
diff --git a/test/packetdrill/pipe.c b/test/packetdrill/pipe.c
new file mode 100644
index 0000000..e4297b2
--- /dev/null
+++ b/test/packetdrill/pipe.c
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: weiwan@google.com (Wei Wang)
+ *
+ * Implementation for pipe related state and logic.
+ */
+
+#include "pipe.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include "run.h"
+
+/* Release a pipe object.  A NULL pointer is ignored, which matters
+ * because pipe_close() passes on whatever fd_to_pipe() returns, and that
+ * is NULL for a descriptor that is not a pipe.
+ */
+void pipe_free(struct pipe *pipe)
+{
+	if (pipe == NULL)
+		return;
+
+	/* Clear the object before releasing it. */
+	memset(pipe, 0, sizeof(*pipe));
+	free(pipe);
+}
+
+/* Close callback for pipe descriptors: converts the generic fd to a pipe
+ * and releases it.  The state argument is not used.
+ */
+void pipe_close(struct state *state, struct fd_state *fd)
+{
+	struct pipe *pipe = fd_to_pipe(fd);
+
+	pipe_free(pipe);
+}
+
+/* Global info about pipe descriptors that point to pipes. */
+struct fd_ops pipe_ops = {
+ .type = FD_PIPE, /* the tag that fd_to_pipe() checks */
+ .close = pipe_close, /* releases the pipe object */
+};
+
+/* Allocate a zero-initialized pipe object, attach the pipe operations
+ * and register its descriptor with the given state.
+ *
+ * Returns the new pipe, or NULL if the allocation failed (in which case
+ * nothing is registered).
+ */
+struct pipe *pipe_new(struct state *state)
+{
+	struct pipe *pipe = calloc(1, sizeof(*pipe));
+
+	/* Do not dereference a failed allocation. */
+	if (pipe == NULL)
+		return NULL;
+
+	pipe->fd.ops = &pipe_ops;
+	state_add_fd(state, to_fd(pipe));
+	return pipe;
+}
diff --git a/test/packetdrill/pipe.h b/test/packetdrill/pipe.h
new file mode 100644
index 0000000..6e3561b
--- /dev/null
+++ b/test/packetdrill/pipe.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: weiwan@google.com (Wei Wang)
+ *
+ * Interface for tracking pipes in the kernel under test.
+ */
+
+#ifndef __PIPE_H__
+#define __PIPE_H__
+
+#include "types.h"
+
+#include "fd_state.h"
+
+/* The runtime state for a pipe. It currently holds only the generic fd state. */
+struct pipe {
+ /* NOTE: struct fd_state must be first field in all fd flavors. */
+ struct fd_state fd; /* info about fd for this pipe; fd_to_pipe() casts a pointer to it back to the pipe */
+};
+
+/* Return the pipe behind a generic fd, or NULL when the fd pointer is
+ * NULL or its ops are not of type FD_PIPE.
+ */
+static inline struct pipe *fd_to_pipe(struct fd_state *fd)
+{
+	if (fd == NULL || fd->ops->type != FD_PIPE)
+		return NULL;
+
+	return (struct pipe *)fd;
+}
+
+struct state;
+
+/* Allocate and return a new pipe object. */
+extern struct pipe *pipe_new(struct state *state);
+
+#endif /* __PIPE_H__ */
diff --git a/test/packetdrill/platforms.h b/test/packetdrill/platforms.h
new file mode 100644
index 0000000..b61aad6
--- /dev/null
+++ b/test/packetdrill/platforms.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Declarations for platform-specific information.
+ */
+
+#ifndef __PLATFORMS_H__
+#define __PLATFORMS_H__
+
+
+/* ------------------------- Linux --------------------- */
+
+/* Test __linux__ as well as the bare "linux" name: the bare name is a
+ * legacy macro that compilers only predefine in GNU dialects, so a
+ * strict -std=c99 or -std=c11 build on Linux would otherwise fall
+ * through to the non-Linux typedefs below.
+ */
+#if defined(__linux__) || defined(linux)
+
+#include <features.h>
+#include <linux/types.h>
+
+#define HAVE_OPEN_MEMSTREAM 1
+#define HAVE_FMEMOPEN 1
+#define TUN_PATH "/dev/net/tun"
+#define HAVE_TCP_INFO 1
+#define HAVE_TCP_CC_INFO 1
+#define HAVE_SO_MEMINFO 1
+
+#include "uapi_linux.h"
+
+#else
+
+/* The underscore variants of the kernel-style names are defined in
+ * linux/types.h, but we must define them explicitly for other platforms. */
+typedef u8 __u8;
+typedef u16 __u16;
+typedef u32 __u32;
+typedef u64 __u64;
+
+/* We also use kernel-style names for endian-specific unsigned types. */
+typedef u16 __le16;
+typedef u16 __be16;
+typedef u32 __le32;
+typedef u32 __be32;
+typedef u64 __le64;
+typedef u64 __be64;
+
+typedef u16 __sum16;
+typedef u32 __wsum;
+
+#endif /* linux */
+
+
+/* ------------------------- FreeBSD --------------------- */
+
+#if defined(__FreeBSD__)
+
+#define USE_LIBPCAP 1
+#define TUN_PATH "/dev/tun0" /* fixed to the first tun device */
+#define TUN_DEV "tun0"
+
+#define HAVE_TCP_INFO 1
+#if (__FreeBSD_version < 1000000 && __FreeBSD_version > 902000) || __FreeBSD_version > 1000028 /* fmemopen is only declared available for these release ranges */
+#define HAVE_FMEMOPEN 1
+#endif
+
+#include "open_memstream.h" /* presumably the project's fallback declarations; confirm */
+#include "fmemopen.h"
+
+#endif /* __FreeBSD__ */
+
+/* ------------------------- OpenBSD --------------------- */
+
+#if defined(__OpenBSD__)
+
+#define USE_LIBPCAP 1
+#define TUN_PATH "/dev/tun0" /* fixed to the first tun device */
+#define TUN_DEV "tun0"
+
+#define HAVE_TCP_INFO 0 /* NOTE(review): defined as 0 rather than left undefined, so an #ifdef test would still see it; confirm consumers use #if */
+
+#include "open_memstream.h" /* presumably the project's fallback declarations; confirm */
+#include "fmemopen.h"
+
+#define __always_inline __attribute__((__always_inline__)) /* kernel-style spelling mapped to the compiler attribute */
+
+#endif /* __OpenBSD__ */
+
+/* ------------------------- NetBSD --------------------- */
+
+#if defined(__NetBSD__)
+
+#define USE_LIBPCAP 1
+#define TUN_PATH "/dev/tun0" /* fixed to the first tun device */
+#define TUN_DEV "tun0"
+
+#define HAVE_TCP_INFO 0 /* NOTE(review): defined as 0 rather than left undefined, so an #ifdef test would still see it; confirm consumers use #if */
+
+#define HAVE_FMEMOPEN 1
+#include "open_memstream.h" /* presumably the project's fallback declarations; confirm */
+
+#define __always_inline __attribute__((__always_inline__)) /* kernel-style spelling mapped to the compiler attribute */
+
+#endif /* __NetBSD__ */
+
+
+#endif /* __PLATFORMS_H__ */
diff --git a/test/packetdrill/run.c b/test/packetdrill/run.c
new file mode 100644
index 0000000..37ec449
--- /dev/null
+++ b/test/packetdrill/run.c
@@ -0,0 +1,695 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Implementation for the test script execution module.
+ */
+
+#include "run.h"
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <netinet/in.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/times.h>
+#include <unistd.h>
+#include "ip.h"
+#include "logging.h"
+#include "netdev.h"
+#include "wire_client_netdev.h"
+#include "parse.h"
+#include "run_command.h"
+#include "run_packet.h"
+#include "run_system_call.h"
+#include "script.h"
+#include "so_testing.h"
+#include "socket.h"
+#include "system.h"
+#include "tcp.h"
+#include "tcp_options.h"
+
+/* MAX_SPIN_USECS is the maximum amount of time (in microseconds) to
+ * spin waiting for an event. We sleep up until this many microseconds
+ * before a script event. We get the best results on tickless
+ * (CONFIG_NO_HZ=y) kernels when we try to sleep until the exact jiffy
+ * of a script event; this reduces the staleness/noise we see in
+ * jiffies values on tickless kernels, since the kernel updates the
+ * jiffies value at the time we wake, and then we execute the test
+ * event shortly thereafter. The value below was chosen experimentally
+ * based on experiences on a 2.2GHz machine for which there was a
+ * measured overhead of roughly 15 usec for the unlock/usleep/lock
+ * sequence that wait_for_event() must execute while waiting
+ * for the next event.
+ */
+const int MAX_SPIN_USECS = 20;
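+/* True once run_script() has successfully executed the script's init
+ * command; run_cleanup_command() only runs the cleanup command when set.
+ */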
+bool init_cmd_exed = false;
+
+/* Final command to always execute at end of script, in order to clean up: */
+const char *cleanup_cmd;
+
+/* Path of currently-executing script, for use in cleanup command errors: */
+const char *script_path;
+
+/* Allocate and initialize the run-time state for one test.
+ *
+ * The state is zero-filled by calloc(), so every field not assigned
+ * below (event, last_event, wire_client, so_instance, the start times)
+ * begins as NULL/0. The state's mutex is returned LOCKED: run_lock() is
+ * called here and the matching run_unlock() is in state_free().
+ *
+ * Allocation or mutex-initialization failure is reported through
+ * die_perror().
+ */
+struct state *state_new(struct config *config,
+			struct script *script,
+			struct netdev *netdev)
+{
+	struct state *state = calloc(1, sizeof(*state));
+	int rc;
+
+	/* Without this check an out-of-memory condition would surface as a
+	 * NULL dereference inside pthread_mutex_init().
+	 */
+	if (state == NULL)
+		die_perror("calloc");
+
+	/* pthread functions return the error number instead of setting
+	 * errno, so copy it over before reporting.
+	 */
+	rc = pthread_mutex_init(&state->mutex, NULL);
+	if (rc != 0) {
+		errno = rc;
+		die_perror("pthread_mutex_init");
+	}
+
+	run_lock(state);
+
+	state->config = config;
+	state->script = script;
+	state->netdev = netdev;
+	state->packets = packets_new(state);
+	state->syscalls = syscalls_new(state);
+	state->code = code_new(config);
+	state->fds = NULL;
+	state->num_events = 0;
+	return state;
+}
+
+/* Push fd onto the front of the state's list of run-time file
+ * descriptors.
+ */
+void state_add_fd(struct state *state, struct fd_state *fd)
+{
+	struct fd_state *old_head = state->fds;
+
+	state->fds = fd;
+	fd->next = old_head;
+}
+
+/* Walk the list of run-time file descriptors and invoke each entry's
+ * close handler. The handlers are what close the sockets, free the
+ * socket structs, and send a RST packet to clean up kernel state for
+ * each connection. The successor is read before the handler runs, so
+ * the entry is not touched again after its close call.
+ * TODO(ncardwell): centralize error handling and ensure test errors
+ * always result in a call to these clean-up functions, so we can make
+ * sure to reset connections in all cases.
+ */
+static void close_all_fds(struct state *state)
+{
+	struct fd_state *cur, *next;
+
+	for (cur = state->fds; cur != NULL; cur = next) {
+		next = cur->next;
+		cur->ops->close(state, cur);
+	}
+}
+
+/* Tear down and free all run-time state for a test.
+ *
+ * This also frees the netdev that was handed to state_new(), plus the
+ * wire client and SO instance if run_script() attached them. The mutex
+ * is unlocked here just before it is destroyed, matching the run_lock()
+ * taken in state_new(), so the caller is expected to still hold it.
+ * The order of the steps below matters; see the inline comments.
+ */
+void state_free(struct state *state)
+{
+ /* We have to stop the system call thread first, since it's using
+ * sockets that we want to close and reset.
+ */
+ syscalls_free(state, state->syscalls);
+
+ /* Then we close the sockets and reset the connections, while
+ * we still have a netdev for injecting reset packets to free
+ * per-connection kernel state.
+ */
+ close_all_fds(state);
+
+ netdev_free(state->netdev);
+ packets_free(state->packets);
+ code_free(state->code);
+
+ /* Both of these are optional: calloc() in state_new() leaves them
+ * NULL unless run_script() created them.
+ */
+ if (state->wire_client)
+ wire_client_free(state->wire_client);
+
+ if (state->so_instance)
+ so_instance_free(state->so_instance);
+
+ /* Release the lock taken in state_new() before destroying it. */
+ run_unlock(state);
+ if (pthread_mutex_destroy(&state->mutex) != 0)
+ die_perror("pthread_mutex_destroy");
+
+ memset(state, 0, sizeof(*state)); /* paranoia to help catch bugs */
+ free(state);
+}
+
+/* Return the current time of day in microseconds. When an SO instance
+ * is attached to the state, its gettimeofday hook is used in place of
+ * the libc gettimeofday().
+ */
+s64 now_usecs(struct state *state)
+{
+	struct so_instance *so = state->so_instance;
+	struct timeval now;
+	int rc;
+
+	if (so != NULL)
+		rc = so->ifc.gettimeofday(so->ifc.userdata, &now, NULL);
+	else
+		rc = gettimeofday(&now, NULL);
+
+	if (rc < 0)
+		die_perror("gettimeofday");
+
+	return timeval_to_usecs(&now);
+}
+
+/*
+ * Verify that something happened at the expected time.
+ *
+ * script_usecs (and, for the two range time types, script_usecs_end)
+ * are script times; live_usecs is the observed live time. Each is made
+ * relative to its own start time (state->script_start_time_usecs or
+ * state->live_start_time_usecs) before being compared, and
+ * state->config->tolerance_usecs of slack is allowed on either side.
+ * ANY_TIME always passes.
+ *
+ * Returns STATUS_OK when the time is acceptable. Otherwise returns
+ * STATUS_ERR and points *error at an asprintf()-allocated message that
+ * uses the given description.
+ * NOTE(review): the asprintf() results are not checked, so *error is
+ * not guaranteed to be valid if the allocation fails.
+ *
+ * WARNING: verify_time() should not be looking at state->event
+ * because in some cases (checking the finish time for blocking system
+ * calls) we call verify_time() at a time when state->event
+ * points at an event other than the one whose time we're currently
+ * checking.
+ */
+int verify_time(struct state *state, enum event_time_t time_type,
+ s64 script_usecs, s64 script_usecs_end,
+ s64 live_usecs, const char *description, char **error)
+{
+ s64 expected_usecs = script_usecs - state->script_start_time_usecs;
+ s64 expected_usecs_end = script_usecs_end -
+ state->script_start_time_usecs;
+ s64 actual_usecs = live_usecs - state->live_start_time_usecs;
+ int tolerance_usecs = state->config->tolerance_usecs;
+
+ DEBUGP("expected: %.3f actual: %.3f (secs)\n",
+ usecs_to_secs(script_usecs), usecs_to_secs(actual_usecs));
+
+ if (time_type == ANY_TIME)
+ return STATUS_OK;
+
+ /* Range types: accept anything inside [start, end], widened by the
+ * tolerance on both sides.
+ */
+ if (time_type == ABSOLUTE_RANGE_TIME ||
+ time_type == RELATIVE_RANGE_TIME) {
+ DEBUGP("expected_usecs_end %.3f\n",
+ usecs_to_secs(script_usecs_end));
+ if (actual_usecs < (expected_usecs - tolerance_usecs) ||
+ actual_usecs > (expected_usecs_end + tolerance_usecs)) {
+ if (time_type == ABSOLUTE_RANGE_TIME) {
+ asprintf(error,
+ "timing error: expected "
+ "%s in time range %.6f~%.6f sec "
+ "but happened at %.6f sec",
+ description,
+ usecs_to_secs(script_usecs),
+ usecs_to_secs(script_usecs_end),
+ usecs_to_secs(actual_usecs));
+ } else if (time_type == RELATIVE_RANGE_TIME) {
+ /* NOTE(review): this reads state->event despite the
+ * WARNING above; confirm state->event is always the
+ * event being checked for this time type.
+ */
+ s64 offset_usecs = state->event->offset_usecs;
+ asprintf(error,
+ "timing error: expected "
+ "%s in relative time range +%.6f~+%.6f "
+ "sec but happened at %+.6f sec",
+ description,
+ usecs_to_secs(script_usecs -
+ offset_usecs),
+ usecs_to_secs(script_usecs_end -
+ offset_usecs),
+ usecs_to_secs(actual_usecs -
+ offset_usecs));
+ }
+ return STATUS_ERR;
+ } else {
+ return STATUS_OK;
+ }
+ }
+
+ /* Point-in-time types: accept expected time +/- tolerance. */
+ if ((actual_usecs < (expected_usecs - tolerance_usecs)) ||
+ (actual_usecs > (expected_usecs + tolerance_usecs))) {
+ asprintf(error,
+ "timing error: "
+ "expected %s at %.6f sec but happened at %.6f sec; "
+ "tolerance %.6f sec",
+ description,
+ usecs_to_secs(script_usecs),
+ usecs_to_secs(actual_usecs),
+ usecs_to_secs(tolerance_usecs));
+ return STATUS_ERR;
+ } else {
+ return STATUS_OK;
+ }
+}
+
+/* Return a static string describing the given event, for error messages. */
+static const char *event_description(struct event *event)
+{
+ enum direction_t direction = DIRECTION_INVALID;
+
+ if ((event->type <= INVALID_EVENT) ||
+ (event->type >= NUM_EVENT_TYPES)) {
+ die("bogus event type: %d", event->type);
+ }
+ switch (event->type) {
+ case PACKET_EVENT:
+ direction = packet_direction(event->event.packet);
+ if (direction == DIRECTION_INBOUND)
+ return "inbound packet";
+ else if (direction == DIRECTION_OUTBOUND)
+ return "outbound packet";
+ else
+ assert(!"bad direction");
+ break;
+ case SYSCALL_EVENT:
+ return "system call start";
+ case COMMAND_EVENT:
+ return "command";
+ case CODE_EVENT:
+ return "data collection for code";
+ case INVALID_EVENT:
+ case NUM_EVENT_TYPES:
+ assert(!"bogus type");
+ break;
+ /* We omit default case so compiler catches missing values. */
+ }
+ return "invalid event";
+}
+
+/* Check the current event's scheduled time against live_usecs via
+ * verify_time(); on a mismatch, report the script path, line number and
+ * timing error through die().
+ */
+void check_event_time(struct state *state, s64 live_usecs)
+{
+	char *message = NULL;
+	const char *what = event_description(state->event);
+	struct event *current = state->event;
+	int status;
+
+	status = verify_time(state, current->time_type,
+			     current->time_usecs, current->time_usecs_end,
+			     live_usecs, what, &message);
+	if (!status)
+		return;
+
+	die("%s:%d: %s\n",
+	    state->config->script_path,
+	    state->event->line_number,
+	    message);
+}
+
+/* Report whether the event is an inbound packet event. Consecutive
+ * relative inbound packets are anchored to the previous packet's start
+ * time rather than to "now", so that CPU processing overhead does not
+ * accumulate across a run of packets.
+ */
+bool is_event_start_time_anchored(struct event *event)
+{
+	if (event->type != PACKET_EVENT)
+		return false;
+
+	return packet_direction(event->event.packet) == DIRECTION_INBOUND;
+}
+
+/* Turn the wildcard/relative times of an event into concrete script
+ * times. Events with any other time type are left untouched.
+ *
+ * The base offset is the previous event's time when both the previous
+ * and the current event are anchored (inbound packets); otherwise it is
+ * the time elapsed since the live start of the test. The offset is
+ * recorded in event->offset_usecs and added to the start time, to the
+ * end time of relative ranges, and to the end time of blocking system
+ * calls that use relative times.
+ */
+void adjust_relative_event_times(struct state *state, struct event *event)
+{
+	struct event *prev = state->last_event;
+	s64 base_usecs = 0;
+
+	switch (event->time_type) {
+	case ANY_TIME:
+	case RELATIVE_TIME:
+	case RELATIVE_RANGE_TIME:
+		break;
+	default:
+		return;
+	}
+
+	if (prev &&
+	    is_event_start_time_anchored(prev) &&
+	    is_event_start_time_anchored(event)) {
+		base_usecs = prev->time_usecs;
+	} else {
+		base_usecs = now_usecs(state) - state->live_start_time_usecs;
+	}
+
+	event->offset_usecs = base_usecs;
+	event->time_usecs += base_usecs;
+
+	if (event->time_type == RELATIVE_RANGE_TIME) {
+		event->time_usecs_end += base_usecs;
+	} else if (event->time_type == RELATIVE_TIME &&
+		   event->type == SYSCALL_EVENT &&
+		   is_blocking_syscall(event->event.syscall)) {
+		/* Blocking system calls carry their own end time. */
+		event->event.syscall->end_usecs += base_usecs;
+	}
+}
+
+/* Wait until the live time at which the current event (state->event)
+ * is scheduled, then verify the timing with check_event_time().
+ *
+ * The global lock is released only around the sleep below and is
+ * re-taken before looping, so the caller is expected to hold it. Before
+ * the first event has executed (state->num_events == 0) there is no
+ * sleep and no timing check: the loop just spins until the event time.
+ */
+void wait_for_event(struct state *state)
+{
+ s64 event_usecs =
+ script_time_to_live_time_usecs(
+ state, state->event->time_usecs);
+ DEBUGP("waiting until %lld -- now is %lld\n",
+ event_usecs, now_usecs(state));
+ while (1) {
+ const s64 wait_usecs = event_usecs - now_usecs(state);
+ if (wait_usecs <= 0)
+ break;
+
+ /* If we're waiting a long time, and we are on an OS
+ * that we know has a fine-grained usleep(), then
+ * usleep() instead of spinning on the CPU.
+ */
+#ifdef linux
+ /* Since the scheduler may not wake us up precisely
+ * when we tell it to, sleep until just before the
+ * event we're waiting for and then spin.
+ */
+ if (state->num_events > 0 && wait_usecs > MAX_SPIN_USECS) {
+ run_unlock(state);
+ /* An attached SO instance supplies its own usleep hook. */
+ if (state->so_instance) {
+ state->so_instance->ifc.usleep(
+ state->so_instance->ifc.userdata,
+ wait_usecs - MAX_SPIN_USECS);
+ } else {
+ usleep(wait_usecs - MAX_SPIN_USECS);
+ }
+ run_lock(state);
+ }
+#endif
+
+ /* Whatever wait remains (about MAX_SPIN_USECS after a sleep,
+ * or all of it when we did not sleep) is spent spinning
+ * around this loop.
+ */
+ }
+
+ if (state->num_events > 0)
+ check_event_time(state, now_usecs(state));
+}
+
+/* Advance state->event to the next event in the script.
+ *
+ * On the first call (state->event == NULL) this selects the head of the
+ * script's event list and records its time as the script start time;
+ * the first event must be at time 0. On later calls the current event
+ * becomes state->last_event and its time is saved in
+ * state->script_last_time_usecs.
+ *
+ * Returns STATUS_OK with state->event == NULL when there are no more
+ * events (including the case of a script with no events at all).
+ * Returns STATUS_ERR and sets *error to an asprintf()-allocated message
+ * when the first event is not at time 0 or when two consecutive
+ * absolute-time events go backward in time.
+ */
+int get_next_event(struct state *state, char **error)
+{
+	DEBUGP("gettimeofday: %.6f\n", now_usecs(state)/1000000.0);
+
+	if (state->event == NULL) {
+		/* First event. */
+		state->event = state->script->event_list;
+
+		/* An empty event list means there is nothing to run;
+		 * report "script is done" instead of dereferencing NULL.
+		 */
+		if (state->event == NULL)
+			return STATUS_OK;
+
+		state->script_start_time_usecs = state->event->time_usecs;
+		if (state->event->time_usecs != 0) {
+			asprintf(error,
+				 "%s:%d: first event should be at time 0\n",
+				 state->config->script_path,
+				 state->event->line_number);
+			return STATUS_ERR;
+		}
+	} else {
+		/* Move to the next event. */
+		state->script_last_time_usecs = state->event->time_usecs;
+		state->last_event = state->event;
+		state->event = state->event->next;
+	}
+
+	if (state->event == NULL)
+		return STATUS_OK;	/* script is done */
+
+	assert((state->event->type > INVALID_EVENT) &&
+	       (state->event->type < NUM_EVENT_TYPES));
+
+	/* Only absolute times can be compared across events. */
+	if (state->last_event &&
+	    is_event_time_absolute(state->last_event) &&
+	    is_event_time_absolute(state->event) &&
+	    state->event->time_usecs < state->script_last_time_usecs) {
+		asprintf(error,
+			 "%s:%d: time goes backward in script "
+			 "from %lld usec to %lld usec\n",
+			 state->config->script_path,
+			 state->event->line_number,
+			 state->script_last_time_usecs,
+			 state->event->time_usecs);
+		return STATUS_ERR;
+	}
+	return STATUS_OK;
+}
+
+/* Execute a packet event locally. A warning is printed to stderr and
+ * its message freed; an error is reported through die().
+ */
+static void run_local_packet_event(struct state *state, struct event *event,
+				   struct packet *packet)
+{
+	char *message = NULL;
+	const int status = run_packet_event(state, event, packet, &message);
+
+	switch (status) {
+	case STATUS_WARN:
+		fprintf(stderr, "%s", message);
+		free(message);
+		break;
+	case STATUS_ERR:
+		die("%s", message);
+		break;
+	default:
+		break;
+	}
+}
+
+/* Request a real-time scheduling policy for more consistent timing.
+ * Nothing is changed on a single-CPU machine, to avoid the risk of
+ * making it unresponsive, and the whole request is compiled out on
+ * OpenBSD. A failing sched_setscheduler() is only logged via DEBUGP.
+ */
+void set_scheduling_priority(void)
+{
+	const int online_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+
+	if (online_cpus < 0)
+		die_perror("sysconf(_SC_NPROCESSORS_ONLN)");
+	if (online_cpus <= 1)
+		return;
+
+#if !defined(__OpenBSD__)
+	{
+		/* SCHED_RR rather than SCHED_FIFO, so that real-time
+		 * threads of equal priority are round-robined. In
+		 * practice other real-time threads are unlikely.
+		 */
+		const int rt_policy = SCHED_RR;
+		/* The minimum priority, to be nice. */
+		const int lowest = sched_get_priority_min(rt_policy);
+		struct sched_param sp;
+
+		if (lowest < 0)
+			die_perror("sched_get_priority_min");
+
+		memset(&sp, 0, sizeof(sp));
+		sp.sched_priority = lowest;
+		if (sched_setscheduler(0, rt_policy, &sp) != 0)
+			DEBUGP("sched_setscheduler failed: %s\n",
+			       strerror(errno));
+	}
+#endif /* !defined(__OpenBSD__) */
+}
+
+/* To ensure timing that's as consistent as possible, pull all our
+ * pages to RAM and pin them there. Both current and future mappings
+ * are locked; failure is reported through die_perror().
+ */
+void lock_memory(void)
+{
+	/* The message names the call that actually failed (mlockall). */
+	if (mlockall(MCL_CURRENT | MCL_FUTURE))
+		die_perror("mlockall(MCL_CURRENT | MCL_FUTURE)");
+}
+
+/* Pick the wall time (in microseconds) at which the test should start,
+ * waiting for it where needed.
+ *
+ * On Linux the start is placed JIFFY_OFFSET_USECS after a jiffy edge,
+ * i.e. well inside a jiffy, to make results more reproducible. Tests
+ * usually schedule events on 1-millisecond boundaries relative to the
+ * start time; if the start sits too close to a jiffy edge, small CPU or
+ * scheduling variations make the kernel record times that are one jiffy
+ * too big or too small, which yields unexpected RTT and RTT variance
+ * values and therefore unexpected RTO and delayed ACK timer behavior.
+ *
+ * Jiffy edges are detected by spinning on times(2) and watching for its
+ * return value to change, which happens each time the jiffies clock has
+ * advanced another 10ms. TARGET_JIFFY_TICKS changes are awaited to
+ * reduce noise from warm-up effects; the time of the last one is the
+ * edge that is used. Fancier measuring and filtering has not been
+ * needed so far.
+ *
+ * On other platforms the current time is returned immediately.
+ */
+static s64 schedule_start_time_usecs(struct state *state)
+{
+#ifdef linux
+	const int TARGET_JIFFY_TICKS = 10;
+	const int JIFFY_OFFSET_USECS = 250;
+	s64 edge_usecs = 0;
+	clock_t prev_jiffies = times(NULL);
+	int ticks_seen;
+
+	for (ticks_seen = 0; ticks_seen < TARGET_JIFFY_TICKS; ) {
+		const clock_t cur_jiffies = times(NULL);
+
+		if (cur_jiffies != prev_jiffies) {
+			edge_usecs = now_usecs(state);
+			ticks_seen++;
+			prev_jiffies = cur_jiffies;
+		}
+	}
+	return edge_usecs + JIFFY_OFFSET_USECS;
+#else
+	return now_usecs(state);
+#endif
+}
+
+/* Execute the script's cleanup command, if there is one and the init
+ * command has been executed. The cleanup command runs whether the test
+ * passes or fails, which makes it a good place to undo things such as
+ * sysctl settings the script changed. Returns STATUS_OK when nothing
+ * had to run or the command succeeded, and STATUS_ERR (after printing
+ * the error to stderr) when the command failed.
+ */
+int run_cleanup_command(void)
+{
+	char *message = NULL;
+
+	if (cleanup_cmd == NULL || !init_cmd_exed)
+		return STATUS_OK;
+
+	if (!safe_system(cleanup_cmd, &message))
+		return STATUS_OK;
+
+	fprintf(stderr,
+		"%s: error executing cleanup command: %s\n",
+		script_path, message);
+	free(message);
+	return STATUS_ERR;
+}
+
+/* Top-level entry point for executing a parsed test script.
+ *
+ * Steps, in order: raise the scheduling priority and lock memory;
+ * create the netdev matching the configured mode (wire client, shared
+ * object, or local) and the run-time state; run the script's init
+ * command, if any; choose the live start time; then interpret events
+ * one by one until get_next_event() reports the end of the script.
+ * Afterwards the cleanup command and the collected code are executed
+ * and the state is freed.
+ *
+ * Failures are fatal: they are reported through die() or by printing to
+ * stderr and calling exit(EXIT_FAILURE).
+ */
+void run_script(struct config *config, struct script *script)
+{
+	char *error = NULL;
+	struct state *state = NULL;
+	struct netdev *netdev = NULL;
+	struct event *event = NULL;
+
+	DEBUGP("run_script: running script\n");
+
+	set_scheduling_priority();
+	lock_memory();
+
+	/* This interpreter loop runs for local mode or wire client mode. */
+	assert(!config->is_wire_server);
+
+	script_path = config->script_path;
+
+	/* How we use the network is of course a little different in
+	 * each of the cases....
+	 */
+	if (config->is_wire_client)
+		netdev = wire_client_netdev_new(config);
+	else if (config->so_filename)
+		netdev = so_netdev_new(config);
+	else
+		netdev = local_netdev_new(config);
+
+	state = state_new(config, script, netdev);
+
+	if (config->is_wire_client) {
+		state->wire_client = wire_client_new();
+		wire_client_init(state->wire_client, config, script);
+	}
+
+	if (config->so_filename) {
+		state->so_instance = so_instance_new();
+		so_instance_init(state->so_instance, config, script, state);
+	}
+
+	init_cmd_exed = false;
+	if (script->init_command != NULL) {
+		if (safe_system(script->init_command->command_line,
+				&error)) {
+			/* Print the failure before exiting; formatting it
+			 * into a string that is immediately freed would
+			 * leave the user with no diagnostic at all.
+			 */
+			fprintf(stderr,
+				"%s: error executing init command: %s\n",
+				config->script_path, error);
+			free(error);
+			exit(EXIT_FAILURE);
+		}
+		init_cmd_exed = true;
+	}
+
+	signal(SIGPIPE, SIG_IGN);	/* ignore EPIPE */
+
+	state->live_start_time_usecs = schedule_start_time_usecs(state);
+	DEBUGP("live_start_time_usecs is %lld\n",
+	       state->live_start_time_usecs);
+
+	if (state->wire_client != NULL)
+		wire_client_send_client_starting(state->wire_client);
+
+	while (1) {
+		if (get_next_event(state, &error))
+			die("%s", error);
+		event = state->event;
+		if (event == NULL)
+			break;
+
+		if (state->wire_client != NULL)
+			wire_client_next_event(state->wire_client, event);
+
+		/* In wire mode, we adjust relative times after
+		 * getting notification that previous packet events
+		 * have completed, if any.
+		 */
+		adjust_relative_event_times(state, event);
+
+		switch (event->type) {
+		case PACKET_EVENT:
+			/* For wire clients, the server handles packets. */
+			if (!config->is_wire_client) {
+				run_local_packet_event(state, event,
+						       event->event.packet);
+			}
+			break;
+		case SYSCALL_EVENT:
+			run_system_call_event(state, event,
+					      event->event.syscall);
+			break;
+		case COMMAND_EVENT:
+			run_command_event(state, event,
+					  event->event.command);
+			break;
+		case CODE_EVENT:
+			run_code_event(state, event,
+				       event->event.code->text);
+			break;
+		case INVALID_EVENT:
+		case NUM_EVENT_TYPES:
+			assert(!"bogus type");
+			break;
+		/* We omit default case so compiler catches missing values. */
+		}
+		state->num_events++;
+	}
+
+	/* Wait for any outstanding packet events we requested on the server. */
+	if (state->wire_client != NULL)
+		wire_client_next_event(state->wire_client, NULL);
+
+	if (run_cleanup_command() == STATUS_ERR)
+		exit(EXIT_FAILURE);
+
+	if (code_execute(state->code, &error)) {
+		die("%s: error executing code: %s\n",
+		    state->config->script_path, error);
+		free(error);
+	}
+
+	state_free(state);
+
+	DEBUGP("run_script: done running\n");
+}
+
+/* Parse a script and finalize the configuration.
+ *
+ * The script and config are reset to their defaults first. The script
+ * text comes from script_buffer when it is non-NULL, otherwise from the
+ * file at script_path. config->script_path receives its own strdup()'d
+ * copy of script_path, which must not be NULL. Returns the result of
+ * parse_script().
+ */
+int parse_script_and_set_config(int argc, char *argv[],
+				struct config *config,
+				struct script *script,
+				const char *script_path,
+				const char *script_buffer)
+{
+	struct invocation invocation = {
+		.argc = argc,
+		.argv = argv,
+		.config = config,
+		.script = script,
+	};
+
+	/* Validate the path before it is handed to a "%s" format. */
+	assert(script_path != NULL);
+	DEBUGP("parse_and_run_script: %s\n", script_path);
+
+	init_script(script);
+
+	set_default_config(config);
+	config->script_path = strdup(script_path);
+	/* The path is printed in later error messages, so do not carry
+	 * on with a NULL copy.
+	 */
+	if (config->script_path == NULL)
+		die_perror("strdup");
+
+	if (script_buffer != NULL)
+		copy_script(script_buffer, script);
+	else
+		read_script(script_path, script);
+
+	return parse_script(config, script, &invocation);
+}
diff --git a/test/packetdrill/run.h b/test/packetdrill/run.h
new file mode 100644
index 0000000..da56454
--- /dev/null
+++ b/test/packetdrill/run.h
@@ -0,0 +1,197 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Interface for the test script execution module.
+ *
+ * Threading And Locking Model
+ *
+ * There are two threads in our process:
+ *
+ * 1) main thread: this is the thread that invokes main() and
+ * does most of the work of test execution.
+ *
+ * 2) blocking system call thread: this is the thread that
+ * executes blocking system calls.
+ *
+ * To keep things as simple as possible, there is a single global
+ * mutex, state->mutex, which protects all global data (data that is
+ * not purely local to a function).
+ *
+ * The main thread holds the global mutex for almost the entire
+ * duration of a test run. It unlocks the mutex only for:
+ *
+ * o sleeping while waiting for the start time of the next event
+ * o waiting for the system call thread to block on a system call
+ * o waiting for the system call thread to exit
+ *
+ * The system call thread runs briefly, only to execute blocking
+ * system calls, and holds the global mutex for the entire duration it
+ * is running, from interpreting system call arguments to processing
+ * system call outputs. It unlocks the mutex only for:
+ *
+ * o sleeping while waiting for the start time of the system call
+ * o the actual function call to invoke the blocking system call itself
+ */
+
+#ifndef __RUN_H__
+#define __RUN_H__
+
+#include "types.h"
+
+#include <netinet/in.h>
+#include <pthread.h>
+#include <stdlib.h>
+#include <sys/socket.h>
+#include "code.h"
+#include "config.h"
+#include "fd_state.h"
+#include "netdev.h"
+#include "run_packet.h"
+#include "run_system_call.h"
+#include "script.h"
+#include "socket.h"
+#include "so_testing.h"
+#include "wire_client.h"
+
+/* Public top-level entry point for executing a test script */
+extern void run_script(struct config *config,
+ struct script *script);
+
+/* Public entry-point to parse a script and finalize config. If the
+ * script_buffer is provided, parse that. Otherwise, read the file
+ * with the given path, parse that.
+ */
+extern int parse_script_and_set_config(int argc, char *argv[],
+ struct config *config,
+ struct script *script,
+ const char *script_path,
+ const char *script_buffer);
+
+/* Private implementation details follow below... */
+
+/* All the runtime state for a test.
+ *
+ * Instances are created zero-filled by state_new(), which also returns
+ * with the mutex locked, and are torn down by state_free(). The
+ * wire_client and so_instance members stay NULL unless run_script()
+ * sets them up for the corresponding mode.
+ */
+struct state {
+ pthread_mutex_t mutex; /* global lock for all global state */
+ struct config *config; /* test configuration */
+ struct netdev *netdev; /* for sending/receiving TCP packets */
+ struct packets *packets; /* for processing packets */
+ struct syscalls *syscalls; /* for running system calls */
+ struct fd_state *fds; /* list of all file descriptors */
+ struct socket *socket_under_test; /* socket handling packets */
+ struct script *script; /* script we're running */
+ struct event *event; /* the current event */
+ struct event *last_event; /* previous event */
+ struct code_state *code; /* for running post-processing code */
+ struct wire_client *wire_client; /* for on-the-wire tests */
+ struct so_instance *so_instance; /* for SO testing */
+ s64 script_start_time_usecs; /* time of first event in script */
+ s64 script_last_time_usecs; /* time of previous event in script */
+ s64 live_start_time_usecs; /* time of first event in live test */
+ int num_events; /* events executed so far */
+};
+
+/* Allocate all run-time state for executing a test script. */
+extern struct state *state_new(struct config *config,
+ struct script *script,
+ struct netdev *netdev);
+
+/* Free all run-time state for a test. */
+void state_free(struct state *state);
+
+/* Add the file descriptor to the list of run-time file descriptors. */
+void state_add_fd(struct state *state, struct fd_state *fd);
+
+/* Acquire state->mutex, the single lock guarding all global state.
+ * NOTE(review): pthread_mutex_lock() returns its error number rather
+ * than setting errno, so the text die_perror() prints may not describe
+ * the actual failure -- confirm how die_perror() obtains its message.
+ */
+static inline void run_lock(struct state *state)
+{
+	const int rc = pthread_mutex_lock(&state->mutex);
+
+	if (rc)
+		die_perror("pthread_mutex_lock");
+}
+
+/* Release state->mutex, the single lock guarding all global state.
+ * NOTE(review): pthread_mutex_unlock() returns its error number rather
+ * than setting errno, so the text die_perror() prints may not describe
+ * the actual failure -- confirm how die_perror() obtains its message.
+ */
+static inline void run_unlock(struct state *state)
+{
+	const int rc = pthread_mutex_unlock(&state->mutex);
+
+	if (rc)
+		die_perror("pthread_mutex_unlock");
+}
+
+/* Get the wall clock time of day in microseconds. */
+extern s64 now_usecs(struct state *state);
+
+/* Translate a script time into the corresponding live wall clock time:
+ * the distance from the script's start time is added to the live start
+ * time.
+ */
+static inline s64 script_time_to_live_time_usecs(struct state *state,
+						 s64 script_time_usecs)
+{
+	const s64 since_start =
+		script_time_usecs - state->script_start_time_usecs;
+
+	return state->live_start_time_usecs + since_start;
+}
+
+/* Translate a live wall clock time into the corresponding script time:
+ * the distance from the live start time is added to the script's start
+ * time.
+ */
+static inline s64 live_time_to_script_time_usecs(struct state *state,
+						 s64 live_time_usecs)
+{
+	const s64 since_start =
+		live_time_usecs - state->live_start_time_usecs;
+
+	return state->script_start_time_usecs + since_start;
+}
+
+/*
+ * See if something that happened at the given actual live wall time
+ * in microseconds happened reasonably close to the time at which we
+ * wanted it to happen in the script. verify_time compares the
+ * given script and live times and returns STATUS_OK on success or on
+ * failure returns STATUS_ERR and fills in *error using the given
+ * description. The check_event_time variant is a shortcut
+ * for the common case: it looks at the current event and on failure
+ * it prints the error message to stderr and exits with an error
+ * status. For time ranges the end time is specified in script_usecs_end.
+ */
+extern int verify_time(struct state *state, enum event_time_t time_type,
+ s64 script_usecs, s64 script_usecs_end,
+ s64 live_usecs, const char *description, char **error);
+extern void check_event_time(struct state *state, s64 live_usecs);
+
+/* Set the start (and end time, if applicable) for the event if it
+ * uses wildcard or relative timing.
+ */
+extern void adjust_relative_event_times(struct state *state,
+ struct event *event);
+
+/*
+ * Sleep and/or spin until the time at which we want the current event
+ * to happen.
+ */
+extern void wait_for_event(struct state *state);
+
+/* Advance the interpreter state to the next event. */
+extern int get_next_event(struct state *state, char **error);
+
+/* Set a higher priority for ourselves, to reduce test timing noise. */
+extern void set_scheduling_priority(void);
+
+/* Try to pin our pages into RAM. */
+extern void lock_memory(void);
+
+/* Run final command we always execute at end of script, to clean up. */
+extern int run_cleanup_command(void);
+
+#endif /* __RUN_H__ */
diff --git a/test/packetdrill/run_command.c b/test/packetdrill/run_command.c
new file mode 100644
index 0000000..a55e596
--- /dev/null
+++ b/test/packetdrill/run_command.c
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * A module to execute a command from a test script.
+ */
+
+#include "run_command.h"
+
+#include <errno.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include "logging.h"
+#include "run.h"
+#include "script.h"
+#include "system.h"
+
+/* Execute a command event: wait until the event's scheduled time, then
+ * run its command line through safe_system(). A failing command is
+ * reported through die() with the script path and line number.
+ */
+void run_command_event(
+	struct state *state, struct event *event, struct command_spec *command)
+{
+	char *message = NULL;
+
+	DEBUGP("%d: command: `%s`\n", event->line_number,
+	       command->command_line);
+
+	/* Hold off until the time the script asks for. */
+	wait_for_event(state);
+
+	if (safe_system(command->command_line, &message) == 0)
+		return;
+
+	die("%s:%d: error executing `%s` command: %s\n",
+	    state->config->script_path, event->line_number,
+	    command->command_line, message);
+	free(message);
+}
diff --git a/test/packetdrill/run_command.h b/test/packetdrill/run_command.h
new file mode 100644
index 0000000..8839b06
--- /dev/null
+++ b/test/packetdrill/run_command.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Interface for a module to execute a command from a test script.
+ */
+
+#ifndef __RUN_COMMAND_H__
+#define __RUN_COMMAND_H__
+
+#include "types.h"
+
+#include "run.h"
+#include "script.h"
+
+/* Execute the command. */
+extern void run_command_event(struct state *state,
+ struct event *event,
+ struct command_spec *command);
+
+#endif /* __RUN_COMMAND_H__ */
diff --git a/test/packetdrill/run_packet.c b/test/packetdrill/run_packet.c
new file mode 100644
index 0000000..9c45d5a
--- /dev/null
+++ b/test/packetdrill/run_packet.c
@@ -0,0 +1,1934 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * A module to execute a packet command from a test script.
+ */
+
+#include "run_packet.h"
+
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#include "checksum.h"
+#include "gre.h"
+#include "logging.h"
+#include "netdev.h"
+#include "packet.h"
+#include "packet_checksum.h"
+#include "packet_to_string.h"
+#include "run.h"
+#include "script.h"
+#include "tcp.h"
+#include "tcp_options_iterator.h"
+#include "tcp_options_to_string.h"
+#include "tcp_packet.h"
+#include "wrap.h"
+
+/* Pick a fresh port for one side of a test connection. Using a
+ * dynamically-chosen, unique 4-tuple per test sidesteps trouble with
+ * TIME_WAIT, FIN_WAIT1, and FIN_WAIT2. The port is whatever
+ * wrap_bind_listen() reports after being asked for port 0. The socket
+ * fd is intentionally never closed: it stays open for the lifetime of
+ * the process so that a later test cannot be handed the same port.
+ */
+static u16 ephemeral_port(enum ip_version_t ip_version)
+{
+	const int listen_fd = wrap_socket(ip_version, SOCK_STREAM);
+	const u16 port = wrap_bind_listen(listen_fd, ip_version, 0);
+
+	return port;
+}
+
+/* Hand out the next ephemeral port. The common case is a test that
+ * needs exactly one remote port, so one port is expected to have been
+ * pre-allocated and cached in state->packets->next_ephemeral_port
+ * before the test started; consuming it here avoids the system calls
+ * of ephemeral_port() right before an incoming SYN is injected. A
+ * negative cached value means "nothing cached", in which case a port
+ * is allocated on the spot.
+ */
+static u16 next_ephemeral_port(struct state *state)
+{
+	/* Cache is empty: allocate a port now. */
+	if (state->packets->next_ephemeral_port < 0)
+		return ephemeral_port(state->config->ip_version);
+
+	int cached = state->packets->next_ephemeral_port;
+
+	assert(cached <= 0xffff);
+	state->packets->next_ephemeral_port = -1;	/* consume it */
+	return cached;
+}
+
+/* Add a dump of the given packet to the given error message.
+ *
+ * If the packet is non-empty (packet->ip_bytes != 0), *error is
+ * replaced by a newly allocated string consisting of the original
+ * *error, a newline, 'type', the time stamp 'time_usecs' (printed in
+ * seconds) and a dump of the packet in the given 'format'. Any error
+ * reported by packet_to_string() is appended on its own line. The
+ * original *error is then freed, so it must be heap-allocated.
+ *
+ * If building the new message fails, the original *error is left in
+ * place untouched: asprintf() leaves its output pointer undefined on
+ * failure, so the caller must not be handed that value.
+ */
+static void add_packet_dump(char **error, const char *type,
+			    struct packet *packet, s64 time_usecs,
+			    enum dump_format_t format)
+{
+	/* Nothing to dump for an empty packet. */
+	if (packet->ip_bytes == 0)
+		return;
+
+	char *old_error = *error;
+	char *dump = NULL, *dump_error = NULL;
+
+	packet_to_string(packet, format, &dump, &dump_error);
+
+	if (asprintf(error, "%s\n%s packet: %9.6f %s%s%s",
+		     old_error, type, usecs_to_secs(time_usecs), dump,
+		     dump_error ? "\n" : "",
+		     dump_error ? dump_error : "") < 0) {
+		/* Keep the message we already had rather than garbage. */
+		*error = old_error;
+	} else {
+		free(old_error);
+	}
+
+	free(dump);
+	free(dump_error);
+}
+
+/* When state->config->verbose is set, print a DUMP_SHORT rendering of
+ * 'live_packet' to stdout, prefixed by 'type' and the time stamp
+ * 'time_usecs' (printed in seconds). Any error text produced by
+ * packet_to_string() is printed on a following line. Does nothing
+ * for non-verbose runs.
+ */
+static void verbose_packet_dump(struct state *state, const char *type,
+				struct packet *live_packet, s64 time_usecs)
+{
+	char *text = NULL, *text_error = NULL;
+
+	if (!state->config->verbose)
+		return;
+
+	packet_to_string(live_packet, DUMP_SHORT, &text, &text_error);
+
+	printf("%s packet: %9.6f %s%s%s\n",
+	       type, usecs_to_secs(time_usecs), text,
+	       text_error ? "\n" : "",
+	       text_error ? text_error : "");
+
+	free(text);
+	free(text_error);
+}
+
+/* Decide whether a live packet belongs to the socket under test and,
+ * if so, in which direction it travels.
+ *
+ * A packet matches when its address tuple equals the socket's live
+ * inbound (checked first) or live outbound tuple. For an ICMP/ICMPv6
+ * socket receiving an ICMP/ICMPv6 packet, matching source and
+ * destination IP addresses are enough.
+ *
+ * Returns the socket and stores the direction in *direction on a
+ * match; returns NULL (leaving *direction untouched) when there is no
+ * socket under test or the packet does not match.
+ */
+static struct socket *find_socket_for_live_packet(
+	struct state *state, const struct packet *packet,
+	enum direction_t *direction)
+{
+	struct socket *sut = state->socket_under_test;
+	struct tuple pkt, inbound, outbound;
+	bool icmp_ok, hit;
+
+	if (sut == NULL)
+		return NULL;
+
+	icmp_ok = (sut->protocol == IPPROTO_ICMP && packet->icmpv4 != NULL) ||
+		  (sut->protocol == IPPROTO_ICMPV6 && packet->icmpv6 != NULL);
+	get_packet_tuple(packet, &pkt);
+
+	/* Inbound to the socket under test? */
+	socket_get_inbound(&sut->live, &inbound);
+	hit = is_equal_tuple(&pkt, &inbound);
+	if (!hit && icmp_ok)
+		hit = is_equal_ip(&pkt.dst.ip, &inbound.dst.ip) &&
+		      is_equal_ip(&pkt.src.ip, &inbound.src.ip);
+	if (hit) {
+		*direction = DIRECTION_INBOUND;
+		DEBUGP("inbound live packet, socket in state %d\n",
+		       sut->state);
+		return sut;
+	}
+
+	/* Outbound from the socket under test? */
+	socket_get_outbound(&sut->live, &outbound);
+	hit = is_equal_tuple(&pkt, &outbound);
+	if (!hit && icmp_ok)
+		hit = is_equal_ip(&pkt.dst.ip, &outbound.dst.ip) &&
+		      is_equal_ip(&pkt.src.ip, &outbound.src.ip);
+	if (hit) {
+		*direction = DIRECTION_OUTBOUND;
+		DEBUGP("outbound live packet, socket in state %d\n",
+		       sut->state);
+		return sut;
+	}
+
+	return NULL;
+}
+
+/* Decide whether an inbound script packet should be taken as a new
+ * passive connection and, if so, create the child socket for it.
+ *
+ * Only one socket is tested at a time, so no address matching is
+ * done. The packet is accepted when it is inbound and either
+ *   - there is no socket under test yet (always the case on a wire
+ *     server, which never sees the system calls; in local mode this
+ *     permits tests that deliberately have no matching socket), or
+ *   - in local mode, the socket under test is passively listening.
+ *
+ * On acceptance a new socket is created, made the socket under test,
+ * and filled in with script values taken from the packet and live
+ * values taken from the configuration plus a fresh ephemeral remote
+ * port. The packet's TCP header is read unconditionally, so callers
+ * must only pass TCP packets. Returns the new socket, or NULL if the
+ * packet was not accepted.
+ */
+static struct socket *handle_listen_for_script_packet(
+	struct state *state, const struct packet *packet,
+	enum direction_t direction)
+{
+	struct config *config = state->config;
+	struct socket *socket = state->socket_under_test;
+
+	if (direction != DIRECTION_INBOUND)
+		return NULL;
+
+	if (socket != NULL) {
+		/* A wire server must not know any socket yet. */
+		if (config->is_wire_server)
+			return NULL;
+		/* Locally, a known socket has to be listening. */
+		if (socket->state != SOCKET_PASSIVE_LISTENING)
+			return NULL;
+	}
+
+	/* All later packets in the script go to this child socket. */
+	socket = socket_new(state);
+	state->socket_under_test = socket;
+	assert(socket->state == SOCKET_INIT);
+	socket->state = SOCKET_PASSIVE_PACKET_RECEIVED;
+	socket->address_family = packet_address_family(packet);
+	socket->protocol = packet_ip_protocol(packet);
+
+	/* Script-side view: straight from the script packet. */
+	struct tuple script_tuple;
+	get_packet_tuple(packet, &script_tuple);
+	socket->script.remote = script_tuple.src;
+	socket->script.local = script_tuple.dst;
+	socket->script.remote_isn = ntohl(packet->tcp->seq);
+	socket->fd.script_fd = -1;
+
+	/* Live-side view: from the config, plus a fresh remote port. */
+	socket->live.remote.ip = config->live_remote_ip;
+	socket->live.remote.port = htons(next_ephemeral_port(state));
+	socket->live.local.ip = config->live_local_ip;
+	socket->live.local.port = htons(config->live_bind_port);
+	socket->live.remote_isn = ntohl(packet->tcp->seq);
+	socket->fd.live_fd = -1;
+
+	if (DEBUG_LOGGING) {
+		char local_text[ADDR_STR_LEN];
+		char remote_text[ADDR_STR_LEN];
+
+		DEBUGP("live: local: %s.%d\n",
+		       ip_to_string(&socket->live.local.ip, local_text),
+		       ntohs(socket->live.local.port));
+		DEBUGP("live: remote: %s.%d\n",
+		       ip_to_string(&socket->live.remote.ip, remote_text),
+		       ntohs(socket->live.remote.port));
+		DEBUGP("live: ISN: %u\n", socket->live.remote_isn);
+	}
+
+	return socket;
+}
+
+/* Decide whether an outbound script SYN (SYN set, ACK clear) is the
+ * one the socket under test would emit for an active connect.
+ *
+ * Only one socket is tested at a time, so no address matching is
+ * done. On a wire server, which never sees the system calls, there
+ * must be no socket under test yet and one is created here. In local
+ * mode the socket under test must exist and be in the
+ * SOCKET_ACTIVE_CONNECTING state.
+ *
+ * On a match the socket moves to SOCKET_ACTIVE_SYN_SENT, its script
+ * addresses and local ISN are recorded from the packet, and the
+ * socket is returned; otherwise NULL is returned.
+ */
+static struct socket *handle_connect_for_script_packet(
+	struct state *state, const struct packet *packet,
+	enum direction_t direction)
+{
+	struct config *config = state->config;
+	struct socket *socket = state->socket_under_test;
+
+	/* Must be an outbound pure SYN. */
+	if (direction != DIRECTION_OUTBOUND ||
+	    !packet->tcp->syn || packet->tcp->ack)
+		return NULL;
+
+	if (config->is_wire_server) {
+		if (socket != NULL)
+			return NULL;
+	} else if (socket == NULL ||
+		   socket->state != SOCKET_ACTIVE_CONNECTING) {
+		return NULL;
+	}
+
+	if (socket == NULL) {
+		/* Wire server: make the socket that all further
+		 * script packets will be mapped to.
+		 */
+		socket = socket_new(state);
+		state->socket_under_test = socket;
+		assert(socket->state == SOCKET_INIT);
+		socket->address_family = packet_address_family(packet);
+		socket->protocol = packet_ip_protocol(packet);
+
+		socket->fd.script_fd = -1;
+
+		socket->live.remote.ip = config->live_remote_ip;
+		socket->live.remote.port = htons(config->live_connect_port);
+		socket->fd.live_fd = -1;
+	}
+
+	/* Record what the script says about this connection. */
+	struct tuple script_tuple;
+	get_packet_tuple(packet, &script_tuple);
+	socket->state = SOCKET_ACTIVE_SYN_SENT;
+	socket->script.remote = script_tuple.dst;
+	socket->script.local = script_tuple.src;
+	socket->script.local_isn = ntohl(packet->tcp->seq);
+
+	return socket;
+}
+
+/* Look for a connecting socket that would emit this outgoing live
+ * packet.
+ *
+ * *direction is first set to DIRECTION_INVALID. The socket under test
+ * matches when
+ *   - the packet is UDP or ICMP/ICMPv6, the socket uses that protocol
+ *     and is in SOCKET_ACTIVE_CONNECTING, or
+ *   - the packet is a TCP SYN without ACK and the socket is a TCP
+ *     socket in SOCKET_ACTIVE_SYN_SENT,
+ * and the packet's destination IP equals the socket's live remote IP
+ * (for UDP and TCP the destination port must match as well).
+ *
+ * On a match *direction becomes DIRECTION_OUTBOUND, the socket's live
+ * local address/port (and, for TCP, live local ISN) are learned from
+ * the packet, and the socket is returned; otherwise NULL is returned.
+ */
+static struct socket *find_connect_for_live_packet(
+	struct state *state, struct packet *packet,
+	enum direction_t *direction)
+{
+	struct tuple pkt;
+	get_packet_tuple(packet, &pkt);
+
+	*direction = DIRECTION_INVALID;
+	struct socket *sut = state->socket_under_test;
+	if (sut == NULL)
+		return NULL;
+
+	const bool connecting = (sut->state == SOCKET_ACTIVE_CONNECTING);
+	const bool udp_hit =
+		packet->udp && sut->protocol == IPPROTO_UDP && connecting;
+	const bool icmp_hit =
+		((packet->icmpv4 && sut->protocol == IPPROTO_ICMP) ||
+		 (packet->icmpv6 && sut->protocol == IPPROTO_ICMPV6)) &&
+		connecting;
+	const bool tcp_hit =
+		packet->tcp && packet->tcp->syn && !packet->tcp->ack &&
+		sut->protocol == IPPROTO_TCP &&
+		sut->state == SOCKET_ACTIVE_SYN_SENT;
+
+	if (!(udp_hit || tcp_hit || icmp_hit))
+		return NULL;
+
+	/* The remote IP always has to agree; ICMP has no port to check. */
+	if (!is_equal_ip(&pkt.dst.ip, &sut->live.remote.ip))
+		return NULL;
+	if (!icmp_hit &&
+	    !is_equal_port(pkt.dst.port, sut->live.remote.port))
+		return NULL;
+
+	*direction = DIRECTION_OUTBOUND;
+
+	/* Learn the local end of this actively initiated connection
+	 * (for which a connect() call has been seen) from the packet.
+	 */
+	sut->live.local.ip = pkt.src.ip;
+	sut->live.local.port = pkt.src.port;
+	if (packet->tcp)
+		sut->live.local_isn = ntohl(packet->tcp->seq);
+
+	return sut;
+}
+
+/* Look up the live TCP timestamp value recorded for a scripted
+ * outbound timestamp value. On a hit the live value is stored in
+ * *live_timestamp and STATUS_OK is returned; STATUS_ERR means the
+ * socket's ts_val_map has no entry for 'script_timestamp'.
+ */
+static int get_outbound_ts_val_mapping(
+	struct socket *socket, u32 script_timestamp, u32 *live_timestamp)
+{
+	DEBUGP("get_outbound_ts_val_mapping\n");
+	DEBUGP("ts_val_mapping %u -> ?\n", ntohl(script_timestamp));
+
+	return hash_map_get(socket->ts_val_map,
+			    script_timestamp, live_timestamp)
+		? STATUS_OK : STATUS_ERR;
+}
+
+/* Store script->live mapping for outbound TCP timestamp value.
+ * The pair is recorded in socket->ts_val_map, keyed by the script
+ * value, which is where get_outbound_ts_val_mapping() looks it up.
+ * NOTE(review): both values are logged through ntohl(), which
+ * suggests they are kept in network byte order -- confirm.
+ */
+static void set_outbound_ts_val_mapping(
+ struct socket *socket, u32 script_timestamp, u32 live_timestamp)
+{
+ DEBUGP("set_outbound_ts_val_mapping\n");
+ DEBUGP("ts_val_mapping %u -> %u\n",
+ ntohl(script_timestamp), ntohl(live_timestamp));
+ hash_map_set(socket->ts_val_map,
+ script_timestamp, live_timestamp);
+}
+
+/* Locate the TCP timestamp option in a packet.
+ *
+ * Walks the packet's TCP options and points packet->tcp_ts_val and
+ * packet->tcp_ts_ecr at the value and echo-reply fields of the
+ * timestamp option; both are left NULL when the packet carries no
+ * such option. If several timestamp options are present the last
+ * one wins.
+ *
+ * Returns STATUS_OK on success. Returns STATUS_ERR when *error is
+ * non-NULL after the walk, i.e. when iterating the options set an
+ * error message (*error is expected to be NULL on entry).
+ */
+static int find_tcp_timestamp(struct packet *packet, char **error)
+{
+	struct tcp_options_iterator iter;
+	struct tcp_option *option;
+
+	packet->tcp_ts_val = NULL;
+	packet->tcp_ts_ecr = NULL;
+
+	option = tcp_options_begin(packet, &iter);
+	while (option != NULL) {
+		if (option->kind == TCPOPT_TIMESTAMP) {
+			packet->tcp_ts_val =
+				(void *)&(option->data.time_stamp.val);
+			packet->tcp_ts_ecr =
+				(void *)&(option->data.time_stamp.ecr);
+		}
+		option = tcp_options_next(&iter, error);
+	}
+
+	if (*error)
+		return STATUS_ERR;
+	return STATUS_OK;
+}
+
+/* Shift every SACK block edge in a packet by 'ack_offset'.
+ *
+ * Used to translate SACK sequence numbers between live and script
+ * space: for each SACK option, the left and right edge of every block
+ * are read in network order, offset (with 32-bit wrap-around), and
+ * written back in network order.
+ *
+ * Returns STATUS_OK on success. Returns STATUS_ERR, with an error
+ * message set, if a SACK option has an invalid length or if iterating
+ * the options reported an error.
+ */
+static int offset_sack_blocks(struct packet *packet,
+			      u32 ack_offset, char **error)
+{
+	struct tcp_options_iterator iter;
+	struct tcp_option *option;
+
+	for (option = tcp_options_begin(packet, &iter); option != NULL;
+	     option = tcp_options_next(&iter, error)) {
+		int blocks = 0;
+		int b;
+
+		if (option->kind != TCPOPT_SACK)
+			continue;
+
+		if (num_sack_blocks(option->length, &blocks, error))
+			return STATUS_ERR;
+
+		for (b = 0; b < blocks; ++b) {
+			u32 left = ntohl(option->data.sack.block[b].left);
+
+			option->data.sack.block[b].left =
+				htonl(left + ack_offset);
+
+			u32 right = ntohl(option->data.sack.block[b].right);
+
+			option->data.sack.block[b].right =
+				htonl(right + ack_offset);
+		}
+	}
+
+	if (*error)
+		return STATUS_ERR;
+	return STATUS_OK;
+}
+
+
+/* Map the TCP sequence number echoed inside an ICMP packet from
+ * script space to live space, using the socket's local (non-SYN)
+ * script-to-live offset. This is needed because the Linux TCP layer
+ * ignores ICMP messages with bogus sequence numbers. Always returns
+ * STATUS_OK; 'error' is not used.
+ */
+static int map_inbound_icmp_tcp_packet(
+	struct socket *socket, struct packet *live_packet, char **error)
+{
+	u32 *echoed_seq = packet_echoed_tcp_seq(live_packet);
+	const u32 offset = local_seq_script_to_live_offset(socket, false);
+
+	*echoed_seq = htonl(ntohl(*echoed_seq) + offset);
+	return STATUS_OK;
+}
+
+/* UDP headers echoed by ICMP messages need no special rewriting.
+ * This is a deliberate no-op with the same signature as
+ * map_inbound_icmp_tcp_packet(): none of the arguments are used and
+ * the result is always STATUS_OK.
+ */
+static int map_inbound_icmp_udp_packet(
+ struct socket *socket, struct packet *live_packet, char **error)
+{
+ return STATUS_OK;
+}
+
+/* Map an inbound ICMP packet from script space to live space.
+ *
+ * Only the transport header echoed inside the ICMP payload can need
+ * rewriting: an echoed TCP header is handled by
+ * map_inbound_icmp_tcp_packet(), an echoed UDP header by
+ * map_inbound_icmp_udp_packet(). A packet with no echoed header
+ * needs nothing and yields STATUS_OK. Any other echoed protocol is a
+ * programming error: it trips an assertion, and STATUS_ERR is
+ * returned (without an error message) if assertions are compiled out.
+ */
+static int map_inbound_icmp_packet(
+	struct socket *socket, struct packet *live_packet, char **error)
+{
+	if (!live_packet->echoed_header)
+		return STATUS_OK;
+
+	if (packet_echoed_ip_protocol(live_packet) == IPPROTO_TCP) {
+		return map_inbound_icmp_tcp_packet(socket, live_packet,
+						   error);
+	}
+	if (packet_echoed_ip_protocol(live_packet) == IPPROTO_UDP) {
+		return map_inbound_icmp_udp_packet(socket, live_packet,
+						   error);
+	}
+
+	assert(!"unsupported layer 4 protocol echoed in ICMP packet");
+	return STATUS_ERR;
+}
+
+/* Rewrite the IP and TCP, UDP, or ICMP fields in 'live_packet', mapping
+ * inbound packet values (address 4-tuple and sequence numbers in seq,
+ * ACK, SACK blocks) from script values to live values, so that we can
+ * inject this packet into the kernel and have the kernel accept it
+ * for the given socket and process it.
+ *
+ * Steps, in order:
+ *   1. Set the packet's address tuple to the socket's live inbound
+ *      tuple, except that a non-zero TCP/UDP source or destination
+ *      port already present in the packet is kept as is.
+ *   2. Copy the configured MSS into the packet.
+ *   3. ICMP/ICMPv6 packets are finished by map_inbound_icmp_packet();
+ *      packets without a TCP header are done at this point.
+ *   4. For TCP: offset seq, ack_seq (if ACK is set) and SACK blocks
+ *      into live space, then remap the timestamp echo reply.
+ *
+ * Returns STATUS_OK on success; on failure returns STATUS_ERR and
+ * sets error message.
+ */
+static int map_inbound_packet(
+	struct state *state,
+	struct socket *socket, struct packet *live_packet, char **error)
+{
+	DEBUGP("map_inbound_packet\n");
+
+	/* Remap packet to live values. */
+	struct tuple live_inbound;
+	u16 src_port = 0;
+	u16 dst_port = 0;
+
+	/* Remember any ports preset in the packet (network order). */
+	if (live_packet->tcp) {
+		src_port = live_packet->tcp->src_port;
+		dst_port = live_packet->tcp->dst_port;
+	} else if (live_packet->udp) {
+		src_port = live_packet->udp->src_port;
+		dst_port = live_packet->udp->dst_port;
+	}
+	socket_get_inbound(&socket->live, &live_inbound);
+	set_packet_tuple(live_packet, &live_inbound);
+	/* restore preset src and dst port */
+	if (live_packet->tcp) {
+		if (src_port)
+			live_packet->tcp->src_port = src_port;
+		if (dst_port)
+			live_packet->tcp->dst_port = dst_port;
+	} else if (live_packet->udp) {
+		if (src_port)
+			live_packet->udp->src_port = src_port;
+		if (dst_port)
+			live_packet->udp->dst_port = dst_port;
+	}
+
+	live_packet->mss = state->config->mss;
+
+	if ((live_packet->icmpv4 != NULL) || (live_packet->icmpv6 != NULL))
+		return map_inbound_icmp_packet(socket, live_packet, error);
+
+	/* If no TCP headers to rewrite, then we're done. */
+	if (live_packet->tcp == NULL)
+		return STATUS_OK;
+
+	/* Remap the sequence number from script sequence number to live. */
+	const bool is_syn = live_packet->tcp->syn;
+	const u32 seq_offset = remote_seq_script_to_live_offset(socket, is_syn);
+	live_packet->tcp->seq =
+		htonl(ntohl(live_packet->tcp->seq) + seq_offset);
+
+	/* Remap the ACK and SACKs from script sequence number to live. */
+	const u32 ack_offset = local_seq_script_to_live_offset(socket, is_syn);
+	if (live_packet->tcp->ack)
+		live_packet->tcp->ack_seq =
+			htonl(ntohl(live_packet->tcp->ack_seq) + ack_offset);
+	if (offset_sack_blocks(live_packet, ack_offset, error))
+		return STATUS_ERR;
+
+	/* Find where the timestamp echo reply is, so we can remap it below. */
+	if (find_tcp_timestamp(live_packet, error))
+		return STATUS_ERR;
+
+	/* Remap TCP timestamp echo reply from script value to a live
+	 * value. We say "a" rather than "the" live value because
+	 * there could be multiple live values corresponding to the
+	 * same script value if a live test replay flips to a new
+	 * jiffie in a spot where the script did not.
+	 */
+	if (live_packet->tcp->ack && (live_packet->tcp_ts_ecr != NULL)) {
+		u32 live_ts_ecr = 0;
+
+		if (get_outbound_ts_val_mapping(socket,
+						packet_tcp_ts_ecr(live_packet),
+						&live_ts_ecr) == STATUS_OK) {
+			/* TS ecr refers to an exact outbound TS val. */
+			packet_set_tcp_ts_ecr(live_packet, live_ts_ecr);
+		} else if (state->config->tcp_ts_ecr_scaled &&
+			   socket->found_first_tcp_ts) {
+			/* Interpolate to approximate TS ecr. By
+			 * default we verify that inbound TCP
+			 * timestamp ECR values reflect earlier
+			 * outbound TCP timestamp VAL values, since
+			 * this is what well-behaved stacks will do.
+			 */
+			live_ts_ecr = (packet_tcp_ts_ecr(live_packet) -
+				       socket->first_script_ts_val +
+				       socket->first_actual_ts_val);
+			packet_set_tcp_ts_ecr(live_packet, live_ts_ecr);
+		} else {
+			asprintf(error,
+				 "unable to infer live TS ecr for "
+				 "script TS ecr %u; "
+				 "no matching or preceding outbound TS val",
+				 packet_tcp_ts_ecr(live_packet));
+			return STATUS_ERR;
+		}
+	}
+
+	return STATUS_OK;
+}
+
+/* Translate the sequence-space fields of a TCP packet from live
+ * values to script values: the sequence number is shifted by the
+ * socket's local live-to-script offset, and the ACK number (when the
+ * ACK flag is set) and all SACK blocks by the remote live-to-script
+ * offset. Returns STATUS_OK on success; returns STATUS_ERR with an
+ * error message set if the SACK blocks could not be rewritten.
+ */
+static int tcp_convert_seq_number(struct socket *socket, struct packet *packet,
+				  char **error)
+{
+	const bool syn = packet->tcp->syn;
+	const u32 seq_shift = local_seq_live_to_script_offset(socket, syn);
+	const u32 ack_shift = remote_seq_live_to_script_offset(socket, syn);
+
+	/* Sequence number: live -> script. */
+	packet->tcp->seq = htonl(ntohl(packet->tcp->seq) + seq_shift);
+
+	/* ACK number and SACK blocks: live -> script. */
+	if (packet->tcp->ack) {
+		packet->tcp->ack_seq =
+			htonl(ntohl(packet->tcp->ack_seq) + ack_shift);
+	}
+
+	return offset_sack_blocks(packet, ack_shift, error)
+		? STATUS_ERR : STATUS_OK;
+}
+
+/* Transforms values in the 'actual_packet' by mapping outbound packet
+ * values in the sniffed 'live_packet' (address 4-tuple, sequence
+ * number in seq, timestamp value) from live values to script values
+ * in the space of 'script_packet'. This will allow us to compare a
+ * packet sent by the kernel to the packet expected by the script.
+ *
+ * What this function itself rewrites is the address tuple and, for
+ * TCP packets where both the script and the actual packet carry a
+ * timestamp option, the timestamp value; sequence-number rewriting
+ * is implemented separately in tcp_convert_seq_number().
+ *
+ * Side effects on 'socket': every script/actual timestamp value pair
+ * seen is recorded with set_outbound_ts_val_mapping(), and the first
+ * pair seen (values and echo replies) becomes the baseline used to
+ * translate later timestamp values.
+ *
+ * Asserts that 'live_packet' really carries the socket's live
+ * outbound addresses (IP addresses only for ICMP/ICMPv6 sockets).
+ * Returns STATUS_OK on success; returns STATUS_ERR with an error
+ * message set if the TCP options of either packet cannot be parsed.
+ */
+static int map_outbound_live_packet(
+ struct socket *socket,
+ struct packet *live_packet,
+ struct packet *actual_packet,
+ struct packet *script_packet,
+ char **error)
+{
+ DEBUGP("map_outbound_live_packet\n");
+
+ struct tuple live_packet_tuple, live_outbound, script_outbound;
+
+ /* Verify packet addresses are outbound and live for this socket. */
+ get_packet_tuple(live_packet, &live_packet_tuple);
+ socket_get_outbound(&socket->live, &live_outbound);
+ if (socket->protocol == IPPROTO_ICMP ||
+ socket->protocol == IPPROTO_ICMPV6)
+ assert(is_equal_ip(&live_packet_tuple.src.ip, &live_outbound.src.ip) &&
+ is_equal_ip(&live_packet_tuple.dst.ip, &live_outbound.dst.ip));
+ else
+ assert(is_equal_tuple(&live_packet_tuple, &live_outbound));
+
+ /* Rewrite 4-tuple to be outbound script values. */
+ socket_get_outbound(&socket->script, &script_outbound);
+ set_packet_tuple(actual_packet, &script_outbound);
+
+ /* If no TCP headers to rewrite, then we're done. */
+ if (live_packet->tcp == NULL)
+ return STATUS_OK;
+
+ /* Extract location of script and actual TCP timestamp values. */
+ if (find_tcp_timestamp(script_packet, error))
+ return STATUS_ERR;
+ if (find_tcp_timestamp(actual_packet, error))
+ return STATUS_ERR;
+ if ((script_packet->tcp_ts_val != NULL) &&
+ (actual_packet->tcp_ts_val != NULL)) {
+ u32 script_ts_val = packet_tcp_ts_val(script_packet);
+ u32 actual_ts_val = packet_tcp_ts_val(actual_packet);
+ u32 script_ts_ecr = packet_tcp_ts_ecr(script_packet);
+ u32 actual_ts_ecr = packet_tcp_ts_ecr(actual_packet);
+
+ /* Remember script->actual TS val mapping for later. */
+ set_outbound_ts_val_mapping(socket,
+ script_ts_val,
+ actual_ts_val);
+
+ /* Find baseline for socket's live->script TS val mapping. */
+ if (!socket->found_first_tcp_ts) {
+ socket->found_first_tcp_ts = true;
+ socket->first_script_ts_val = script_ts_val;
+ socket->first_actual_ts_val = actual_ts_val;
+ socket->first_script_ts_ecr = script_ts_ecr;
+ socket->first_actual_ts_ecr = actual_ts_ecr;
+ }
+
+ /* Rewrite TCP timestamp value to script space, so we
+ * can compare the script and actual outbound TCP
+ * timestamp val. The result is the script baseline plus
+ * the time elapsed, in live ticks, since the live baseline.
+ */
+ packet_set_tcp_ts_val(actual_packet,
+ socket->first_script_ts_val +
+ (actual_ts_val -
+ socket->first_actual_ts_val));
+ }
+
+ return STATUS_OK;
+}
+
+/* Verify checksums on an outbound live packet. At present only the
+ * IPv4 header checksum is checked (packets without an IPv4 header
+ * pass trivially). Returns STATUS_OK when the checksum is good;
+ * otherwise sets an error message and returns STATUS_ERR.
+ */
+static int verify_outbound_live_checksums(struct packet *live_packet,
+					  char **error)
+{
+	if (live_packet->ipv4 != NULL) {
+		/* A correct header checksums to zero. */
+		if (ipv4_checksum(live_packet->ipv4,
+				  ipv4_header_len(live_packet->ipv4))) {
+			asprintf(error, "bad outbound IP checksum");
+			return STATUS_ERR;
+		}
+	}
+
+	/* TODO(ncardwell): Verify TCP and UDP checksum. This is a little
+	 * subtle, due to TCP checksum offloading.
+	 */
+
+	return STATUS_OK;
+}
+
+/* Compare one header field of a live packet against the value the
+ * script expects. Returns STATUS_OK when they are equal; otherwise
+ * stores a human-readable message (showing both values in decimal and
+ * hex) in *error and returns STATUS_ERR.
+ */
+static int check_field(
+	const char *name,	/* human-readable name of the header field */
+	u32 expected,		/* value script hopes to see */
+	u32 actual,		/* actual value seen during test */
+	char **error)		/* human-readable error string on failure */
+{
+	if (actual == expected)
+		return STATUS_OK;
+
+	asprintf(error, "live packet field %s: "
+		 "expected: %u (0x%x) vs actual: %u (0x%x)",
+		 name, expected, expected, actual, actual);
+	return STATUS_ERR;
+}
+
+/* Verify the TOS byte of an outbound live packet against the script.
+ *
+ * What is compared depends on 'tos_chk':
+ *   TOS_CHECK_ECN - only the ECN bits of the actual byte are checked;
+ *                   a script value of ECN_ECT01 accepts either ECT(0)
+ *                   or ECT(1), any other script value must match the
+ *                   ECN bits exactly.
+ *   TOS_CHECK_TOS - the whole byte must match.
+ *   anything else - no check is made.
+ *
+ * Returns STATUS_OK on success; on mismatch sets an error message and
+ * returns STATUS_ERR.
+ */
+static int verify_outbound_live_tos(enum tos_chk_t tos_chk,
+				    u8 actual_tos_byte,
+				    u8 script_tos_byte,
+				    char **error)
+{
+	if (tos_chk == TOS_CHECK_ECN) {
+		const u8 ecn = actual_tos_byte & IP_ECN_MASK;
+
+		if (script_tos_byte != ECN_ECT01) {
+			if (check_field("ip_ecn", script_tos_byte, ecn,
+					error))
+				return STATUS_ERR;
+			return STATUS_OK;
+		}
+
+		/* Script accepts either ECT codepoint. */
+		if (ecn == IP_ECN_ECT0 || ecn == IP_ECN_ECT1)
+			return STATUS_OK;
+
+		asprintf(error, "live packet field ip_ecn: "
+			 "expected: 0x1 or 0x2 vs actual: 0x%x",
+			 ecn);
+		return STATUS_ERR;
+	}
+
+	if (tos_chk == TOS_CHECK_TOS) {
+		if (check_field("tos", script_tos_byte, actual_tos_byte,
+				error))
+			return STATUS_ERR;
+	}
+
+	return STATUS_OK;
+}
+
+
+/* Verify the IPv4 TTL / IPv6 hop limit of an outbound live packet.
+ * A script value of TTL_CHECK_NONE disables the check. Returns
+ * STATUS_OK on success; on mismatch sets an error message (field name
+ * "ttl") and returns STATUS_ERR.
+ */
+static int verify_outbound_live_ttl_or_hl(u8 actual_ttl_byte,
+					  u8 script_ttl_byte,
+					  char **error)
+{
+	if (script_ttl_byte == TTL_CHECK_NONE)
+		return STATUS_OK;
+
+	if (check_field("ttl", script_ttl_byte, actual_ttl_byte, error))
+		return STATUS_ERR;
+
+	return STATUS_OK;
+}
+
+/* Number of bytes to add to the script packet's expected length to
+ * account for the TCP options actually seen. When the script packet
+ * has FLAG_OPTIONS_NOCHECK set, that is the length of the actual
+ * packet's TCP options; otherwise it is zero.
+ */
+static int tcp_options_allowance(const struct packet *actual_packet,
+				 const struct packet *script_packet)
+{
+	return (script_packet->flags & FLAG_OPTIONS_NOCHECK)
+		? packet_tcp_options_len(actual_packet)
+		: 0;
+}
+
+/* Verify the IPv4 header at 'layer' of the actual packet against the
+ * script packet. Checked in order: version, protocol, header length,
+ * total length (only when 'strict'; the expected value includes the
+ * TCP options allowance), then the TOS byte and the TTL. Returns
+ * STATUS_OK if everything matches; otherwise stops at the first
+ * mismatch, sets an error message and returns STATUS_ERR.
+ */
+static int verify_ipv4(
+	const struct packet *actual_packet,
+	const struct packet *script_packet,
+	int layer, bool strict, char **error)
+{
+	const struct ipv4 *actual = actual_packet->headers[layer].h.ipv4;
+	const struct ipv4 *expected = script_packet->headers[layer].h.ipv4;
+
+	if (check_field("ipv4_version",
+			expected->version, actual->version, error))
+		return STATUS_ERR;
+	if (check_field("ipv4_protocol",
+			expected->protocol, actual->protocol, error))
+		return STATUS_ERR;
+	if (check_field("ipv4_header_length",
+			expected->ihl, actual->ihl, error))
+		return STATUS_ERR;
+	if (strict) {
+		const u32 want_len =
+			ntohs(expected->tot_len) +
+			tcp_options_allowance(actual_packet, script_packet);
+
+		if (check_field("ipv4_total_length",
+				want_len, ntohs(actual->tot_len), error))
+			return STATUS_ERR;
+	}
+
+	if (verify_outbound_live_tos(script_packet->tos_chk,
+				     ipv4_tos_byte(actual),
+				     ipv4_tos_byte(expected),
+				     error))
+		return STATUS_ERR;
+
+	if (verify_outbound_live_ttl_or_hl(ipv4_ttl_byte(actual),
+					   ipv4_ttl_byte(expected),
+					   error))
+		return STATUS_ERR;
+
+	return STATUS_OK;
+}
+
+/* Verify the IPv6 header at 'layer' of the actual packet against the
+ * script packet. Checked in order: version, payload length (only
+ * when 'strict'; the expected value includes the TCP options
+ * allowance), next header, then the traffic class byte and the hop
+ * limit. Returns STATUS_OK if everything matches; otherwise stops at
+ * the first mismatch, sets an error message and returns STATUS_ERR.
+ */
+static int verify_ipv6(
+	const struct packet *actual_packet,
+	const struct packet *script_packet,
+	int layer, bool strict, char **error)
+{
+	const struct ipv6 *actual = actual_packet->headers[layer].h.ipv6;
+	const struct ipv6 *expected = script_packet->headers[layer].h.ipv6;
+
+	if (check_field("ipv6_version",
+			expected->version, actual->version, error))
+		return STATUS_ERR;
+	if (strict) {
+		const u32 want_len =
+			ntohs(expected->payload_len) +
+			tcp_options_allowance(actual_packet, script_packet);
+
+		if (check_field("ipv6_payload_len",
+				want_len, ntohs(actual->payload_len), error))
+			return STATUS_ERR;
+	}
+	if (check_field("ipv6_next_header",
+			expected->next_header, actual->next_header, error))
+		return STATUS_ERR;
+
+	if (verify_outbound_live_tos(script_packet->tos_chk,
+				     ipv6_tos_byte(actual),
+				     ipv6_tos_byte(expected),
+				     error))
+		return STATUS_ERR;
+
+	if (verify_outbound_live_ttl_or_hl(ipv6_hoplimit_byte(actual),
+					   ipv6_hoplimit_byte(expected),
+					   error))
+		return STATUS_ERR;
+
+	return STATUS_OK;
+}
+
+/* Verify that required actual TCP header fields are as the script expected.
+ *
+ * The checks run as one short-circuit chain, so the first mismatch
+ * sets the error message (via check_field()) and ends verification
+ * with STATUS_ERR; STATUS_OK means every check passed.
+ *
+ * - The expected data offset is the script's plus the TCP options
+ *   allowance converted from bytes to 32-bit words.
+ * - FIN, PSH and CWR are only compared when 'strict' is set.
+ * - With 'strict' the sequence number must equal the script's; without
+ *   it, the actual sequence number must equal the script's sequence
+ *   number plus the script packet's payload length.
+ * - The window is not compared when the script packet has
+ *   FLAG_WIN_NOCHECK set.
+ */
+static int verify_tcp(
+ const struct packet *actual_packet,
+ const struct packet *script_packet,
+ int layer, bool strict, char **error)
+{
+ const struct tcp *actual_tcp = actual_packet->headers[layer].h.tcp;
+ const struct tcp *script_tcp = script_packet->headers[layer].h.tcp;
+ int script_payload_len = packet_payload_len(script_packet);
+
+ if (check_field("tcp_data_offset",
+ (script_tcp->doff +
+ tcp_options_allowance(actual_packet,
+ script_packet)/sizeof(u32)),
+ actual_tcp->doff, error) ||
+ (strict && check_field("tcp_fin",
+ script_tcp->fin,
+ actual_tcp->fin, error)) ||
+ check_field("tcp_syn",
+ script_tcp->syn,
+ actual_tcp->syn, error) ||
+ check_field("tcp_rst",
+ script_tcp->rst,
+ actual_tcp->rst, error) ||
+ (strict && check_field("tcp_psh",
+ script_tcp->psh,
+ actual_tcp->psh, error)) ||
+ check_field("tcp_ack",
+ script_tcp->ack,
+ actual_tcp->ack, error) ||
+ check_field("tcp_urg",
+ script_tcp->urg,
+ actual_tcp->urg, error) ||
+ check_field("tcp_ece",
+ script_tcp->ece,
+ actual_tcp->ece, error) ||
+ (strict && check_field("tcp_cwr",
+ script_tcp->cwr,
+ actual_tcp->cwr, error)) ||
+ check_field("tcp_reserved_bits",
+ script_tcp->res1,
+ actual_tcp->res1, error) ||
+ (strict && check_field("tcp_seq",
+ ntohl(script_tcp->seq),
+ ntohl(actual_tcp->seq), error)) ||
+ (!strict && check_field("tcp_seq",
+ ntohl(script_tcp->seq) + script_payload_len,
+ ntohl(actual_tcp->seq), error)) ||
+ check_field("tcp_ack_seq",
+ ntohl(script_tcp->ack_seq),
+ ntohl(actual_tcp->ack_seq), error) ||
+ (script_packet->flags & FLAG_WIN_NOCHECK ? STATUS_OK :
+ check_field("tcp_window",
+ ntohs(script_tcp->window),
+ ntohs(actual_tcp->window), error)) ||
+ check_field("tcp_urg_ptr",
+ ntohs(script_tcp->urg_ptr),
+ ntohs(actual_tcp->urg_ptr), error))
+ return STATUS_ERR;
+
+ return STATUS_OK;
+}
+
+/* Verify the UDP header at 'layer' of the actual packet against the
+ * script packet. The only field compared is the UDP length, and only
+ * when 'strict' is set (the script's udp_len is either filled in by
+ * packetdrill or specified by the user). Returns STATUS_OK on
+ * success; on mismatch sets an error message and returns STATUS_ERR.
+ */
+static int verify_udp(
+	const struct packet *actual_packet,
+	const struct packet *script_packet,
+	int layer, bool strict, char **error)
+{
+	const struct udp *actual = actual_packet->headers[layer].h.udp;
+	const struct udp *expected = script_packet->headers[layer].h.udp;
+
+	if (!strict)
+		return STATUS_OK;
+
+	if (check_field("udp_len",
+			ntohs(expected->len), ntohs(actual->len), error))
+		return STATUS_ERR;
+
+	return STATUS_OK;
+}
+
+/* Verify the GRE header at 'layer' of the actual packet against the
+ * script packet. Compared in order: header length, flags, protocol,
+ * and then each optional field the script header declares:
+ * checksum/offset (present when either the checksum or the routing
+ * flag is set), key, and sequence number. Returns STATUS_OK on
+ * success; on the first mismatch sets an error message and returns
+ * STATUS_ERR.
+ */
+static int verify_gre(
+	const struct packet *actual_packet,
+	const struct packet *script_packet,
+	int layer, bool strict, char **error)
+{
+	const struct gre *actual = actual_packet->headers[layer].h.gre;
+	const struct gre *expected = script_packet->headers[layer].h.gre;
+	int word = 0;	/* index into be32[] of the next optional field */
+
+	if (check_field("gre_len",
+			gre_len(expected), gre_len(actual), error))
+		return STATUS_ERR;
+
+	if (expected->flags != actual->flags) {
+		asprintf(error, "mismatch in GRE flags");
+		return STATUS_ERR;
+	}
+	if (expected->proto != actual->proto) {
+		asprintf(error, "mismatch in GRE proto");
+		return STATUS_ERR;
+	}
+
+	if (expected->has_checksum || expected->has_routing) {
+		/* Checksum and offset share the first optional word. */
+		if (expected->be16[0] != actual->be16[0]) {
+			asprintf(error, "mismatch in GRE sum");
+			return STATUS_ERR;
+		}
+		if (expected->be16[1] != actual->be16[1]) {
+			asprintf(error, "mismatch in GRE off");
+			return STATUS_ERR;
+		}
+		word++;
+	}
+
+	if (expected->has_key) {
+		if (expected->be32[word] != actual->be32[word]) {
+			asprintf(error, "mismatch in GRE key");
+			return STATUS_ERR;
+		}
+		word++;
+	}
+
+	if (expected->has_seq &&
+	    expected->be32[word] != actual->be32[word]) {
+		asprintf(error, "mismatch in GRE seq");
+		return STATUS_ERR;
+	}
+
+	return STATUS_OK;
+}
+
+/* Verify the MPLS label stack at 'layer' of the actual packet against
+ * the script packet. The two stacks must have the same size in bytes
+ * and every label stack entry must be bytewise identical. Returns
+ * STATUS_OK on success; otherwise sets an error message (naming the
+ * index of the first differing entry, if the sizes agree) and returns
+ * STATUS_ERR.
+ */
+static int verify_mpls(
+	const struct packet *actual_packet,
+	const struct packet *script_packet,
+	int layer, bool strict, char **error)
+{
+	const struct header *actual_hdr = &actual_packet->headers[layer];
+	const struct header *script_hdr = &script_packet->headers[layer];
+	const struct mpls *actual_stack = actual_hdr->h.mpls;
+	const struct mpls *script_stack = script_hdr->h.mpls;
+
+	if (script_hdr->header_bytes != actual_hdr->header_bytes) {
+		asprintf(error, "mismatch in MPLS label stack depth");
+		return STATUS_ERR;
+	}
+
+	const int depth = script_hdr->header_bytes / sizeof(struct mpls);
+	int n;
+
+	for (n = 0; n < depth; ++n) {
+		if (memcmp(&actual_stack[n], &script_stack[n],
+			   sizeof(script_stack[n])) != 0) {
+			asprintf(error, "mismatch in MPLS label %d", n);
+			return STATUS_ERR;
+		}
+	}
+
+	return STATUS_OK;
+}
+
+/* Verify the ICMPv4 header at 'layer' against the script: the type is
+ * compared first, then the code. Returns STATUS_ERR on the first mismatch,
+ * STATUS_OK otherwise.
+ */
+static int verify_icmpv4(
+	const struct packet *actual_packet,
+	const struct packet *script_packet,
+	int layer, bool strict, char **error)
+{
+	const struct icmpv4 *want = script_packet->headers[layer].h.icmpv4;
+	const struct icmpv4 *got = actual_packet->headers[layer].h.icmpv4;
+
+	/* || short-circuits, so the code is skipped once the type differs. */
+	if (check_field("icmp_type", want->type, got->type, error) ||
+	    check_field("icmp_code", want->code, got->code, error))
+		return STATUS_ERR;
+
+	return STATUS_OK;
+}
+
+/* Verify the ICMPv6 header at 'layer' against the script: the type is
+ * compared first, then the code. Returns STATUS_ERR on the first mismatch,
+ * STATUS_OK otherwise.
+ */
+static int verify_icmpv6(
+	const struct packet *actual_packet,
+	const struct packet *script_packet,
+	int layer, bool strict, char **error)
+{
+	const struct icmpv6 *want = script_packet->headers[layer].h.icmpv6;
+	const struct icmpv6 *got = actual_packet->headers[layer].h.icmpv6;
+
+	/* || short-circuits, so the code is skipped once the type differs. */
+	if (check_field("icmp_type", want->type, got->type, error) ||
+	    check_field("icmp_code", want->code, got->code, error))
+		return STATUS_ERR;
+
+	return STATUS_OK;
+}
+
+/* Signature shared by all per-header-type verification routines. */
+typedef int (*verifier_func)(
+	const struct packet *actual_packet,
+	const struct packet *script_packet,
+	int layer, bool strict, char **error);
+
+/* Verify the header at 'layer' by dispatching on the script header's type. */
+static int verify_header(
+	const struct packet *actual_packet,
+	const struct packet *script_packet,
+	int layer, bool strict, char **error)
+{
+	/* Immutable dispatch table: initialized once, not on every call. */
+	static const verifier_func verifiers[HEADER_NUM_TYPES] = {
+		[HEADER_IPV4]	= verify_ipv4,
+		[HEADER_IPV6]	= verify_ipv6,
+		[HEADER_GRE]	= verify_gre,
+		[HEADER_MPLS]	= verify_mpls,
+		[HEADER_TCP]	= verify_tcp,
+		[HEADER_UDP]	= verify_udp,
+		[HEADER_ICMPV4]	= verify_icmpv4,
+		[HEADER_ICMPV6]	= verify_icmpv6,
+	};
+	const struct header *actual_header = &actual_packet->headers[layer];
+	const struct header *script_header = &script_packet->headers[layer];
+	const enum header_t type = script_header->type;
+
+	if (type != actual_header->type) {
+		asprintf(error, "live packet header layer %d: "
+			 "expected: %s header vs actual: %s header",
+			 layer,
+			 header_type_info(type)->name,
+			 header_type_info(actual_header->type)->name);
+		return STATUS_ERR;
+	}
+
+	assert(type > HEADER_NONE);
+	assert(type < HEADER_NUM_TYPES);
+	assert(verifiers[type] != NULL);
+	return verifiers[type](actual_packet, script_packet, layer, strict, error);
+}
+
+/* Compare the headers of the actual packet with those of the script packet,
+ * layer by layer, stopping at the first mismatch.
+ * With 'strict' set to false this doubles as a compatibility check between
+ * two consecutive packet fragments that are candidates for aggregation.
+ */
+static int verify_outbound_live_headers(
+	const struct packet *actual_packet,
+	const struct packet *script_packet, bool strict, char **error)
+{
+	const int live_count = packet_header_count(actual_packet);
+	const int want_count = packet_header_count(script_packet);
+	int layer;
+
+	assert((actual_packet->ipv4 != NULL) || (actual_packet->ipv6 != NULL));
+	assert((actual_packet->tcp != NULL) || (actual_packet->udp != NULL) ||
+	       (actual_packet->icmpv4 != NULL) || (actual_packet->icmpv6 != NULL));
+
+	if (live_count != want_count) {
+		asprintf(error, "%s packet header layers: "
+			 "expected: %d headers vs actual: %d headers",
+			 (strict ? "live" : "aggregate candidate"),
+			 want_count, live_count);
+		return STATUS_ERR;
+	}
+
+	/* Walk the script's header stack until its HEADER_NONE terminator. */
+	for (layer = 0;
+	     layer < ARRAY_SIZE(script_packet->headers) &&
+	     script_packet->headers[layer].type != HEADER_NONE;
+	     ++layer) {
+		if (verify_header(actual_packet, script_packet, layer,
+				  strict, error))
+			return STATUS_ERR;
+	}
+
+	return STATUS_OK;
+}
+
+/* Verify the flow label in the IPv6 header is as expected.
+ * The script/live label pair remembered in socket->flowlabel_map is seeded
+ * from the first labeled packet and replaced when the script changes label.
+ */
+static int verify_outbound_live_ipv6_flowlabel(
+	struct socket *socket,
+	struct packet *actual_packet,
+	struct packet *script_packet, char **error)
+{
+	const u32 want = ipv6_flow_label(script_packet->ipv6);
+	const u32 live = ipv6_flow_label(actual_packet->ipv6);
+	bool script_changed, live_changed;
+
+	/* Nothing to check when the script does not set a flow label. */
+	if (want == 0)
+		return STATUS_OK;
+
+	/* The script expects a label, so the live packet must carry one. */
+	if (live == 0) {
+		asprintf(error,
+			 "flowlabel unmarked in the live packet");
+		return STATUS_ERR;
+	}
+
+	/* First labeled packet on this socket: just record the mapping. */
+	if (socket->flowlabel_map.flowlabel_script == 0)
+		goto remember;
+
+	script_changed = (want != socket->flowlabel_map.flowlabel_script);
+	live_changed = (live != socket->flowlabel_map.flowlabel_live);
+
+	if (script_changed && !live_changed) {
+		asprintf(error,
+			 "expected a different flowlabel but got "
+			 "the same");
+		return STATUS_ERR;
+	}
+	if (!script_changed && live_changed) {
+		asprintf(error,
+			 "inconsistent flowlabels for this packet: "
+			 "expected: 0x%x vs actual: 0x%x",
+			 socket->flowlabel_map.flowlabel_live,
+			 live);
+		return STATUS_ERR;
+	}
+	/* Same script label and same live label: nothing to update. */
+	if (!script_changed)
+		return STATUS_OK;
+
+remember:
+	socket->flowlabel_map.flowlabel_script = want;
+	socket->flowlabel_map.flowlabel_live = live;
+	return STATUS_OK;
+}
+
+/* Check one live TCP option against the script option of the same kind. */
+static int verify_outbound_tcp_option(
+	struct config *config,
+	struct packet *actual_packet,
+	struct packet *script_packet,
+	struct tcp_option *actual_option,
+	struct tcp_option *script_option,
+	char **error)
+{
+	const long tolerance_usecs = config->tolerance_usecs;
+	const int ts_val_tick_usecs = config->tcp_ts_tick_usecs;
+	u32 script_ts_val, actual_ts_val;
+	s64 ts_delta;	/* 64-bit so scaling to usecs cannot overflow */
+
+	switch (actual_option->kind) {
+	case TCPOPT_EOL:
+	case TCPOPT_NOP:
+		break;	/* nothing further to compare */
+	case TCPOPT_TIMESTAMP:
+		script_ts_val = packet_tcp_ts_val(script_packet);
+		actual_ts_val = packet_tcp_ts_val(actual_packet);
+		/* Signed tick distance; (s32) handles u32 wraparound. */
+		ts_delta = (s32)(actual_ts_val - script_ts_val);
+		if (ts_delta < 0)
+			ts_delta = -ts_delta;
+
+		/* See if the deviation from the script TS val is within
+		 * our configured tolerance; a zero tick length skips it.
+		 */
+		if (ts_val_tick_usecs &&
+		    ts_delta * ts_val_tick_usecs > tolerance_usecs) {
+			asprintf(error, "bad outbound TCP timestamp value, tolerance %ld", tolerance_usecs);
+			return STATUS_ERR;
+		}
+		break;
+
+	default:
+		if (script_option->length != actual_option->length) {
+			asprintf(error,
+				 "bad lengths for outbound TCP option %d",
+				 script_option->kind);
+			return STATUS_ERR;
+		}
+		if (script_option->length > 2 &&
+		    memcmp(&actual_option->data, &script_option->data,
+			   actual_option->length - 2) != 0) {
+			asprintf(error, "bad value outbound TCP option %d",
+				 script_option->kind);
+			return STATUS_ERR;
+		}
+	}
+
+	return STATUS_OK;
+}
+
+/* Verify that the live TCP options match the script's, pairwise and in order. */
+static int verify_outbound_live_tcp_options(
+	struct config *config,
+	struct packet *actual_packet,
+	struct packet *script_packet, char **error)
+{
+	struct tcp_options_iterator a_iter, s_iter;
+	struct tcp_option *a_opt, *s_opt;
+
+	/* See if we should validate TCP options at all. */
+	if (script_packet->flags & FLAG_OPTIONS_NOCHECK)
+		return STATUS_OK;
+
+	a_opt = tcp_options_begin(actual_packet, &a_iter);
+	s_opt = tcp_options_begin(script_packet, &s_iter);
+
+	/* TCP options are expected to be in a deterministic order, so walk
+	 * both lists in lockstep; any difference in kind or count fails.
+	 */
+	while (a_opt != NULL || s_opt != NULL) {
+		if (a_opt == NULL || s_opt == NULL ||
+		    a_opt->kind != s_opt->kind) {
+			asprintf(error, "bad outbound TCP options");
+			return STATUS_ERR;
+		}
+
+		if (verify_outbound_tcp_option(config, actual_packet,
+					       script_packet, a_opt, s_opt,
+					       error) != STATUS_OK)
+			return STATUS_ERR;
+
+		a_opt = tcp_options_next(&a_iter, error);
+		s_opt = tcp_options_next(&s_iter, error);
+	}
+	return STATUS_OK;
+}
+
+/* Compare the TCP/UDP payload bytes of the actual and script packets.
+ * Their lengths were already checked implicitly by the IP and TCP/UDP
+ * header verification, so equal length is only asserted here.
+ */
+static int verify_outbound_live_payload(
+	struct packet *actual_packet,
+	struct packet *script_packet, char **error)
+{
+	assert(packet_payload_len(actual_packet) ==
+	       packet_payload_len(script_packet));
+	if (memcmp(packet_payload(script_packet),
+		   packet_payload(actual_packet),
+		   packet_payload_len(script_packet)) == 0)
+		return STATUS_OK;
+
+	asprintf(error, "incorrect outbound data payload");
+	return STATUS_ERR;
+}
+
+/* Verify that the outbound live packet matches the outbound packet the
+ * script expected. Return STATUS_OK upon success. Checksum and mapping
+ * failures always return STATUS_ERR; header, flow label, TCP option,
+ * payload and timing mismatches return STATUS_WARN instead when
+ * non_fatal_packet is set in the config.
+ */
+static int verify_outbound_live_packet(
+ struct state *state, struct socket *socket,
+ struct packet *script_packet, struct packet *live_packet,
+ char **error)
+{
+ DEBUGP("verify_outbound_live_packet\n");
+
+ int result = STATUS_ERR; /* return value */
+ bool non_fatal = false; /* ok to continue on error? */
+ enum event_time_t time_type = state->event->time_type;
+ s64 script_usecs = state->event->time_usecs;
+ s64 script_usecs_end = state->event->time_usecs_end;
+
+ /* The "actual" packet will be the live packet with values
+ * mapped into script space. It is a private copy, freed at 'out'.
+ */
+ struct packet *actual_packet = packet_copy(live_packet);
+ s64 actual_usecs = live_time_to_script_time_usecs(
+ state, live_packet->time_usecs);
+
+ /* Before mapping, see if the live outgoing checksums are correct. */
+ if (verify_outbound_live_checksums(live_packet, error))
+ goto out; /* fatal: non_fatal is still false */
+
+ /* Map live packet values into script space for easy comparison. */
+ if (map_outbound_live_packet(
+ socket, live_packet, actual_packet, script_packet, error))
+ goto out; /* fatal: non_fatal is still false */
+
+ /* Verify actual IP, TCP/UDP header values matched expected ones. */
+ if (verify_outbound_live_headers(actual_packet, script_packet, true,
+ error)) {
+ non_fatal = true;
+ goto out;
+ }
+
+ if (script_packet->ipv6) {
+ if (verify_outbound_live_ipv6_flowlabel(socket, actual_packet,
+ script_packet, error)) {
+ non_fatal = true;
+ goto out;
+ }
+ }
+ if (script_packet->tcp) {
+ /* Verify TCP options matched expected values. */
+ if (verify_outbound_live_tcp_options(
+ state->config, actual_packet, script_packet,
+ error)) {
+ non_fatal = true;
+ goto out;
+ }
+ }
+
+ /* Verify TCP/UDP payload matches expected value.
+ * We skip the payload check for ICMP packets as the payload generated
+ * by packetdrill is incomplete.
+ */
+ if (!actual_packet->icmpv4 && !actual_packet->icmpv6) {
+ if (verify_outbound_live_payload(actual_packet, script_packet, error)) {
+ non_fatal = true;
+ goto out;
+ }
+ }
+
+ /* Verify that kernel sent packet at the time the script expected. */
+ DEBUGP("packet time_usecs: %lld\n", live_packet->time_usecs);
+ if (verify_time(state, time_type, script_usecs,
+ script_usecs_end, live_packet->time_usecs,
+ "outbound packet", error)) {
+ non_fatal = true;
+ goto out;
+ }
+
+ result = STATUS_OK;
+
+out: /* also reached on success, so the dump calls below always run */
+ add_packet_dump(error, "script", script_packet, script_usecs,
+ DUMP_SHORT);
+ if (actual_packet != NULL) {
+ add_packet_dump(error, "actual", actual_packet, actual_usecs,
+ DUMP_SHORT);
+ packet_free(actual_packet);
+ }
+ if (result == STATUS_ERR &&
+ non_fatal &&
+ state->config->non_fatal_packet) {
+ result = STATUS_WARN; /* downgrade: the caller may continue */
+ }
+ return result;
+}
+
+/* Check compatibility between two sequential packet fragments that are
+ * candidates for aggregation. current_payload is the payload length of the
+ * current fragment, sniffed_payload the payload already collected from the
+ * earlier fragments, and expected_total_payload the payload length of the
+ * script packet; together they tell whether this fragment is the last one.
+ */
+static int verify_packet_fragments(
+	const struct packet *current_fragment,
+	const struct packet *previous_fragment,
+	int current_payload, int sniffed_payload,
+	int expected_total_payload, char **error)
+{
+	const int remaining = expected_total_payload - sniffed_payload;
+
+	/* Non-strict header comparison against the previous fragment. */
+	if (verify_outbound_live_headers(current_fragment, previous_fragment,
+					 false, error))
+		return STATUS_ERR;
+
+	/* A fragment that completes the expected payload is not size-checked. */
+	if (current_payload >= remaining)
+		return STATUS_OK;
+	/* Every other fragment must be as long as its predecessor. */
+	if (current_payload == packet_payload_len(previous_fragment))
+		return STATUS_OK;
+
+	asprintf(error, "fragment payload: expected %d bytes vs "
+		 "actual %d bytes; "
+		 "total payload: expected %d bytes vs "
+		 "actual %d bytes",
+		 packet_payload_len(previous_fragment),
+		 current_payload,
+		 expected_total_payload,
+		 current_payload + sniffed_payload);
+	return STATUS_ERR;
+}
+
+/* Sniff the next outbound live packet that belongs to a known socket or a
+ * recent connect() call; it is an error if it is not for expected_socket. */
+static int sniff_outbound_live_packet(
+	struct state *state, struct socket *expected_socket,
+	struct packet **packet, char **error)
+{
+	DEBUGP("sniff_outbound_live_packet\n");
+	enum direction_t direction = DIRECTION_INVALID;
+	struct socket *socket = NULL;
+
+	assert(*packet == NULL);
+	for (;;) {
+		if (netdev_receive(state->netdev, packet, error))
+			return STATUS_ERR;
+		/* Try existing sockets first, then recent connect() calls. */
+		socket = find_socket_for_live_packet(state, *packet,
+						     &direction);
+		if (socket == NULL || direction != DIRECTION_OUTBOUND)
+			socket = find_connect_for_live_packet(state, *packet,
+							      &direction);
+		if (socket != NULL && direction == DIRECTION_OUTBOUND)
+			break;
+		packet_free(*packet);
+		*packet = NULL;
+	}
+
+	assert(*packet != NULL);
+	assert(socket != NULL);
+	assert(direction == DIRECTION_OUTBOUND);
+
+	if (socket != expected_socket) {
+		asprintf(error, "packet is not for expected socket");
+		return STATUS_ERR;
+	}
+	return STATUS_OK;
+}
+
+/* Return true iff the given packet could be sent/received by the socket. */
+static bool is_script_packet_match_for_socket(
+	struct state *state, struct packet *packet, struct socket *socket)
+{
+	const bool icmp = (packet->icmpv4 != NULL) || (packet->icmpv6 != NULL);
+
+	/* TCP and UDP sockets also match ICMP packets of either version. */
+	if (socket->protocol == IPPROTO_TCP)
+		return packet->tcp || icmp;
+	if (socket->protocol == IPPROTO_UDP)
+		return packet->udp || icmp;
+	if (socket->protocol == IPPROTO_ICMP)
+		return packet->icmpv4 != NULL;
+	if (socket->protocol == IPPROTO_ICMPV6)
+		return packet->icmpv6 != NULL;
+	assert(!"unsupported layer 4 protocol in socket");
+	return false;
+}
+
+/* Find or create a socket object matching the given packet. On STATUS_OK
+ * *socket is never NULL; otherwise STATUS_ERR is returned and *error is set.
+ */
+static int find_or_create_socket_for_script_packet(
+	struct state *state, struct packet *packet,
+	enum direction_t direction, struct socket **socket,
+	char **error)
+{
+	*socket = NULL;
+
+	DEBUGP("find_or_create_socket_for_script_packet\n");
+
+	if (packet->tcp != NULL) {
+		/* Is this an inbound packet matching a listening
+		 * socket? If so, this call will create a new child
+		 * socket object.
+		 */
+		*socket = handle_listen_for_script_packet(state,
+							  packet, direction);
+		if (*socket != NULL)
+			return STATUS_OK;
+
+		/* Is this an outbound packet matching a connecting socket? */
+		*socket = handle_connect_for_script_packet(state,
+							   packet, direction);
+		if (*socket != NULL)
+			return STATUS_OK;
+	}
+
+	/* Everything below hands out state->socket_under_test. Without one,
+	 * report an error rather than STATUS_OK with a NULL socket.
+	 */
+	if (state->socket_under_test == NULL)
+		goto no_match;
+
+	/* Use the socket under test if the packet fits its protocol, or if
+	 * it is a TCP/UDP packet with a foreign port.
+	 */
+	if (is_script_packet_match_for_socket(state, packet,
+					      state->socket_under_test) ||
+	    (packet->tcp && (packet->tcp->src_port || packet->tcp->dst_port)) ||
+	    (packet->udp && (packet->udp->src_port || packet->udp->dst_port))) {
+		*socket = state->socket_under_test;
+		return STATUS_OK;
+	}
+no_match:
+	asprintf(error, "no matching socket for script packet");
+	return STATUS_ERR;
+}
+
+/* Perform the action implied by an outbound packet in a script: sniff live
+ * packets, aggregate them up to the script packet's payload length if needed,
+ * and verify the result. Return STATUS_OK upon success, else STATUS_ERR, or
+ * whatever verify_outbound_live_packet() returns (it may be STATUS_WARN).
+ */
+static int do_outbound_script_packet(
+ struct state *state, struct packet *packet,
+ struct socket *socket, char **error)
+{
+ DEBUGP("do_outbound_script_packet\n");
+ int result = STATUS_ERR; /* return value */
+ struct packet *live_packet = NULL;
+ struct packet_list *sniffed_packets_start = NULL; /* list head */
+ struct packet_list *sniffed_packets_end = NULL; /* list tail */
+ int packet_count = 0; /* number of sniffed packets */
+ int expected_payload_len = packet_payload_len(packet);
+ int sniffed_payload_len = 0; /* payload bytes sniffed so far */
+
+ if ((socket->state == SOCKET_PASSIVE_PACKET_RECEIVED) &&
+ packet->tcp && packet->tcp->syn && packet->tcp->ack) {
+ /* Script says we should see an outbound server SYNACK. */
+ socket->script.local_isn = ntohl(packet->tcp->seq);
+ DEBUGP("SYNACK script.local_isn: %u\n",
+ socket->script.local_isn);
+ }
+
+ DEBUGP("Expecting packet with payload %d bytes\n",
+ expected_payload_len);
+ /* To allow remote mode execution of scripts that expect TSO or GSO
+ * segmentation, this loop aggregates sequential outbound packets until
+ * they reach the length expected by the script.
+ * We explicitly disable aggregation in local mode, to facilitate
+ * testing of certain features, such as automatic packet sizing, without
+ * interferences from the packet aggregation algorithm.
+ */
+ do {
+ struct packet_list *sniffed = NULL;
+ int live_payload = 0;
+
+ /* Sniff outbound live packet and verify it's for the right
+ * socket.
+ */
+ if (sniff_outbound_live_packet(state, socket, &live_packet,
+ error))
+ goto out;
+ live_payload = packet_payload_len(live_packet);
+ DEBUGP("Sniffed packet with payload %d bytes\n", live_payload);
+
+ if ((socket->state == SOCKET_PASSIVE_PACKET_RECEIVED) &&
+ packet->tcp && packet->tcp->syn && packet->tcp->ack) {
+ socket->state = SOCKET_PASSIVE_SYNACK_SENT;
+ socket->live.local_isn = ntohl(live_packet->tcp->seq);
+ DEBUGP("SYNACK live.local_isn: %u\n",
+ socket->live.local_isn);
+ }
+
+ verbose_packet_dump(state, "outbound sniffed", live_packet,
+ live_time_to_script_time_usecs(
+ state, live_packet->time_usecs));
+
+ /* Save the TCP header so we can reset the connection at the
+ * end.
+ */
+ if (live_packet->tcp) {
+ socket->last_outbound_tcp_header = *(live_packet->tcp);
+ /* Rewrite TCP sequence number */
+ if (tcp_convert_seq_number(socket, live_packet, error))
+ goto out;
+ }
+
+ sniffed = packet_list_new();
+ sniffed->packet = live_packet; /* the list node now holds the packet */
+ /* Reset live_packet so we can sniff the next packet if
+ * needed.
+ */
+ live_packet = NULL;
+ if (sniffed_packets_start == NULL) {
+ sniffed_packets_start = sniffed;
+ } else {
+ /* In remote mode, we do not see socket openings on the
+ * server side. Sometimes, there are residual packets
+ * sent over an old socket that packetdrill tests do not
+ * explicitly account for. In local mode, such packets
+ * are ignored because they don't match the test socket.
+ * In remote mode, we sniff them because we do not know
+ * any better. However, while we are trying to sniff
+ * enough packets for the expected payload, we can check
+ * if the source IPs/ports of the packets match. Discard
+ * packets on a mismatch and start anew.
+ * TODO(gmx): send syscall state from the wire_client to
+ * the wire_server and get rid of this test.
+ */
+ struct tuple old_packet_tuple, new_packet_tuple;
+ get_packet_tuple(sniffed_packets_end->packet,
+ &old_packet_tuple);
+ get_packet_tuple(sniffed->packet, &new_packet_tuple);
+
+ if (!is_equal_tuple(&old_packet_tuple,
+ &new_packet_tuple)) {
+ /* Discard old packets. */
+ DEBUGP("Discarding previously sniffed %d "
+ "packets with total payload %d\n",
+ packet_count, sniffed_payload_len);
+ packet_list_free(sniffed_packets_start);
+ packet_count = 0;
+ sniffed_payload_len = 0;
+ sniffed_packets_start = sniffed; /* restart the list here */
+ } else {
+ if (verify_packet_fragments(
+ sniffed->packet,
+ sniffed_packets_end->packet,
+ live_payload,
+ sniffed_payload_len,
+ expected_payload_len,
+ error)) {
+ packet_list_free(sniffed); /* not linked into the list yet */
+ goto out;
+ }
+ sniffed_packets_end->next = sniffed;
+ }
+ }
+ sniffed_packets_end = sniffed;
+ packet_count++;
+ sniffed_payload_len += live_payload;
+ } while (sniffed_payload_len < expected_payload_len &&
+ (!state->config->strict_segments ||
+ state->config->is_wire_server));
+
+ /* Check that we matched the payload size. */
+ if (sniffed_payload_len != expected_payload_len) {
+ asprintf(error, "live packet payload: expected %d bytes vs "
+ "actual %d bytes",
+ expected_payload_len, sniffed_payload_len);
+ goto out;
+ }
+ /* If we have just one packet, use it directly, no need to incur the
+ * aggregation overhead.
+ */
+ if (packet_count == 1) {
+ live_packet = sniffed_packets_start->packet;
+ sniffed_packets_start->packet = NULL; /* live_packet takes it over */
+ } else {
+ if (DEBUG_LOGGING) {
+ char *debug = NULL;
+ add_packet_dump(&debug, "first",
+ sniffed_packets_start->packet,
+ live_time_to_script_time_usecs(state,
+ sniffed_packets_start->packet->time_usecs),
+ DUMP_FULL);
+ DEBUGP("%s\n", debug);
+ free(debug);
+ debug = NULL;
+ add_packet_dump(&debug, "last",
+ sniffed_packets_end->packet,
+ live_time_to_script_time_usecs(state,
+ sniffed_packets_end->packet->time_usecs),
+ DUMP_FULL);
+ DEBUGP("%s\n", debug);
+ free(debug);
+ }
+ live_packet = aggregate_packets(sniffed_packets_start,
+ sniffed_packets_end,
+ sniffed_payload_len);
+ if (DEBUG_LOGGING) {
+ char *debug = NULL;
+ add_packet_dump(&debug, "live", live_packet,
+ live_time_to_script_time_usecs(state,
+ live_packet->time_usecs),
+ DUMP_FULL);
+ DEBUGP("%s\n", debug);
+ free(debug);
+ }
+ }
+ /* Verify the bits the kernel sent were what the script expected. */
+ result = verify_outbound_live_packet(
+ state, socket, packet, live_packet, error);
+
+out: /* common cleanup for the success and failure paths */
+ if (live_packet != NULL)
+ packet_free(live_packet);
+ packet_list_free(sniffed_packets_start);
+ return result;
+}
+
+/* Checksum the packet and inject it via netdev_send(), returning its result. */
+static int send_live_ip_packet(struct netdev *netdev,
+			       struct packet *packet)
+{
+	int status;
+
+	assert(packet->ip_bytes > 0);
+	assert(packet->ipv4 || packet->ipv6);	/* IPv4 and IPv6 only */
+	/* TCP, UDP and ICMP are the only layer 4 protocols handled here. */
+	assert(packet->tcp || packet->udp || packet->icmpv4 || packet->icmpv6);
+
+	checksum_packet(packet);	/* layer 3 and layer 4 checksums */
+	status = netdev_send(netdev, packet);
+	return status;
+}
+
+/* Inject the script's inbound packet, mapped to live values, into the kernel. */
+static int do_inbound_script_packet(
+ struct state *state, struct packet *packet,
+ struct socket *socket, char **error)
+{
+ DEBUGP("do_inbound_script_packet\n");
+ int result = STATUS_ERR; /* return value */
+
+ if ((socket->state == SOCKET_PASSIVE_SYNACK_SENT) &&
+ packet->tcp && packet->tcp->ack) {
+ /* Received the ACK that completes the 3-way handshake. */
+ socket->state = SOCKET_PASSIVE_SYNACK_ACKED;
+ } else if ((socket->state == SOCKET_ACTIVE_SYN_SENT) &&
+ packet->tcp && packet->tcp->syn && packet->tcp->ack) {
+ /* Received the server's SYNACK, which ACKs our SYN. */
+ socket->state = SOCKET_ACTIVE_SYN_ACKED;
+ socket->script.remote_isn = ntohl(packet->tcp->seq);
+ socket->live.remote_isn = ntohl(packet->tcp->seq); /* same value as the script ISN */
+ }
+
+ /* For anyip UDP/ICMP rx test, no tx packet will be sent before rx.
+ * So the 4-tuple addr in socket->live are all 0.
+ * We need to fill in 4-tuple first before sending pkt out.
+ */
+ if (state->config->is_anyip && (packet->udp ||
+ packet->icmpv4 ||
+ packet->icmpv6)) {
+ socket->live.remote.ip = state->config->live_remote_ip;
+ socket->live.local.ip = state->config->live_local_ip;
+ if (!(socket->live.remote.port)) /* keep a port that is already set */
+ socket->live.remote.port = htons(state->config->live_connect_port);
+ if (!(socket->live.local.port)) /* keep a port that is already set */
+ socket->live.local.port = htons(state->config->live_bind_port);
+ }
+
+ /* Start with a bit-for-bit copy of the packet from the script. */
+ struct packet *live_packet = packet_copy(packet);
+ /* Map packet fields from script values to live values. */
+ if (map_inbound_packet(state, socket, live_packet, error))
+ goto out;
+
+ verbose_packet_dump(state, "inbound injected", live_packet,
+ live_time_to_script_time_usecs(
+ state, now_usecs(state)));
+
+ if (live_packet->tcp) {
+ /* Save the TCP header so we can reset the connection later. */
+ socket->last_injected_tcp_header = *(live_packet->tcp);
+ socket->last_injected_tcp_payload_len =
+ packet_payload_len(live_packet);
+ }
+
+ /* Inject live packet into kernel. */
+ result = send_live_ip_packet(state->netdev, live_packet);
+
+out: /* the copy is freed whether or not it was injected */
+ packet_free(live_packet);
+ return result;
+}
+
+/* Execute a packet event; on failure *error gets "path:line: ..." text. */
+int run_packet_event(
+	struct state *state, struct event *event, struct packet *packet,
+	char **error)
+{
+	DEBUGP("%d: packet\n", event->line_number);
+
+	char *err = NULL;
+	struct socket *socket = NULL;
+	int result = STATUS_ERR;
+
+	enum direction_t direction = packet_direction(packet);
+	assert(direction != DIRECTION_INVALID);
+
+	if (find_or_create_socket_for_script_packet(
+		    state, packet, direction, &socket, &err))
+		goto out;
+
+	assert(socket != NULL);
+
+	if (direction == DIRECTION_OUTBOUND) {
+		/* We don't wait for outbound event packets because we
+		 * want to start sniffing ASAP in order to see if
+		 * packets go out earlier than the script specifies.
+		 */
+		result = do_outbound_script_packet(state, packet, socket, &err);
+		if (result == STATUS_WARN || result == STATUS_ERR)
+			goto out;
+	} else if (direction == DIRECTION_INBOUND) {
+		wait_for_event(state);
+		if (do_inbound_script_packet(state, packet, socket, &err))
+			goto out;
+	} else {
+		assert(!"bad direction");	/* internal bug */
+	}
+
+	return STATUS_OK;	/* everything went fine */
+
+out:
+	/* Format a more complete error message and return that. */
+	asprintf(error, "%s:%d: %s handling packet: %s\n",
+		 state->config->script_path, event->line_number,
+		 result == STATUS_ERR ? "error" : "warning",
+		 err != NULL ? err : "(no details)");	/* NULL is UB for %s */
+	free(err);
+	return result;
+}
+
+/* Inject a TCP RST packet to clear the connection state out of the
+ * kernel, so the connection does not continue to retransmit packets
+ * that may be sniffed during later test executions and cause false
+ * negatives. Returns the result of send_live_ip_packet().
+ */
+int reset_connection(struct state *state, struct socket *socket)
+{
+ char *error = NULL;
+ u32 seq = 0, ack_seq = 0;
+ u16 window = 0; /* stays 0 unless a packet was injected before */
+ struct packet *packet = NULL;
+ struct tuple live_inbound;
+ const struct ip_info ip_info = {{TOS_CHECK_NONE, 0}, 0};
+ int result = 0;
+
+ /* Pick TCP header fields to be something the kernel will accept. */
+ if (socket->last_injected_tcp_header.ack) {
+ /* If we've already injected something, then use a sequence
+ * number right after the last one we injected, and ACK
+ * the last thing we ACKed, and offer the same receive
+ * window we last offered.
+ */
+ seq = (ntohl(socket->last_injected_tcp_header.seq) +
+ (socket->last_injected_tcp_header.syn ? 1 : 0) +
+ (socket->last_injected_tcp_header.fin ? 1 : 0) +
+ socket->last_injected_tcp_payload_len);
+ ack_seq = ntohl(socket->last_injected_tcp_header.ack_seq);
+ window = ntohs(socket->last_injected_tcp_header.window);
+ } else if (socket->last_outbound_tcp_header.ack) {
+ /* If the kernel ACKed something, then just make sure
+ * we use the sequence number it ACKed, which will be
+ * something it expects.
+ */
+ seq = ntohl(socket->last_outbound_tcp_header.ack_seq);
+ ack_seq = ntohl(socket->last_outbound_tcp_header.seq);
+ } else {
+ /* If the kernel didn't ACK anything, then it probably
+ * sent only an initial SYN. So we get to send any
+ * sequence number we want, but should send an ACK
+ * suggesting we've seen the kernel's SYN.
+ */
+ seq = 0;
+ ack_seq = ntohl(socket->last_outbound_tcp_header.seq) + 1;
+ }
+
+ packet = new_tcp_packet(socket->address_family,
+ DIRECTION_INBOUND, ip_info, 0, 0,
+ "R.", seq, 0, ack_seq, window, 0, NULL,
+ &error);
+ if (packet == NULL)
+ die("%s", error); /* the RST packet could not be built */
+
+ /* Rewrite addresses and port to match inbound live traffic. */
+ socket_get_inbound(&socket->live, &live_inbound);
+ set_packet_tuple(packet, &live_inbound);
+
+ /* Inject live packet into kernel. */
+ result = send_live_ip_packet(state->netdev, packet);
+
+ packet_free(packet); /* freed regardless of the send result */
+
+ return result;
+}
+
+struct packets *packets_new(const struct state *state)
+{
+	struct packets *packets = calloc(1, sizeof(*packets));
+
+	if (packets == NULL)	/* calloc() failed: do not dereference NULL */
+		die("packets_new: out of memory\n");
+	packets->next_ephemeral_port =	/* cache a port */
+		ephemeral_port(state->config->ip_version);
+	return packets;
+}
+
+void packets_free(struct packets *packets)
+{
+ memset(packets, 0, sizeof(*packets)); /* to help catch bugs; packets must not be NULL here */
+ free(packets);
+}
diff --git a/test/packetdrill/run_packet.h b/test/packetdrill/run_packet.h
new file mode 100644
index 0000000..b422034
--- /dev/null
+++ b/test/packetdrill/run_packet.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Interface for a module to execute a packet event from a test script.
+ */
+
+#ifndef __RUN_PACKET_H__
+#define __RUN_PACKET_H__
+
+#include "types.h"
+
+#include "script.h"
+
+struct event;
+struct packet;
+struct socket;
+struct state;
+
+/* Internal state for the packet-handling module. */
+struct packets {
+ int next_ephemeral_port; /* cached port to use, or -1 */
+};
+
+/* Allocate zeroed packets module state and cache an ephemeral port in it. */
+extern struct packets *packets_new(const struct state *state);
+
+/* Tear down packets module state and free it; packets must not be NULL. */
+extern void packets_free(struct packets *packets);
+
+/* Execute the packet event. On success, return STATUS_OK; on failure
+ * return STATUS_ERR, or STATUS_WARN for a non-fatal outbound mismatch,
+ * and fill in a malloc-allocated error message in *error.
+ */
+extern int run_packet_event(struct state *state,
+ struct event *event,
+ struct packet *packet,
+ char **error);
+
+/* Inject a TCP RST to clear the kernel's connection; returns send status. */
+extern int reset_connection(struct state *state,
+ struct socket *socket);
+
+#endif /* __RUN_PACKET_H__ */
diff --git a/test/packetdrill/run_system_call.c b/test/packetdrill/run_system_call.c
new file mode 100644
index 0000000..8c70a27
--- /dev/null
+++ b/test/packetdrill/run_system_call.c
@@ -0,0 +1,3561 @@
+/*
+ * Copyright 2013 Google Inc.
+ * Copyright 2016 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * A module to execute a system call from a test script.
+ */
+
+#include "run_system_call.h"
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <linux/netlink.h>
+#include <netinet/in.h>
+#include <poll.h>
+#include <pthread.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/sendfile.h>
+#include <sys/epoll.h>
+#include <time.h>
+#include <unistd.h>
+#include "assert.h"
+#include "file.h"
+#include "epoll.h"
+#include "pipe.h"
+#include "logging.h"
+#include "run.h"
+#include "script.h"
+#include "icmp.h"
+#include "icmpv6.h"
+#include "capability.h"
+
+/* Forward declarations of helpers that are used before their definitions.
+ * to_live_fd() maps a script fd to a live fd and is defined further down
+ * in this file; syscall_icmp_sendto() is presumably defined later as well
+ * (not visible from here).
+ */
+static int to_live_fd(struct state *state, int script_fd, int *live_fd,
+ char **error);
+
+static int syscall_icmp_sendto(struct state *state,
+ struct syscall_spec *syscall,
+ struct expression_list *args, char **error);
+
+/* Return the kernel thread ID of the calling thread.
+ *
+ * On Linux this issues the raw gettid system call (older glibc versions
+ * ship no wrapper).  On every other platform there is no implementation
+ * yet and 0 is returned.
+ *
+ * The platform test accepts __linux__ as well as the legacy "linux"
+ * macro: the latter is not predefined in strict ISO modes (for example
+ * -std=c11), which previously left this non-void function without any
+ * return statement.
+ */
+static pid_t gettid(void)
+{
+#if defined(__linux__) || defined(linux)
+	return syscall(__NR_gettid);
+#else
+	/* TODO(ncardwell): Implement me. XXX */
+	return 0;
+#endif
+}
+
+/* Load the contents of the file at 'path' into 'buffer' using a single
+ * read() of at most 'max_bytes' bytes.  The process dies if the file
+ * cannot be opened, read, or closed, or if the read fills the whole
+ * buffer (in which case the file is assumed to be too large).  The
+ * buffer is not NUL-terminated here.
+ */
+static void read_whole_file(const char *path, char *buffer, int max_bytes)
+{
+	int bytes_read;
+	const int file = open(path, O_RDONLY);
+
+	if (file < 0)
+		die_perror("open");
+
+	bytes_read = read(file, buffer, max_bytes);
+	if (bytes_read < 0)
+		die_perror("read");
+	else if (bytes_read == max_bytes)
+		die("%s file too large to read\n", path);
+
+	if (close(file) < 0)
+		die_perror("close");
+}
+
+/* Return true iff the given thread is sleeping, i.e. the state field of
+ * /proc/<process_id>/task/<thread_id>/stat is 'S'.
+ *
+ * The stat line has the form "pid (comm) state ...".  The comm field may
+ * itself contain spaces or parentheses, so the state is located relative
+ * to the LAST ')' in the line rather than by counting spaces.
+ *
+ * Dies if the path cannot be built, memory cannot be allocated, or the
+ * file cannot be read or parsed.
+ */
+static bool is_thread_sleeping(pid_t process_id, pid_t thread_id)
+{
+	/* Buffer size matching the one ps uses. */
+	enum { STATE_BUFFER_BYTES = 1023 };
+	char *proc_path = NULL;
+	char *state = NULL;
+	const char *comm_end = NULL;
+	bool is_sleeping = false;
+
+	if (asprintf(&proc_path, "/proc/%d/task/%d/stat",
+		     process_id, thread_id) < 0)
+		die("unable to build thread stat path\n");
+
+	state = calloc(STATE_BUFFER_BYTES, 1);
+	if (state == NULL)
+		die_perror("calloc");
+
+	/* Read the entire thread state file and force termination. */
+	read_whole_file(proc_path, state, STATE_BUFFER_BYTES - 1);
+	state[STATE_BUFFER_BYTES - 1] = '\0';
+
+	/* The state character follows the ") " that closes the comm field. */
+	comm_end = strrchr(state, ')');
+	if (comm_end == NULL || comm_end[1] != ' ')
+		die("unable to parse %s\n", proc_path);
+	is_sleeping = (comm_end[2] == 'S');
+
+	free(proc_path);
+	free(state);
+
+	return is_sleeping;
+}
+
+/* Count the nodes of the given expression list; a NULL list has length 0. */
+static int expression_list_length(struct expression_list *list)
+{
+	const struct expression_list *node;
+	int n = 0;
+
+	for (node = list; node != NULL; node = node->next)
+		n++;
+	return n;
+}
+
+/* Return how many arguments the given argument list holds. */
+static int get_arg_count(struct expression_list *args)
+{
+	const int arg_count = expression_list_length(args);
+
+	return arg_count;
+}
+
+/* Check that the argument list holds exactly 'expected' arguments.
+ * Returns STATUS_OK when it does; otherwise stores an error message in
+ * *error and returns STATUS_ERR.
+ */
+static int check_arg_count(struct expression_list *args, int expected,
+			   char **error)
+{
+	int actual;
+
+	assert(expected >= 0);
+	actual = get_arg_count(args);
+	if (actual == expected)
+		return STATUS_OK;
+
+	asprintf(error, "Expected %d args but got %d", expected,
+		 actual);
+	return STATUS_ERR;
+}
+
+/* Fetch the expression at position 'index' (0-based) of the argument
+ * list.  Returns the expression on success; if the list has no such
+ * element, stores an error message in *error and returns NULL.
+ */
+static struct expression *get_arg(struct expression_list *args,
+				  int index, char **error)
+{
+	int remaining;
+
+	assert(index >= 0);
+
+	/* Step forward until the requested node or the end of the list. */
+	for (remaining = index; (args != NULL) && (remaining > 0); remaining--)
+		args = args->next;
+
+	if ((args == NULL) || (remaining != 0)) {
+		asprintf(error, "Argument list too short");
+		return NULL;
+	}
+	return args->expression;
+}
+
+/* Verify that the expression has the expected type.  Returns STATUS_OK
+ * on a match; otherwise stores a human-readable description of the
+ * mismatch in *error and returns STATUS_ERR.
+ */
+static int check_type(const struct expression *expression,
+		      enum expression_t expected_type,
+		      char **error)
+{
+	if (expression->type != expected_type) {
+		asprintf(error, "Bad type; actual: %s expected: %s",
+			 expression_type_to_string(expression->type),
+			 expression_type_to_string(expected_type));
+		return STATUS_ERR;
+	}
+	return STATUS_OK;
+}
+
+/* Extract a 32-bit integer from an integer expression.  Any value in
+ * [INT_MIN, UINT_MAX] is accepted, i.e. anything representable as either
+ * an s32 or a u32.  Returns STATUS_OK on success; on a type mismatch or
+ * an out-of-range value stores an error message in *error and returns
+ * STATUS_ERR.
+ */
+static int get_s32(struct expression *expression,
+		   s32 *value, char **error)
+{
+	bool in_range;
+
+	if (check_type(expression, EXPR_INTEGER, error))
+		return STATUS_ERR;
+
+	in_range = (expression->value.num <= UINT_MAX) &&
+		   (expression->value.num >= INT_MIN);
+	if (!in_range) {
+		asprintf(error,
+			 "Value out of range for 32-bit integer: %lld",
+			 expression->value.num);
+		return STATUS_ERR;
+	}
+
+	*value = expression->value.num;
+	return STATUS_OK;
+}
+
+/* Extract a 64-bit integer from an integer expression.  Returns
+ * STATUS_OK on success; on a type mismatch stores an error message in
+ * *error and returns STATUS_ERR.
+ */
+static int get_s64(struct expression *expression,
+		   s64 *value, char **error)
+{
+	const int type_status = check_type(expression, EXPR_INTEGER, error);
+
+	if (type_status)
+		return STATUS_ERR;
+
+	*value = expression->value.num;
+	return STATUS_OK;
+}
+
+/* Read the argument at 'index' as a 32-bit integer into *value.
+ * Returns STATUS_OK on success; if the argument is missing or is not a
+ * suitable integer, returns STATUS_ERR with a message in *error.
+ */
+static int s32_arg(struct expression_list *args,
+		   int index, s32 *value, char **error)
+{
+	struct expression *arg = get_arg(args, index, error);
+
+	return (arg != NULL) ? get_s32(arg, value, error) : STATUS_ERR;
+}
+
+/* Read the argument at 'index' as a 64-bit integer into *value.
+ * Returns STATUS_OK on success; if the argument is missing or is not an
+ * integer, returns STATUS_ERR with a message in *error.
+ */
+static int s64_arg(struct expression_list *args,
+		   int index, s64 *value, char **error)
+{
+	struct expression *arg = get_arg(args, index, error);
+
+	return (arg != NULL) ? get_s64(arg, value, error) : STATUS_ERR;
+}
+
+/* Fetch the single element of the bracketed argument at 'index', i.e. an
+ * argument that is a list holding exactly one expression.  On success
+ * stores the element in *elt and returns STATUS_OK.  On failure *elt is
+ * NULL, an error message is stored in *error and STATUS_ERR is returned.
+ */
+static int bracketed_arg(struct expression_list *args,
+			 int index, struct expression **elt, char **error)
+{
+	struct expression *arg;
+	struct expression_list *elements;
+
+	*elt = NULL;
+
+	arg = get_arg(args, index, error);
+	if ((arg == NULL) || check_type(arg, EXPR_LIST, error))
+		return STATUS_ERR;
+
+	elements = arg->value.list;
+	if (expression_list_length(elements) != 1) {
+		asprintf(error,
+			 "Expected [<element>] but got multiple elements");
+		return STATUS_ERR;
+	}
+
+	*elt = elements->expression;
+	return STATUS_OK;
+}
+
+/* Read the argument at 'index', which must be a one-element list holding
+ * a 32-bit integer, into *value.  Returns STATUS_OK on success; on
+ * failure returns STATUS_ERR with a message in *error.
+ */
+static int s32_bracketed_arg(struct expression_list *args,
+			     int index, s32 *value, char **error)
+{
+	struct expression *element = NULL;
+	const int status = bracketed_arg(args, index, &element, error);
+
+	if (status)
+		return STATUS_ERR;
+	return get_s32(element, value, error);
+}
+
+/* Read the argument at 'index', which must be a one-element list holding
+ * a 64-bit integer, into *value.  Returns STATUS_OK on success; on
+ * failure returns STATUS_ERR with a message in *error.
+ */
+static int s64_bracketed_arg(struct expression_list *args,
+			     int index, s64 *value, char **error)
+{
+	struct expression *element = NULL;
+	const int status = bracketed_arg(args, index, &element, error);
+
+	if (status)
+		return STATUS_ERR;
+	return get_s64(element, value, error);
+}
+
+/* Check that the argument at 'index' exists and is an ellipsis (...).
+ * Returns STATUS_OK if so; otherwise returns STATUS_ERR with a message
+ * in *error.
+ */
+static int ellipsis_arg(struct expression_list *args, int index, char **error)
+{
+	struct expression *arg = get_arg(args, index, error);
+
+	if ((arg != NULL) && !check_type(arg, EXPR_ELLIPSIS, error))
+		return STATUS_OK;
+	return STATUS_ERR;
+}
+
+/* Release an iovec array together with every buffer it points to.
+ * 'iov' may be NULL, in which case nothing happens.  'iov_len' is the
+ * number of elements in the array.
+ */
+static void iovec_free(struct iovec *iov, size_t iov_len)
+{
+	size_t i;	/* same type as iov_len: no signed/unsigned mixing */
+
+	if (iov == NULL)
+		return;
+
+	for (i = 0; i < iov_len; ++i)
+		free(iov[i].iov_base);
+	free(iov);
+}
+
+/* Allocate and fill in an iovec array described by the given expression,
+ * which must be a list of iovec expressions.  Each element gets a
+ * zero-filled buffer of the length given in the script.
+ *
+ * On return *iov_ptr and *iov_len_ptr are always set (also on failure,
+ * where the array may be partially filled), so the caller can release
+ * the result with iovec_free().
+ *
+ * Returns STATUS_OK if the expression is a valid iovec.  Otherwise
+ * stores a human-readable message in *error and returns STATUS_ERR.
+ */
+static int iovec_new(struct expression *expression,
+		     struct iovec **iov_ptr, size_t *iov_len_ptr,
+		     char **error)
+{
+	int status = STATUS_ERR;
+	size_t i;
+	struct expression_list *list;	/* input expression from script */
+	size_t iov_len = 0;
+	struct iovec *iov = NULL;	/* live output */
+
+	if (check_type(expression, EXPR_LIST, error))
+		goto error_out;
+
+	list = expression->value.list;
+
+	iov_len = expression_list_length(list);
+	iov = calloc(iov_len, sizeof(struct iovec));
+
+	for (i = 0; i < iov_len; ++i, list = list->next) {
+		size_t len;
+		struct iovec_expr *iov_expr;
+
+		if (check_type(list->expression, EXPR_IOVEC, error))
+			goto error_out;
+
+		iov_expr = list->expression->value.iovec;
+
+		assert(iov_expr->iov_base->type == EXPR_ELLIPSIS);
+		assert(iov_expr->iov_len->type == EXPR_INTEGER);
+
+		/* A negative length would wrap to a huge size_t below. */
+		if (iov_expr->iov_len->value.num < 0) {
+			asprintf(error,
+				 "iov_len %lld must be non-negative",
+				 iov_expr->iov_len->value.num);
+			goto error_out;
+		}
+		len = iov_expr->iov_len->value.num;
+
+		iov[i].iov_len = len;
+		iov[i].iov_base = calloc(len, 1);
+	}
+
+	status = STATUS_OK;
+
+error_out:
+	*iov_ptr = iov;
+	*iov_len_ptr = iov_len;
+	return status;
+}
+
+/* Return true unless the configuration asks send calls to omit freeing
+ * their buffers (config->send_omit_free).
+ */
+static bool sendcall_may_free(struct state *state)
+{
+	const bool omit_free = state->config->send_omit_free;
+
+	return !omit_free;
+}
+
+/* Free 'ptr' unless sendcall_may_free() says buffers must be kept. */
+static void sendcall_free(struct state *state, void *ptr)
+{
+	if (!sendcall_may_free(state))
+		return;
+	free(ptr);
+}
+
+/* Return the number of nodes in the list; same result as
+ * expression_list_length(), to which this delegates.
+ */
+static inline int list_length(struct expression_list *list)
+{
+	return expression_list_length(list);
+}
+
+/* Write one netlink attribute at 'dst': a struct nlattr header with the
+ * given type, followed by 'len' bytes copied from 'data', followed by
+ * zero padding up to the next NLA_ALIGN boundary.  nla_len records the
+ * unpadded size (header plus payload).  Returns the padded number of
+ * bytes written, i.e. the offset at which the next attribute starts.
+ * The caller must make sure 'dst' has room for that many bytes.
+ */
+int add_nla(void *dst, int type, int len, const void *data)
+{
+	unsigned char *out = dst;
+	struct nlattr *hdr = dst;
+	const int unpadded = NLA_HDRLEN + len;
+	const int padded = NLA_ALIGN(unpadded);
+
+	hdr->nla_len = unpadded;
+	hdr->nla_type = type;
+	memcpy(out + NLA_HDRLEN, data, len);
+	memset(out + unpadded, 0, padded - unpadded);
+
+	return padded;
+}
+
+/* Tell whether an expression of the given type may be used as an NLA
+ * value: only integers and ellipses are accepted.
+ */
+static bool nla_value_is_valid(enum expression_t type)
+{
+	switch (type) {
+	case EXPR_INTEGER:
+	case EXPR_ELLIPSIS:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/* Sentinel meaning "do not compare this NLA value".  get_nla_value()
+ * stores it for values that are not integers (i.e. an ellipsis in the
+ * script), and scm_opt_stats_expect_eq() skips the comparison when the
+ * expected value equals it.  The _U32 and _U8 variants are the sentinel
+ * converted to the width of the corresponding attribute payload.
+ */
+#define OPT_NLA_IGNORE_VAL (~0U)
+#define OPT_NLA_IGNORE_VAL_U32 ((u32) OPT_NLA_IGNORE_VAL)
+#define OPT_NLA_IGNORE_VAL_U8 ((u8) OPT_NLA_IGNORE_VAL)
+
+/* Store the value of an NLA expression into 'out_buf' as a native
+ * integer of 'num_bytes' bytes.  Integer expressions contribute their
+ * numeric value; anything else (an ellipsis) contributes
+ * OPT_NLA_IGNORE_VAL.
+ *
+ * One- and four-byte values are narrowed to u8/u32 before being copied,
+ * so the result does not depend on host byte order.  (Copying the first
+ * bytes of a u64 only yields the low-order bytes on little-endian
+ * hosts.)  Any other width copies that many leading bytes of the u64,
+ * which is exact for the eight-byte case.
+ */
+static void get_nla_value(const struct expression *expr, void *out_buf,
+			  int num_bytes)
+{
+	const u64 val = (expr->type == EXPR_INTEGER) ? expr->value.num
+						     : OPT_NLA_IGNORE_VAL;
+
+	if (num_bytes == sizeof(u8)) {
+		const u8 narrow = (u8)val;
+
+		memcpy(out_buf, &narrow, sizeof(narrow));
+	} else if (num_bytes == sizeof(u32)) {
+		const u32 narrow = (u32)val;
+
+		memcpy(out_buf, &narrow, sizeof(narrow));
+	} else {
+		memcpy(out_buf, &val, num_bytes);
+	}
+}
+
+/* Serialize a list of "key = val" binary expressions into consecutive
+ * netlink attributes starting at 'dst'.
+ *
+ * Each key must be an integer attribute type, used as an index into
+ * 'nla_info' to obtain the payload length; each value must be an integer
+ * or an ellipsis (see get_nla_value()).  On success *len receives the
+ * total number of bytes written and STATUS_OK is returned.  On a
+ * malformed element an error message is stored in *error and STATUS_ERR
+ * is returned.  Out-of-range u8/u32 values terminate the process.
+ *
+ * The size of 'nla_info' is not known here, so only negative keys can be
+ * rejected; the caller must ensure 'dst' is large enough and that keys
+ * are within the table.
+ */
+static int nla_expr_list_to_nla(struct expression_list *list,
+				void *dst, int *len,
+				struct nla_type_info *nla_info, char **error)
+{
+	struct expression *element, *key, *value;
+	void *start = dst;
+	u64 val;  /* each value uses some prefix of this space */
+	s64 key_num, val_num;
+	int num_bytes;
+
+	for (; list; list = list->next) {
+		element = list->expression;
+
+		if (check_type(element, EXPR_BINARY, error))
+			return STATUS_ERR;
+
+		/* Always report why the element was rejected. */
+		if (strcmp("=", element->value.binary->op) != 0) {
+			asprintf(error,
+				 "Expected '=' in NLA expression but got '%s'",
+				 element->value.binary->op);
+			return STATUS_ERR;
+		}
+
+		key = element->value.binary->lhs;
+		value = element->value.binary->rhs;
+		if (check_type(key, EXPR_INTEGER, error))
+			return STATUS_ERR;
+		if (!nla_value_is_valid(value->type)) {
+			asprintf(error,
+				 "values must be numeric or ellipsis");
+			return STATUS_ERR;
+		}
+
+		key_num = key->value.num;
+		if (key_num < 0) {
+			/* Would index before the start of nla_info. */
+			asprintf(error, "Invalid negative NLA type %lld",
+				 key_num);
+			return STATUS_ERR;
+		}
+		val_num = value->value.num;
+		num_bytes = nla_info[key_num].length;
+		if (num_bytes == sizeof(u8) &&
+		    value->type == EXPR_INTEGER && !is_valid_u8(val_num))
+			die("out of bound u8 value specified\n");
+		else if (num_bytes == sizeof(u32) &&
+			 value->type == EXPR_INTEGER && !is_valid_u32(val_num))
+			die("out of bound u32 value specified\n");
+
+		get_nla_value(value, &val, num_bytes);
+		dst += add_nla(dst, key_num, nla_info[key_num].length, &val);
+	}
+
+	*len = dst - start;
+	return STATUS_OK;
+}
+
+/* Fill in a sock_extended_err structure from its script expression.
+ *
+ * Every field is first parsed into a local s32 and only then assigned to
+ * the structure member.  Storing 32 bits directly through a cast pointer
+ * is not safe here: in the Linux definition ee_origin, ee_type and
+ * ee_code are single bytes, so such a store would overwrite neighbouring
+ * members, be misaligned, and depend on host byte order.
+ *
+ * Returns STATUS_OK on success.  If any field is not a valid 32-bit
+ * integer, stores an error message in *error, leaves *ee untouched and
+ * returns STATUS_ERR.
+ */
+static int new_extended_err(const struct sock_extended_err_expr *expr,
+			    struct sock_extended_err *ee, char **error)
+{
+	s32 err_num = 0, origin = 0, type = 0, code = 0, info = 0, data = 0;
+
+	if (get_s32(expr->ee_errno, &err_num, error))
+		return STATUS_ERR;
+	if (get_s32(expr->ee_origin, &origin, error))
+		return STATUS_ERR;
+	if (get_s32(expr->ee_type, &type, error))
+		return STATUS_ERR;
+	if (get_s32(expr->ee_code, &code, error))
+		return STATUS_ERR;
+	if (get_s32(expr->ee_info, &info, error))
+		return STATUS_ERR;
+	if (get_s32(expr->ee_data, &data, error))
+		return STATUS_ERR;
+
+	ee->ee_errno = err_num;
+	ee->ee_origin = origin;
+	ee->ee_type = type;
+	ee->ee_code = code;
+	ee->ee_info = info;
+	ee->ee_data = data;
+
+	return STATUS_OK;
+}
+
+/* Info for various TCP NLAs, indexed by attribute type.  Each entry holds
+ * the attribute's printable name and the size in bytes of its payload.
+ * cmsg_new() passes this table to nla_expr_list_to_nla() to size the
+ * attributes it builds, and scm_opt_stats_expect_eq() uses the names in
+ * its error messages.  Indices without an initializer below are
+ * zero-initialized.
+ */
+struct nla_type_info tcp_nla[] = {
+ [_TCP_NLA_PAD] = {"TCP_NLA_PAD", sizeof(u32)},
+ [_TCP_NLA_BUSY] = {"TCP_NLA_BUSY", sizeof(u64)},
+ [_TCP_NLA_RWND_LIMITED] = {"TCP_NLA_RWND_LIMITED", sizeof(u64)},
+ [_TCP_NLA_SNDBUF_LIMITED] = {"TCP_NLA_SNDBUF_LIMITED", sizeof(u64)},
+ [_TCP_NLA_DATA_SEGS_OUT] = {"TCP_NLA_DATA_SEGS_OUT", sizeof(u64)},
+ [_TCP_NLA_TOTAL_RETRANS] = {"TCP_NLA_TOTAL_RETRANS", sizeof(u64)},
+ [_TCP_NLA_PACING_RATE] = {"TCP_NLA_PACING_RATE", sizeof(u64)},
+ [_TCP_NLA_DELIVERY_RATE] = {"TCP_NLA_DELIVERY_RATE", sizeof(u64)},
+ [_TCP_NLA_SND_CWND] = {"TCP_NLA_SND_CWND", sizeof(u32)},
+ [_TCP_NLA_REORDERING] = {"TCP_NLA_REORDERING", sizeof(u32)},
+ [_TCP_NLA_MIN_RTT] = {"TCP_NLA_MIN_RTT", sizeof(u32)},
+ [_TCP_NLA_RECUR_RETRANS] = {"TCP_NLA_RECUR_RETRANS", sizeof(u8)},
+ [_TCP_NLA_DELIVERY_RATE_APP_LMT] = {"TCP_NLA_DELIVERY_RATE_APP_LMT",
+ sizeof(u8)},
+ [_TCP_NLA_SNDQ_SIZE] = {"TCP_NLA_SNDQ_SIZE", sizeof(u32)},
+ [_TCP_NLA_CA_STATE] = {"TCP_NLA_CA_STATE", sizeof(u8)},
+};
+
+/* Allocate and fill the msg_control buffer of 'msg' from the given
+ * expression, which must be a list of cmsg expressions.
+ *
+ * A zeroed buffer of MSGHDR_MAX_CONTROLLEN bytes is allocated and one
+ * control message is written per list element.  The payload depends on
+ * the type of the element's cmsg_data: an int, a struct
+ * scm_timestamping, a list of TCP NLAs, or a struct sock_extended_err
+ * (sized to include the trailing sockaddr the kernel appends).
+ *
+ * On return msg->msg_controllen is the total space used by the messages
+ * written so far, on success and on failure alike.
+ *
+ * Returns STATUS_OK if the expression is a valid msg_control.  Otherwise
+ * stores a human-readable message in *error and returns STATUS_ERR.
+ */
+static int cmsg_new(const struct expression *expr, struct msghdr *msg,
+		    char **error)
+{
+	int status = STATUS_ERR;
+	int len, sum = 0;
+	const struct expression_list *list;
+	const struct cmsg_expr *cmsg_expr;
+	struct sock_extended_err_expr *ee_expr;
+	struct expression_list *stats_expr;
+	struct cmsghdr *cmsg;
+	void *data;
+
+	assert(expr->type == EXPR_LIST);
+
+	msg->msg_control = calloc(1, MSGHDR_MAX_CONTROLLEN);
+	msg->msg_controllen = MSGHDR_MAX_CONTROLLEN;
+
+	cmsg = CMSG_FIRSTHDR(msg);
+
+	for (list = expr->value.list; list; list = list->next) {
+		/* CMSG_FIRSTHDR()/CMSG_NXTHDR() return NULL when there is
+		 * no room for another header; never dereference that.
+		 */
+		if (cmsg == NULL) {
+			asprintf(error,
+				 "Too many cmsgs for a %d-byte control buffer",
+				 (int)MSGHDR_MAX_CONTROLLEN);
+			goto error_out;
+		}
+
+		expr = list->expression;
+		if (check_type(expr, EXPR_CMSG, error))
+			goto error_out;
+
+		cmsg_expr = expr->value.cmsg;
+		if (get_s32(cmsg_expr->cmsg_level, &cmsg->cmsg_level, error))
+			goto error_out;
+		if (get_s32(cmsg_expr->cmsg_type, &cmsg->cmsg_type, error))
+			goto error_out;
+
+		data = CMSG_DATA(cmsg);
+
+		switch (cmsg_expr->cmsg_data->type) {
+		case EXPR_INTEGER:
+			len = sizeof(int);
+			if (get_s32(cmsg_expr->cmsg_data, data, error))
+				goto error_out;
+			break;
+
+		case EXPR_SCM_TIMESTAMPING:
+			len = sizeof(struct scm_timestamping);
+			memcpy(data,
+			       cmsg_expr->cmsg_data->value.scm_timestamping,
+			       len);
+			break;
+
+		case EXPR_LIST:
+			stats_expr = cmsg_expr->cmsg_data->value.list;
+			if (nla_expr_list_to_nla(stats_expr, data, &len,
+						 tcp_nla, error))
+				goto error_out;
+			break;
+
+		case EXPR_SOCK_EXTENDED_ERR:
+			/* ip(v6)_recv_error returns a struct defined in
+			 * function scope that appends a sockaddr.
+			 */
+			len = sizeof(struct sock_extended_err);
+			if (cmsg->cmsg_level == SOL_IP)
+				len += sizeof(struct sockaddr_in);
+			else
+				len += sizeof(struct sockaddr_in6);
+
+			ee_expr = cmsg_expr->cmsg_data->value.sock_extended_err;
+			if (new_extended_err(ee_expr,
+					     (struct sock_extended_err *)data,
+					     error))
+				goto error_out;
+			break;
+
+		default:
+			asprintf(error, "Unrecognized type for cmsg_data");
+			goto error_out;
+		}
+
+		cmsg->cmsg_len = CMSG_LEN(len);
+		sum += CMSG_SPACE(len);
+
+		cmsg = CMSG_NXTHDR(msg, cmsg);
+	}
+
+	status = STATUS_OK;
+
+error_out:
+	msg->msg_controllen = sum;
+
+	return status;
+}
+
+/* Compare two sock_extended_err structures field by field, in the order
+ * errno, origin, type, code, info, data.  Returns true when all fields
+ * agree.  At the first mismatch, stores a message naming the field and
+ * the cmsg 'index' in *error and returns false.
+ */
+static bool sock_ee_expect_eq(struct sock_extended_err *expected,
+			      struct sock_extended_err *actual, int index,
+			      char **error)
+{
+	if (actual->ee_errno != expected->ee_errno)
+		asprintf(error,
+			 "Bad errno in extended err %d: "
+			 "expected=%u actual=%u",
+			 index, expected->ee_errno, actual->ee_errno);
+	else if (actual->ee_origin != expected->ee_origin)
+		asprintf(error,
+			 "Bad origin in extended err %d: "
+			 "expected=%u actual=%u",
+			 index, expected->ee_origin, actual->ee_origin);
+	else if (actual->ee_type != expected->ee_type)
+		asprintf(error,
+			 "Bad type in extended err %d: "
+			 "expected=%u actual=%u",
+			 index, expected->ee_type, actual->ee_type);
+	else if (actual->ee_code != expected->ee_code)
+		asprintf(error,
+			 "Bad code in extended err %d: "
+			 "expected=%u actual=%u",
+			 index, expected->ee_code, actual->ee_code);
+	else if (actual->ee_info != expected->ee_info)
+		asprintf(error,
+			 "Bad info in extended err %d: "
+			 "expected=%u actual=%u",
+			 index, expected->ee_info, actual->ee_info);
+	else if (actual->ee_data != expected->ee_data)
+		asprintf(error,
+			 "Bad data in extended err %d: "
+			 "expected=%u actual=%u",
+			 index, expected->ee_data, actual->ee_data);
+	else
+		return true;	/* every field matched */
+
+	return false;
+}
+
+/* Convert a timespec to microseconds, truncating sub-microsecond
+ * nanoseconds.  A NULL timespec yields -1.
+ */
+static s64 timespec_to_usecs(struct timespec *ts)
+{
+	s64 whole_usecs;
+
+	if (ts == NULL)
+		return -1;
+
+	whole_usecs = (s64)ts->tv_sec * 1000000;
+	return whole_usecs + ts->tv_nsec / 1000;
+}
+
+/* Compare each timestamp of an actual scm_timestamping against the
+ * expected one.  Expected timestamps that are all-zero are skipped.
+ * The expected script time is converted to live time, and the two must
+ * agree within the configured tolerance (config->tolerance_usecs).
+ * Returns true if every checked timestamp is within tolerance;
+ * otherwise stores a message in *error and returns false.
+ */
+static bool scm_timestamping_expect_eq(struct state *state,
+				       struct scm_timestamping *expected,
+				       struct scm_timestamping *actual,
+				       int index, char **error)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(expected->ts); i++) {
+		struct timespec *want = &expected->ts[i];
+		s64 exp_usecs, actual_usecs;
+
+		/* A zero expected timestamp means "do not check". */
+		if (want->tv_sec == 0 && want->tv_nsec == 0)
+			continue;
+
+		exp_usecs = script_time_to_live_time_usecs(
+				state, timespec_to_usecs(want));
+		actual_usecs = timespec_to_usecs(&actual->ts[i]);
+
+		/* Within the configured timing tolerance: fine. */
+		if (llabs(exp_usecs - actual_usecs) <=
+		    state->config->tolerance_usecs)
+			continue;
+
+		asprintf(error,
+			 "Bad timestamp %d in scm_timestamping %d: "
+			 "expected=%lld (%lld) actual=%lld (%lld) "
+			 "start=%lld",
+			 i, index,
+			 exp_usecs,
+			 exp_usecs - state->live_start_time_usecs,
+			 actual_usecs,
+			 actual_usecs - state->live_start_time_usecs,
+			 state->live_start_time_usecs);
+		return false;
+	}
+	return true;
+}
+
+/* Check the stats of SCM_TIMESTAMPING_OPT_STATS: walk the expected and
+ * actual netlink attribute streams ('len' bytes each) in lock step.
+ *
+ * For every attribute the type and length must match.  Values are then
+ * compared according to the attribute type:
+ *  - BUSY, RWND_LIMITED, SNDBUF_LIMITED (u64): a non-zero expectation
+ *    must be within config->tolerance_usecs of the actual value; an
+ *    expectation of zero requires exactly zero.
+ *  - DATA_SEGS_OUT, TOTAL_RETRANS, PACING_RATE, DELIVERY_RATE (u64),
+ *    the u32 attributes and the u8 attributes: exact match.
+ * In all cases an expected value equal to the OPT_NLA_IGNORE_VAL
+ * sentinel of the matching width skips the comparison.
+ *
+ * Payloads are read with memcpy() because netlink attributes are only
+ * 4-byte aligned, which is insufficient for a direct u64 load.
+ *
+ * Returns true if everything matches; otherwise stores a message in
+ * *error and returns false.
+ */
+static bool scm_opt_stats_expect_eq(struct state *state,
+				    void *expected,
+				    void *actual,
+				    int len,
+				    int index, char **error)
+{
+	const unsigned char *exp_base = expected;
+	const unsigned char *act_base = actual;
+	int tolerance_us = state->config->tolerance_usecs;
+	int offset = 0;
+	u64 ev, av;
+	u32 ev_u32, av_u32;
+	u8 ev_u8, av_u8;
+
+	while (offset < len) {
+		const struct nlattr *enla =
+			(const struct nlattr *)(exp_base + offset);
+		const struct nlattr *anla =
+			(const struct nlattr *)(act_base + offset);
+		const void *evalue = exp_base + offset + NLA_HDRLEN;
+		const void *avalue = act_base + offset + NLA_HDRLEN;
+
+		if (enla->nla_type != anla->nla_type) {
+			asprintf(error,
+				 "Bad nla_type %d: "
+				 "expected=%u actual=%u", index,
+				 enla->nla_type, anla->nla_type);
+			return false;
+		}
+
+		if (enla->nla_len != anla->nla_len) {
+			asprintf(error,
+				 "Bad nla_len %d: "
+				 "expected=%u actual=%u", index,
+				 enla->nla_len, anla->nla_len);
+			return false;
+		}
+
+		switch (enla->nla_type) {
+		case _TCP_NLA_BUSY:
+		case _TCP_NLA_RWND_LIMITED:
+		case _TCP_NLA_SNDBUF_LIMITED:
+			memcpy(&ev, evalue, sizeof(ev));
+			memcpy(&av, avalue, sizeof(av));
+
+			if (ev == OPT_NLA_IGNORE_VAL) {
+				break;
+			} else if (ev) {
+				if (llabs((s64)(ev - av)) <= tolerance_us)
+					break;
+			} else if (!av) {  /* Be precise about 0s */
+				break;
+			}
+
+			asprintf(error, "Bad %s: expected=%llu actual=%llu",
+				 tcp_nla[enla->nla_type].name, ev, av);
+			return false;
+		case _TCP_NLA_DATA_SEGS_OUT:
+		case _TCP_NLA_TOTAL_RETRANS:
+		case _TCP_NLA_PACING_RATE:
+		case _TCP_NLA_DELIVERY_RATE:
+			memcpy(&ev, evalue, sizeof(ev));
+			memcpy(&av, avalue, sizeof(av));
+			if (ev == av || ev == OPT_NLA_IGNORE_VAL)
+				break;
+
+			asprintf(error, "Bad %s: expected=%llu actual=%llu",
+				 tcp_nla[enla->nla_type].name, ev, av);
+			return false;
+		case _TCP_NLA_SND_CWND:
+		case _TCP_NLA_REORDERING:
+		case _TCP_NLA_MIN_RTT:
+		case _TCP_NLA_SNDQ_SIZE:
+			memcpy(&ev_u32, evalue, sizeof(ev_u32));
+			memcpy(&av_u32, avalue, sizeof(av_u32));
+			if (ev_u32 == av_u32 ||
+			    ev_u32 == OPT_NLA_IGNORE_VAL_U32)
+				break;
+
+			asprintf(error, "Bad %s: expected=%u actual=%u",
+				 tcp_nla[enla->nla_type].name, ev_u32, av_u32);
+			return false;
+
+		case _TCP_NLA_RECUR_RETRANS:
+		case _TCP_NLA_DELIVERY_RATE_APP_LMT:
+		case _TCP_NLA_CA_STATE:
+			memcpy(&ev_u8, evalue, sizeof(ev_u8));
+			memcpy(&av_u8, avalue, sizeof(av_u8));
+			if (ev_u8 == av_u8 ||
+			    ev_u8 == OPT_NLA_IGNORE_VAL_U8)
+				break;
+
+			asprintf(error, "Bad %s: expected=%u actual=%u",
+				 tcp_nla[enla->nla_type].name, ev_u8, av_u8);
+			return false;
+
+		default:
+			/* Never fail without telling the caller why. */
+			asprintf(error,
+				 "Unsupported nla_type %u in cmsg %d",
+				 enla->nla_type, index);
+			return false;
+		}
+
+		offset += NLA_ALIGN(enla->nla_len);
+	}
+
+	return true;
+}
+
+/* Check if the cmsg in actual is the same as the one in expected.
+ *
+ * The control messages of both msghdrs are walked in parallel.  For each
+ * pair the level, type and length must be equal; the payload is then
+ * compared according to the (level, type) of the actual message:
+ * IP_RECVERR / IPV6_RECVERR via sock_ee_expect_eq(), SCM_TIMESTAMPING via
+ * scm_timestamping_expect_eq(), SCM_TIMESTAMPING_OPT_STATS via
+ * scm_opt_stats_expect_eq(), and anything else byte by byte.
+ *
+ * Returns true if all messages match and both lists end together.
+ * Otherwise stores a message in *error and returns false.
+ */
+static bool cmsg_expect_eq(struct state *state, struct msghdr *expect,
+ struct msghdr *actual, char **error)
+{
+ int i = 0;
+ /* Offset of the payload within a cmsg; subtracted from cmsg_len
+  * below to obtain the payload length.
+  */
+ const size_t hdr_len = CMSG_ALIGN(sizeof(struct cmsghdr));
+ struct cmsghdr *acm = NULL, *ecm = NULL;
+ void *adata = NULL, *edata = NULL;
+
+ /* Advance both lists together; stop when either ends or either
+  * current header has a zero length.
+  */
+ for (acm = CMSG_FIRSTHDR(actual), ecm = CMSG_FIRSTHDR(expect);
+ acm && ecm && acm->cmsg_len && ecm->cmsg_len;
+ acm = CMSG_NXTHDR(actual, acm), ecm = CMSG_NXTHDR(expect, ecm),
+ i++) {
+ if (acm->cmsg_level != ecm->cmsg_level) {
+ asprintf(error,
+ "Bad level in cmsg %d: expected=%d actual=%d",
+ i, ecm->cmsg_level, acm->cmsg_level);
+ return false;
+ }
+ if (acm->cmsg_type != ecm->cmsg_type) {
+ asprintf(error,
+ "Bad type in cmsg %d: expected=%d actual=%d",
+ i, ecm->cmsg_type, acm->cmsg_type);
+ return false;
+ }
+ if (acm->cmsg_len != ecm->cmsg_len) {
+ asprintf(error,
+ "Bad len in cmsg %d: expected=%lu actual=%lu",
+ i, ecm->cmsg_len, acm->cmsg_len);
+ return false;
+ }
+
+ edata = CMSG_DATA(ecm);
+ adata = CMSG_DATA(acm);
+ if (!edata && !adata)
+ continue;
+
+ if (!edata) {
+ asprintf(error,
+ "Bad data in cmsg %d: "
+ "expected is null, actual is not null", i);
+ return false;
+ } else if (!adata) {
+ asprintf(error,
+ "Bad data in cmsg %d: "
+ "expected is not null, actual is null", i);
+ return false;
+ }
+
+ /* Dispatch on the kind of control message. */
+ if ((acm->cmsg_level == SOL_IP &&
+ acm->cmsg_type == IP_RECVERR) ||
+ (acm->cmsg_level == SOL_IPV6 &&
+ acm->cmsg_type == IPV6_RECVERR)) {
+ struct sock_extended_err *eee = edata;
+ struct sock_extended_err *aee = adata;
+ if (!sock_ee_expect_eq(eee, aee, i, error))
+ return false;
+ } else if (acm->cmsg_level == SOL_SOCKET &&
+ acm->cmsg_type == SCM_TIMESTAMPING) {
+ struct scm_timestamping *ets = edata;
+ struct scm_timestamping *ats = adata;
+ if (!scm_timestamping_expect_eq(state, ets, ats, i,
+ error))
+ return false;
+ } else if (acm->cmsg_level == SOL_SOCKET &&
+ acm->cmsg_type == SCM_TIMESTAMPING_OPT_STATS) {
+ if (!scm_opt_stats_expect_eq(state, edata, adata,
+ acm->cmsg_len - hdr_len,
+ i, error))
+ return false;
+ } else if (memcmp((char *)adata, /* byte-to-byte */
+ (char *)edata, acm->cmsg_len - hdr_len)) {
+ /* NOTE(review): the payloads are raw bytes with no
+  * guaranteed NUL terminator, yet they are formatted
+  * with %s here; this can print garbage or read past
+  * the payload.  Consider a hex dump instead.
+  */
+ asprintf(error,
+ "Bad data in cmsg %d: expected=%s actual=%s",
+ i, (char *)edata, (char *)adata);
+ return false;
+ }
+ }
+
+ /* The loop ended: decide based on which list has messages left. */
+ if (!acm && !ecm)
+ return true;
+ if (acm && !ecm) {
+ asprintf(error, "received more than %d cmsgs", i);
+ return false;
+ }
+ if (!acm && ecm) {
+ asprintf(error, "received only %d cmsgs", i);
+ return false;
+ }
+ /* Both are non-NULL: the loop stopped on a zero cmsg_len. */
+ asprintf(error, "cmsgs do not match");
+ return false;
+}
+
+/* Release the buffers owned by a msghdr: its name, its iovec array (with
+ * 'iov_len' elements, via iovec_free()) and its control buffer.  The
+ * msghdr structure itself is not freed here.  A NULL 'msg' is ignored.
+ */
+static void msghdr_free(struct msghdr *msg, size_t iov_len)
+{
+	if (msg != NULL) {
+		free(msg->msg_name);
+		iovec_free(msg->msg_iov, iov_len);
+		free(msg->msg_control);
+	}
+}
+
+/* Allocate and fill in a msghdr described by the given expression.
+ *
+ * The expression must be of type EXPR_MSGHDR.  Each field present in the
+ * script is translated into the live msghdr: msg_name/msg_namelen (which
+ * must be ellipses) get a zeroed sockaddr_storage-sized buffer and its
+ * size, msg_iov is built with iovec_new(), msg_control with cmsg_new(),
+ * and msg_iovlen/msg_flags are read as 32-bit integers.
+ *
+ * *msg_ptr is always set on return (NULL if the type check failed), even
+ * on error, so that the caller can release whatever was allocated.
+ * Returns STATUS_OK on success; otherwise stores a message in *error and
+ * returns STATUS_ERR.
+ */
+static int msghdr_new(struct expression *expression,
+ struct msghdr **msg_ptr, size_t *iov_len_ptr,
+ char **error)
+{
+ int status = STATUS_ERR;
+ s32 s32_val = 0;
+ struct msghdr_expr *msg_expr; /* input expression from script */
+ socklen_t name_len = sizeof(struct sockaddr_storage);
+ struct msghdr *msg = NULL; /* live output */
+
+ if (check_type(expression, EXPR_MSGHDR, error))
+ goto error_out;
+
+ msg_expr = expression->value.msghdr;
+
+ msg = calloc(1, sizeof(struct msghdr));
+
+ if (msg_expr->msg_name != NULL) {
+ assert(msg_expr->msg_name->type == EXPR_ELLIPSIS);
+ msg->msg_name = calloc(1, name_len);
+ }
+
+ if (msg_expr->msg_namelen != NULL) {
+ assert(msg_expr->msg_namelen->type == EXPR_ELLIPSIS);
+ msg->msg_namelen = name_len;
+ }
+
+ if (msg_expr->msg_iov != NULL) {
+ if (iovec_new(msg_expr->msg_iov, &msg->msg_iov, iov_len_ptr,
+ error))
+ goto error_out;
+ }
+
+ if (msg_expr->msg_iovlen != NULL) {
+ if (get_s32(msg_expr->msg_iovlen, &s32_val, error))
+ goto error_out;
+ msg->msg_iovlen = s32_val;
+ }
+
+ /* NOTE(review): *iov_len_ptr is only written above when the script
+  * supplies msg_iov; otherwise this compares against whatever the
+  * caller put there -- presumably callers initialize it to 0; confirm.
+  */
+ if (msg->msg_iovlen != *iov_len_ptr) {
+ asprintf(error,
+ "msg_iovlen %d does not match %d-element iovec array",
+ (int)msg->msg_iovlen, (int)*iov_len_ptr);
+ goto error_out;
+ }
+
+ if (msg_expr->msg_control != NULL) {
+ if (cmsg_new(msg_expr->msg_control, msg, error))
+ goto error_out;
+ }
+
+ if (msg_expr->msg_flags != NULL) {
+ if (get_s32(msg_expr->msg_flags, &s32_val, error))
+ goto error_out;
+ msg->msg_flags = s32_val;
+ }
+
+ status = STATUS_OK;
+
+error_out:
+ /* Hand back the (possibly partial) msghdr on every path. */
+ *msg_ptr = msg;
+ return status;
+}
+
+/* Allocate and fill in a pollfds array described by the given
+ * fds_expression. Return STATUS_OK if the expression is a valid
+ * pollfd struct array. Otherwise fill in the error with a
+ * human-readable error message and return STATUS_ERR.
+ *
+ * The expression must be a list of pollfd expressions whose fd, events
+ * and revents members are all integers.  Each script fd is translated to
+ * the corresponding live fd with to_live_fd().  *fds_ptr and
+ * *fds_len_ptr are set on every path, including errors, where the array
+ * may be only partially filled.
+ */
+static int pollfds_new(struct state *state,
+ struct expression *fds_expression,
+ struct pollfd **fds_ptr, size_t *fds_len_ptr,
+ char **error)
+{
+ int status = STATUS_ERR;
+ int i;
+ struct expression_list *list; /* input expression from script */
+ size_t fds_len = 0;
+ struct pollfd *fds = NULL; /* live output */
+
+ if (check_type(fds_expression, EXPR_LIST, error))
+ goto error_out;
+
+ list = fds_expression->value.list;
+
+ fds_len = expression_list_length(list);
+ fds = calloc(fds_len, sizeof(struct pollfd));
+
+ /* One pollfd per list element, in script order. */
+ for (i = 0; i < fds_len; ++i, list = list->next) {
+ struct pollfd_expr *fds_expr;
+
+ if (check_type(list->expression, EXPR_POLLFD, error))
+ goto error_out;
+
+ fds_expr = list->expression->value.pollfd;
+
+ if (check_type(fds_expr->fd, EXPR_INTEGER, error))
+ goto error_out;
+ if (check_type(fds_expr->events, EXPR_INTEGER, error))
+ goto error_out;
+ if (check_type(fds_expr->revents, EXPR_INTEGER, error))
+ goto error_out;
+
+ if (to_live_fd(state, fds_expr->fd->value.num,
+ &fds[i].fd, error))
+ goto error_out;
+
+ fds[i].events = fds_expr->events->value.num;
+ fds[i].revents = fds_expr->revents->value.num;
+ }
+
+ status = STATUS_OK;
+
+error_out:
+ *fds_ptr = fds;
+ *fds_len_ptr = fds_len;
+ return status;
+}
+
+/* Verify the outcome of a poll() call: the revents value of each live
+ * pollfd must equal the revents value written in the script.  The
+ * script expression must already have been validated (its shape is only
+ * asserted here).  Returns STATUS_OK if all entries match.  At the first
+ * mismatch, stores a message with both flag sets rendered as strings in
+ * *error and returns STATUS_ERR.
+ */
+static int pollfds_check(struct expression *fds_expression,
+			 const struct pollfd *fds, size_t fds_len,
+			 char **error)
+{
+	struct expression_list *node;	/* input expression from script */
+	int idx;
+
+	assert(fds_expression->type == EXPR_LIST);
+	node = fds_expression->value.list;
+
+	for (idx = 0; idx < fds_len; ++idx, node = node->next) {
+		struct pollfd_expr *entry;
+		int want, got;
+		char *want_str, *got_str;
+
+		assert(node->expression->type == EXPR_POLLFD);
+		entry = node->expression->value.pollfd;
+
+		assert(entry->fd->type == EXPR_INTEGER);
+		assert(entry->events->type == EXPR_INTEGER);
+		assert(entry->revents->type == EXPR_INTEGER);
+
+		want = entry->revents->value.num;
+		got = fds[idx].revents;
+		if (got == want)
+			continue;
+
+		want_str = flags_to_string(poll_flags, want);
+		got_str = flags_to_string(poll_flags, got);
+		asprintf(error,
+			 "Expected revents of %s but got %s "
+			 "for pollfd %d",
+			 want_str,
+			 got_str,
+			 idx);
+		free(want_str);
+		free(got_str);
+		return STATUS_ERR;
+	}
+	return STATUS_OK;
+}
+
+/* For blocking system calls, give up the global lock and wake the
+ * main thread so it can continue test execution.  Callers should call
+ * this function immediately before calling a system call in order to
+ * release the global lock immediately before a system call that the
+ * script expects to block.  For non-blocking system calls this is a
+ * no-op.
+ *
+ * pthread_cond_signal() reports failure through its return value and
+ * does not set errno, so the returned code is copied into errno before
+ * die_perror() is called (die_perror() presumably reports errno, as its
+ * use after open()/read() elsewhere in this file suggests).
+ */
+static void begin_syscall(struct state *state, struct syscall_spec *syscall)
+{
+	int rc;
+
+	if (!is_blocking_syscall(syscall))
+		return;
+
+	assert(state->syscalls->state == SYSCALL_ENQUEUED);
+	state->syscalls->state = SYSCALL_RUNNING;
+	run_unlock(state);
+	DEBUGP("syscall thread: begin_syscall signals dequeued\n");
+	rc = pthread_cond_signal(&state->syscalls->dequeued);
+	if (rc != 0) {
+		errno = rc;
+		die_perror("pthread_cond_signal");
+	}
+}
+
+/* How end_syscall() compares the actual result of a system call with the
+ * result expected by the script.
+ */
+enum result_check_t {
+ CHECK_EXACT, /* check that result matches exactly */
+ CHECK_FD, /* check that result is fd or matching error */
+};
+/* Verify that the system call returned the expected result code and
+ * errno value. Returns STATUS_OK on success; on failure returns
+ * STATUS_ERR and sets error message. Callers should call this function
+ * immediately after returning from a system call in order to immediately
+ * re-grab the global lock if this is a blocking call.
+ *
+ * With CHECK_FD and a non-negative expected result, any non-negative
+ * actual result is accepted; in every other case the actual result must
+ * equal the expected one.  The errno is only checked when the script
+ * specifies an expected error.
+ */
+static int end_syscall(struct state *state, struct syscall_spec *syscall,
+ enum result_check_t mode, int actual, char **error)
+{
+ int actual_errno = errno; /* in case we clobber this later */
+ s32 expected = 0;
+
+ /* For blocking calls, advance state and reacquire the global lock. */
+ if (is_blocking_syscall(syscall)) {
+ /* Sample the end time before possibly waiting for the lock. */
+ s64 live_end_usecs = now_usecs(state);
+ DEBUGP("syscall thread: end_syscall grabs lock\n");
+ run_lock(state);
+ state->syscalls->live_end_usecs = live_end_usecs;
+ assert(state->syscalls->state == SYSCALL_RUNNING);
+ state->syscalls->state = SYSCALL_DONE;
+ }
+ if (state->config->verbose) {
+ printf("%s syscall: %9.6f\n", syscall->name,
+ usecs_to_secs(now_usecs(state)));
+ }
+
+
+ /* Compare actual vs expected return value */
+ if (get_s32(syscall->result, &expected, error))
+ return STATUS_ERR;
+ if (mode == CHECK_FD && expected >= 0) {
+ /* Any valid descriptor will do; its number need not match. */
+ if (actual < 0) {
+ asprintf(error,
+ "Expected non-negative result but got %d "
+ "with errno %d (%s)",
+ actual, actual_errno, strerror(actual_errno));
+ return STATUS_ERR;
+ }
+ } else if (mode == CHECK_FD || mode == CHECK_EXACT) {
+ if (actual != expected) {
+ asprintf(error,
+ "Expected result %d but got %d "
+ "with errno %d (%s)",
+ expected,
+ actual, actual_errno, strerror(actual_errno));
+ return STATUS_ERR;
+ }
+ } else {
+ assert(!"bad mode");
+ }
+
+ /* Compare actual vs expected errno */
+ if (syscall->error != NULL) {
+ s64 expected_errno = 0;
+ if (symbol_to_int(syscall->error->errno_macro,
+ &expected_errno, error))
+ return STATUS_ERR;
+ if (actual_errno != expected_errno) {
+ asprintf(error,
+ "Expected errno %d (%s) but got %d (%s)",
+ (int)expected_errno, strerror(expected_errno),
+ actual_errno, strerror(actual_errno));
+ return STATUS_ERR;
+ }
+ }
+
+ return STATUS_OK;
+}
+
+/* Look up the fd that is still open (not marked closed) and whose
+ * script-side number equals script_fd. Returns the matching entry,
+ * or NULL when there is none.
+ */
+static struct fd_state *find_by_script_fd(
+	struct state *state, int script_fd)
+{
+	struct fd_state *cur = state->fds;
+
+	while (cur != NULL) {
+		if (cur->script_fd == script_fd && !cur->is_closed) {
+			assert(cur->live_fd >= 0);
+			assert(cur->script_fd >= 0);
+			return cur;
+		}
+		cur = cur->next;
+	}
+	return NULL;
+}
+
+/* Return a pointer to the fd with the given live fd, or NULL.
+ * Entries already marked closed are skipped, so a live fd number that
+ * was closed and later reused matches only the open entry.
+ */
+static struct fd_state *find_by_live_fd(
+	struct state *state, int live_fd)
+{
+	struct fd_state *fd = NULL;
+
+	for (fd = state->fds; fd != NULL; fd = fd->next)
+		/* Logical AND, matching find_by_script_fd(). */
+		if (!fd->is_closed && (fd->live_fd == live_fd)) {
+			assert(fd->live_fd >= 0);
+			assert(fd->script_fd >= 0);
+			return fd;
+		}
+	return NULL;
+}
+
+/* Translate a script fd into the corresponding live fd.
+ * On success stores the live fd in *live_fd and returns STATUS_OK.
+ * When no open fd matches, stores -1 in *live_fd, sets the error
+ * message and returns STATUS_ERR.
+ */
+static int to_live_fd(struct state *state, int script_fd, int *live_fd,
+		      char **error)
+{
+	struct fd_state *entry = find_by_script_fd(state, script_fd);
+
+	if (entry == NULL) {
+		*live_fd = -1;
+		asprintf(error, "unable to find fd with script fd %d",
+			 script_fd);
+		return STATUS_ERR;
+	}
+
+	*live_fd = entry->live_fd;
+	return STATUS_OK;
+}
+
+/* Refuse to register an fd whose script number or live number is
+ * already in use by an open fd. This should not happen if the script
+ * is valid and this program is bug-free. Returns STATUS_OK when both
+ * numbers are free; otherwise sets the error message and returns
+ * STATUS_ERR.
+ */
+static int check_duplicate_fd(struct state *state, int script_fd, int live_fd,
+			      char **error)
+{
+	if (find_by_script_fd(state, script_fd) != NULL) {
+		asprintf(error, "duplicate fd %d in script",
+			 script_fd);
+		return STATUS_ERR;
+	}
+
+	if (find_by_live_fd(state, live_fd) != NULL) {
+		asprintf(error, "duplicate live fd %d", live_fd);
+		return STATUS_ERR;
+	}
+
+	return STATUS_OK;
+}
+
+/* Classify the buffer argument at position index.
+ * An ellipsis (...) means "use a real buffer": *is_null = false.
+ * The integer 0 (NULL) means "pass a null pointer": *is_null = true.
+ * Anything else, including a missing argument, yields STATUS_ERR
+ * with an error message.
+ */
+static int buffer_arg(struct expression_list *args, int index,
+		      bool *is_null, char **error)
+{
+	struct expression *arg = get_arg(args, index, error);
+
+	if (arg != NULL) {
+		if (arg->type == EXPR_ELLIPSIS) {
+			*is_null = false;
+			return STATUS_OK;
+		}
+		if (arg->type == EXPR_INTEGER && arg->value.num == 0) {
+			*is_null = true;
+			return STATUS_OK;
+		}
+	}
+
+	asprintf(error, "Expected ... or NULL for buffer");
+	return STATUS_ERR;
+}
+
+/* Allocate the data buffer handed to a read/write-style system call.
+ *
+ * is_null:  true when the script passed NULL; nothing is allocated
+ * count:    number of bytes requested by the script
+ * set_zero: true to zero-fill the buffer (calloc), false to leave its
+ *           contents uninitialized (malloc)
+ *
+ * Returns NULL when is_null is set; otherwise a heap buffer that the
+ * caller must release. Allocation failure trips the assert.
+ */
+static void *alloc_buffer(bool is_null, int count, bool set_zero)
+{
+	size_t size;
+	void *buf;
+
+	if (is_null)
+		return NULL;
+
+	/* A zero-byte malloc/calloc may legally return NULL, which would
+	 * trip the assert below on such platforms. Ask for one byte
+	 * instead so a "..." buffer with count 0 always yields a valid
+	 * pointer. Other counts are converted exactly as before.
+	 */
+	size = (count == 0) ? 1 : (size_t)count;
+	buf = set_zero ? calloc(size, 1) : malloc(size);
+	assert(buf != NULL);
+	return buf;
+}
+
+/****************************************************************************
+ * Here we have the "backend" post-processing and pre-processing that
+ * we perform after and/or before each of the system calls that
+ * we support...
+ */
+
+/* Bookkeeping after an open(): register a new struct file carrying
+ * the script fd and the live fd.
+ * Returns STATUS_OK on success; if either fd number is already in
+ * use, returns STATUS_ERR and sets error message.
+ */
+static int run_syscall_open(struct state *state, int script_fd, int live_fd,
+			    char **error)
+{
+	struct file *opened;
+
+	if (check_duplicate_fd(state, script_fd, live_fd, error))
+		return STATUS_ERR;
+
+	opened = file_new(state);
+	opened->fd.live_fd = live_fd;
+	opened->fd.script_fd = script_fd;
+	return STATUS_OK;
+}
+
+/* Bookkeeping after a socket() call in the script and its live
+ * reenactment: create a struct socket recording the family, type,
+ * protocol and both fd numbers, and make it the socket under test.
+ * Returns STATUS_OK on success; on failure (negative or duplicate fd)
+ * returns STATUS_ERR and sets error message.
+ */
+static int run_syscall_socket(struct state *state, int address_family, int type,
+			      int protocol, int script_fd, int live_fd,
+			      char **error)
+{
+	struct socket *sk;
+
+	/* Reject fd values that cannot be valid. */
+	if (script_fd < 0) {
+		asprintf(error, "invalid socket fd %d in script", script_fd);
+		return STATUS_ERR;
+	}
+	if (live_fd < 0) {
+		asprintf(error, "invalid live socket fd %d", live_fd);
+		return STATUS_ERR;
+	}
+	if (check_duplicate_fd(state, script_fd, live_fd, error))
+		return STATUS_ERR;
+
+	/* Both fd values are acceptable: record the new socket. */
+	sk = socket_new(state);
+	sk->state = SOCKET_NEW;
+	sk->address_family = address_family;
+	sk->type = type;
+	sk->protocol = protocol;
+	sk->fd.script_fd = script_fd;
+	sk->fd.live_fd = live_fd;
+
+	/* Later packets in the test script are mapped to this socket. */
+	state->socket_under_test = sk;
+
+	DEBUGP("socket() creating new socket: script_fd: %d live_fd: %d\n",
+	       sk->fd.script_fd, sk->fd.live_fd);
+	return STATUS_OK;
+}
+
+/* Bookkeeping after a close(): mark the fd as closed.
+ * The open fd registered under script_fd must also carry live_fd.
+ * Returns STATUS_OK on success; on failure returns STATUS_ERR and
+ * sets error message.
+ */
+static int run_syscall_close(struct state *state, int script_fd,
+			     int live_fd, char **error)
+{
+	struct fd_state *entry = find_by_script_fd(state, script_fd);
+
+	if (entry != NULL && entry->live_fd == live_fd) {
+		entry->is_closed = true;
+		return STATUS_OK;
+	}
+
+	asprintf(error,
+		 "unable to find fd with script fd %d and live fd %d",
+		 script_fd, live_fd);
+	return STATUS_ERR;
+}
+
+/* Prepare the address for a live bind() call: convert the configured
+ * live_bind_ip / live_bind_port into *live_addr and *live_addrlen.
+ * Always returns STATUS_OK; the error parameter is not written.
+ */
+static int run_syscall_bind(struct state *state,
+			    struct sockaddr *live_addr,
+			    socklen_t *live_addrlen, char **error)
+{
+	DEBUGP("run_syscall_bind\n");
+
+	/* The bind target comes from the run-time configuration. */
+	ip_to_sockaddr(&state->config->live_bind_ip,
+		       state->config->live_bind_port,
+		       live_addr,
+		       live_addrlen);
+	return STATUS_OK;
+}
+
+/* Bookkeeping after a listen(): move the socket registered under
+ * script_fd into the SOCKET_PASSIVE_LISTENING state.
+ * Returns STATUS_OK on success; when no such socket exists returns
+ * STATUS_ERR and sets error message.
+ */
+static int run_syscall_listen(struct state *state, int script_fd,
+			      int live_fd, char **error)
+{
+	struct socket *sk = fd_to_socket(find_by_script_fd(state, script_fd));
+
+	if (sk == NULL) {
+		asprintf(error, "unable to find socket with script fd %d",
+			 script_fd);
+		return STATUS_ERR;
+	}
+
+	assert(sk->fd.script_fd == script_fd);
+	assert(sk->fd.live_fd == live_fd);
+	sk->state = SOCKET_PASSIVE_LISTENING;
+	return STATUS_OK;
+}
+
+/* Handle an accept() call creating a new socket with the given file
+ * descriptors.
+ *
+ * script_accepted_fd / live_accepted_fd: fd numbers of the accepted
+ *	socket in the script and in the live run
+ * live_addr / live_addrlen: peer address reported by the live accept()
+ *
+ * The first tracked socket found in state SOCKET_PASSIVE_SYNACK_SENT or
+ * SOCKET_PASSIVE_SYNACK_ACKED receives the accepted fd numbers. If
+ * there is none and this process is a wire client, a new socket is
+ * created for the accepted connection and becomes the socket under
+ * test.
+ *
+ * Returns STATUS_OK on success; on failure returns STATUS_ERR and
+ * sets error message.
+ */
+static int run_syscall_accept(struct state *state,
+			      int script_accepted_fd,
+			      int live_accepted_fd,
+			      struct sockaddr *live_addr,
+			      int live_addrlen, char **error)
+{
+	struct socket *socket = NULL;
+	struct fd_state *fd = NULL;
+	struct ip_address ip;
+	u16 port = 0;
+	DEBUGP("run_syscall_accept\n");
+
+	/* Parse the sockaddr into a nice multi-protocol ip_address struct. */
+	ip_from_sockaddr(live_addr, live_addrlen, &ip, &port);
+
+	/* For ipv4-mapped-ipv6: if ip is IPv4-mapped IPv6, map it to IPv4. */
+	if (ip.address_family == AF_INET6) {
+		struct ip_address ipv4;
+		if (ipv6_map_to_ipv4(ip, &ipv4) == STATUS_OK)
+			ip = ipv4;
+	}
+
+	for (fd = state->fds; fd != NULL; fd = fd->next) {
+		if (fd->ops->type != FD_SOCKET)
+			continue;
+		socket = fd_to_socket(fd);
+		if (DEBUG_LOGGING) {
+			char remote_string[ADDR_STR_LEN];
+			DEBUGP("socket state=%d script addr: %s:%d\n",
+			       socket->state,
+			       ip_to_string(&socket->script.remote.ip,
+					    remote_string),
+			       socket->script.remote.port);
+		}
+
+		if ((socket->state == SOCKET_PASSIVE_SYNACK_SENT) ||  /* TFO */
+		    (socket->state == SOCKET_PASSIVE_SYNACK_ACKED)) {
+			assert(is_equal_ip(&socket->live.remote.ip, &ip));
+			assert(is_equal_port(socket->live.remote.port,
+					     htons(port)));
+			socket->fd.script_fd = script_accepted_fd;
+			socket->fd.live_fd = live_accepted_fd;
+			return STATUS_OK;
+		}
+	}
+
+	if (!state->config->is_wire_client) {
+		asprintf(error, "unable to find socket matching accept() call");
+		return STATUS_ERR;
+	}
+
+	/* If this is a wire client, then this process just
+	 * sees the system call action for this socket. Create a child
+	 * passive socket for this accept call, and fill in what we
+	 * know about the socket. Any further packets in the test
+	 * script will be directed to this child socket.
+	 */
+	socket = socket_new(state);
+	state->socket_under_test = socket;
+	assert(socket->state == SOCKET_INIT);
+	socket->address_family = ip.address_family;
+
+	socket->live.remote.ip = ip;
+	/* Store the port with htons(), the same conversion the matching
+	 * loop above applies before comparing against live.remote.port and
+	 * the inverse of the ntohs() used when the field is printed below.
+	 */
+	socket->live.remote.port = htons(port);
+	socket->live.local.ip = state->config->live_local_ip;
+	socket->live.local.port = htons(state->config->live_bind_port);
+
+	socket->fd.live_fd = live_accepted_fd;
+	socket->fd.script_fd = script_accepted_fd;
+
+	if (DEBUG_LOGGING) {
+		char local_string[ADDR_STR_LEN];
+		char remote_string[ADDR_STR_LEN];
+		DEBUGP("live: local: %s.%d\n",
+		       ip_to_string(&socket->live.local.ip, local_string),
+		       ntohs(socket->live.local.port));
+		DEBUGP("live: remote: %s.%d\n",
+		       ip_to_string(&socket->live.remote.ip, remote_string),
+		       ntohs(socket->live.remote.port));
+	}
+	return STATUS_OK;
+}
+
+/* Handle a connect() or sendto() call initiating a connect to a
+ * remote address. Fill in the live_addr and live_addrlen for the live
+ * connect().
+ *
+ * script_fd:          script fd of the socket being connected
+ * must_be_new_socket: when true, a socket that is not in SOCKET_NEW is
+ *                     an error; when false such a call is accepted and
+ *                     the socket state is left alone
+ * live_addr:          out: address to pass to the live call
+ * live_addrlen:       out: length of live_addr; only written when
+ *                     sa_family is -1
+ * sa_family:          -1 to use the configured live connect address;
+ *                     any other value is stored as the family of an
+ *                     otherwise zeroed sockaddr (AF_UNSPEC resets the
+ *                     socket to SOCKET_NEW so a later connect works)
+ *
+ * Returns STATUS_OK on success; on failure returns STATUS_ERR and sets
+ * error message.
+ */
+static int run_syscall_connect(struct state *state,
+			       int script_fd,
+			       bool must_be_new_socket,
+			       struct sockaddr *live_addr,
+			       socklen_t *live_addrlen,
+			       int sa_family,
+			       char **error)
+{
+	struct socket *socket = NULL;
+	DEBUGP("run_syscall_connect\n");
+
+	if (sa_family != -1) {
+		sa_family_t sa_fa = (sa_family_t) sa_family;
+		memset(live_addr, 0, sizeof(*live_addr));
+		live_addr->sa_family = sa_fa;
+	} else {
+		/* Fill in the live address we want to connect to */
+		ip_to_sockaddr(&state->config->live_connect_ip,
+			       state->config->live_connect_port,
+			       live_addr, live_addrlen);
+	}
+
+	socket = fd_to_socket(find_by_script_fd(state, script_fd));
+	/* A script may name an fd that is not a tracked socket; report
+	 * that as a script error (as run_syscall_listen() does) instead
+	 * of aborting on an assert.
+	 */
+	if (socket == NULL) {
+		asprintf(error, "unable to find socket with script fd %d",
+			 script_fd);
+		return STATUS_ERR;
+	}
+	/* Reset socket state to NEW if we are about to disconnect
+	 * the socket so that later connect will succeed.
+	 */
+	if (live_addr->sa_family == AF_UNSPEC) {
+		socket->state = SOCKET_NEW;
+		return STATUS_OK;
+	}
+
+	if (socket->state != SOCKET_NEW) {
+		if (must_be_new_socket) {
+			asprintf(error, "socket is not new");
+			return STATUS_ERR;
+		} else {
+			return STATUS_OK;
+		}
+	}
+
+	socket->state = SOCKET_ACTIVE_CONNECTING;
+	ip_reset(&socket->script.remote.ip);
+	ip_reset(&socket->script.local.ip);
+	socket->script.remote.port = 0;
+	socket->script.local.port = 0;
+	socket->live.remote.ip = state->config->live_remote_ip;
+	socket->live.remote.port = htons(state->config->live_connect_port);
+	DEBUGP("success: setting socket to state %d\n", socket->state);
+	return STATUS_OK;
+}
+
+/* Bookkeeping after an epoll_create(): register a new struct epoll
+ * carrying the script fd and the live fd of the epoll instance.
+ * Returns STATUS_OK on success; if either fd number is already in
+ * use, returns STATUS_ERR and sets error message.
+ */
+static int run_syscall_epoll_create(struct state *state, int epfd_script,
+				    int epfd_live, char **error)
+{
+	struct epoll *ep;
+
+	if (check_duplicate_fd(state, epfd_script, epfd_live, error))
+		return STATUS_ERR;
+
+	ep = epoll_new(state);
+	ep->fd.live_fd = epfd_live;
+	ep->fd.script_fd = epfd_script;
+	return STATUS_OK;
+}
+
+/* Bookkeeping after a pipe(): register one struct pipe per pipe end.
+ * Note: both pfd_script and pfd_live point to 2-integer arrays.
+ * Both ends are checked for fd conflicts before either is registered.
+ * Returns STATUS_OK on success; on failure returns STATUS_ERR and sets
+ * error message.
+ */
+static int run_syscall_pipe(struct state *state, int *pfd_script, int *pfd_live,
+			    char **error)
+{
+	int idx;
+
+	if (check_duplicate_fd(state, pfd_script[0], pfd_live[0], error) ||
+	    check_duplicate_fd(state, pfd_script[1], pfd_live[1], error))
+		return STATUS_ERR;
+
+	for (idx = 0; idx < 2; idx++) {
+		struct pipe *end = pipe_new(state);
+
+		end->fd.script_fd = pfd_script[idx];
+		end->fd.live_fd = pfd_live[idx];
+	}
+	return STATUS_OK;
+}
+
+/****************************************************************************
+ * Here we have the parsing and invocation of the system calls that
+ * we support...
+ */
+
+/* Run socket(..., type, protocol). The domain argument must be an
+ * ellipsis in the script; the live domain comes from the configured
+ * socket_domain. On a non-negative result the new socket is recorded
+ * under the fd number the script expects.
+ */
+static int syscall_socket(struct state *state, struct syscall_spec *syscall,
+			  struct expression_list *args, char **error)
+{
+	int family, sock_type, proto, wanted_fd, rc;
+
+	if (check_arg_count(args, 3, error) ||
+	    ellipsis_arg(args, 0, error) ||
+	    s32_arg(args, 1, &sock_type, error) ||
+	    s32_arg(args, 2, &proto, error))
+		return STATUS_ERR;
+
+	family = state->config->socket_domain;
+
+	begin_syscall(state, syscall);
+
+	if (state->so_instance == NULL)
+		rc = socket(family, sock_type, proto);
+	else
+		rc = state->so_instance->ifc.socket(
+			state->so_instance->ifc.userdata,
+			family, sock_type, proto);
+
+	if (end_syscall(state, syscall, CHECK_FD, rc, error))
+		return STATUS_ERR;
+
+	/* Nothing to track when the call failed as the script expected. */
+	if (rc < 0)
+		return STATUS_OK;
+
+	if (get_s32(syscall->result, &wanted_fd, error))
+		return STATUS_ERR;
+	if (run_syscall_socket(state, family, sock_type, proto,
+			       wanted_fd, rc, error))
+		return STATUS_ERR;
+
+	return STATUS_OK;
+}
+
+/* Run bind(fd, ..., ...). The address arguments must be ellipses; the
+ * live address is produced by run_syscall_bind().
+ */
+static int syscall_bind(struct state *state, struct syscall_spec *syscall,
+			struct expression_list *args, char **error)
+{
+	struct sockaddr_storage bind_addr;
+	socklen_t bind_addrlen;
+	int live_fd, script_fd, rc;
+
+	if (check_arg_count(args, 3, error) ||
+	    s32_arg(args, 0, &script_fd, error) ||
+	    to_live_fd(state, script_fd, &live_fd, error) ||
+	    ellipsis_arg(args, 1, error) ||
+	    ellipsis_arg(args, 2, error) ||
+	    run_syscall_bind(state, (struct sockaddr *)&bind_addr,
+			     &bind_addrlen, error))
+		return STATUS_ERR;
+
+	begin_syscall(state, syscall);
+
+	if (state->so_instance == NULL)
+		rc = bind(live_fd, (struct sockaddr *)&bind_addr,
+			  bind_addrlen);
+	else
+		rc = state->so_instance->ifc.bind(
+			state->so_instance->ifc.userdata,
+			live_fd, (struct sockaddr *)&bind_addr,
+			bind_addrlen);
+
+	return end_syscall(state, syscall, CHECK_EXACT, rc, error);
+}
+
+/* Run listen(fd, backlog) and, once the result matches the script,
+ * mark the socket as passively listening.
+ */
+static int syscall_listen(struct state *state, struct syscall_spec *syscall,
+			  struct expression_list *args, char **error)
+{
+	int live_fd, script_fd, queue_len, rc;
+
+	if (check_arg_count(args, 2, error) ||
+	    s32_arg(args, 0, &script_fd, error) ||
+	    to_live_fd(state, script_fd, &live_fd, error) ||
+	    s32_arg(args, 1, &queue_len, error))
+		return STATUS_ERR;
+
+	begin_syscall(state, syscall);
+
+	if (state->so_instance == NULL)
+		rc = listen(live_fd, queue_len);
+	else
+		rc = state->so_instance->ifc.listen(
+			state->so_instance->ifc.userdata,
+			live_fd, queue_len);
+
+	if (end_syscall(state, syscall, CHECK_EXACT, rc, error) ||
+	    run_syscall_listen(state, script_fd, live_fd, error))
+		return STATUS_ERR;
+
+	return STATUS_OK;
+}
+
+/* Run accept(fd, ..., ...). The address arguments must be ellipses.
+ * On a non-negative result the accepted socket is recorded under the
+ * fd number the script expects, using the peer address the live call
+ * returned.
+ */
+static int syscall_accept(struct state *state, struct syscall_spec *syscall,
+			  struct expression_list *args, char **error)
+{
+	struct sockaddr_storage peer_addr;
+	socklen_t peer_addrlen = sizeof(peer_addr);
+	int live_fd, script_fd, wanted_fd, rc;
+
+	if (check_arg_count(args, 3, error) ||
+	    s32_arg(args, 0, &script_fd, error) ||
+	    to_live_fd(state, script_fd, &live_fd, error) ||
+	    ellipsis_arg(args, 1, error) ||
+	    ellipsis_arg(args, 2, error))
+		return STATUS_ERR;
+
+	begin_syscall(state, syscall);
+
+	if (state->so_instance == NULL)
+		rc = accept(live_fd, (struct sockaddr *)&peer_addr,
+			    &peer_addrlen);
+	else
+		rc = state->so_instance->ifc.accept(
+			state->so_instance->ifc.userdata,
+			live_fd, (struct sockaddr *)&peer_addr,
+			&peer_addrlen);
+
+	if (end_syscall(state, syscall, CHECK_FD, rc, error))
+		return STATUS_ERR;
+
+	/* Nothing to track when the call failed as the script expected. */
+	if (rc < 0)
+		return STATUS_OK;
+
+	if (get_s32(syscall->result, &wanted_fd, error))
+		return STATUS_ERR;
+	if (run_syscall_accept(state, wanted_fd, rc,
+			       (struct sockaddr *)&peer_addr, peer_addrlen,
+			       error))
+		return STATUS_ERR;
+
+	return STATUS_OK;
+}
+
+/* Run connect(fd, addr, ...). The address argument is either an
+ * ellipsis (use the configured live connect address) or an integer
+ * address family handed to run_syscall_connect(); the length argument
+ * must be an ellipsis.
+ */
+static int syscall_connect(struct state *state, struct syscall_spec *syscall,
+			   struct expression_list *args, char **error)
+{
+	struct sockaddr_storage peer_addr;
+	socklen_t peer_addrlen = sizeof(peer_addr);
+	int live_fd, script_fd, rc;
+	int family = -1;
+
+	if (check_arg_count(args, 3, error) ||
+	    s32_arg(args, 0, &script_fd, error) ||
+	    to_live_fd(state, script_fd, &live_fd, error) ||
+	    (ellipsis_arg(args, 1, error) &&
+	     s32_arg(args, 1, &family, error)) ||
+	    ellipsis_arg(args, 2, error))
+		return STATUS_ERR;
+
+	if (run_syscall_connect(state, script_fd, false,
+				(struct sockaddr *)&peer_addr, &peer_addrlen,
+				family, error))
+		return STATUS_ERR;
+
+	begin_syscall(state, syscall);
+
+	if (state->so_instance == NULL)
+		rc = connect(live_fd, (struct sockaddr *)&peer_addr,
+			     peer_addrlen);
+	else
+		rc = state->so_instance->ifc.connect(
+			state->so_instance->ifc.userdata,
+			live_fd, (struct sockaddr *)&peer_addr,
+			peer_addrlen);
+
+	return end_syscall(state, syscall, CHECK_EXACT, rc, error);
+}
+
+/* Run read(fd, buf, count), where buf is "..." (a scratch buffer is
+ * allocated) or NULL. The buffer is released after the result check.
+ */
+static int syscall_read(struct state *state, struct syscall_spec *syscall,
+			struct expression_list *args, char **error)
+{
+	int live_fd, script_fd, count, rc, status;
+	char *data = NULL;
+	bool null_buf;
+
+	if (check_arg_count(args, 3, error) ||
+	    s32_arg(args, 0, &script_fd, error) ||
+	    to_live_fd(state, script_fd, &live_fd, error) ||
+	    buffer_arg(args, 1, &null_buf, error) ||
+	    s32_arg(args, 2, &count, error))
+		return STATUS_ERR;
+
+	data = alloc_buffer(null_buf, count, false);
+
+	begin_syscall(state, syscall);
+
+	if (state->so_instance == NULL)
+		rc = read(live_fd, data, count);
+	else
+		rc = state->so_instance->ifc.read(
+			state->so_instance->ifc.userdata,
+			live_fd, data, count);
+
+	status = end_syscall(state, syscall, CHECK_EXACT, rc, error);
+
+	free(data);
+	return status;
+}
+
+/* Run readv(fd, iov, iov_count). The iovec array is built from the
+ * script expression, and iov_count must equal its element count.
+ * The iovec is released on every exit path.
+ */
+static int syscall_readv(struct state *state, struct syscall_spec *syscall,
+			 struct expression_list *args, char **error)
+{
+	struct expression *vec_expr = NULL;
+	struct iovec *vec = NULL;
+	size_t vec_len = 0;
+	int live_fd, script_fd, vec_count, rc;
+	int status = STATUS_ERR;
+
+	if (check_arg_count(args, 3, error) ||
+	    s32_arg(args, 0, &script_fd, error) ||
+	    to_live_fd(state, script_fd, &live_fd, error))
+		goto out;
+
+	vec_expr = get_arg(args, 1, error);
+	if (vec_expr == NULL ||
+	    iovec_new(vec_expr, &vec, &vec_len, error) ||
+	    s32_arg(args, 2, &vec_count, error))
+		goto out;
+
+	if (vec_count != vec_len) {
+		asprintf(error,
+			 "iov_count %d does not match %d-element iovec array",
+			 vec_count, (int)vec_len);
+		goto out;
+	}
+
+	begin_syscall(state, syscall);
+
+	if (state->so_instance == NULL)
+		rc = readv(live_fd, vec, vec_count);
+	else
+		rc = state->so_instance->ifc.readv(
+			state->so_instance->ifc.userdata,
+			live_fd, vec, vec_count);
+
+	status = end_syscall(state, syscall, CHECK_EXACT, rc, error);
+
+out:
+	iovec_free(vec, vec_len);
+	return status;
+}
+
+/* Run recv(fd, buf, count, flags), where buf is "..." (a scratch
+ * buffer is allocated) or NULL. The buffer is released after the
+ * result check.
+ */
+static int syscall_recv(struct state *state, struct syscall_spec *syscall,
+			struct expression_list *args, char **error)
+{
+	int live_fd, script_fd, count, flags, rc, status;
+	char *data = NULL;
+	bool null_buf;
+
+	if (check_arg_count(args, 4, error) ||
+	    s32_arg(args, 0, &script_fd, error) ||
+	    to_live_fd(state, script_fd, &live_fd, error) ||
+	    buffer_arg(args, 1, &null_buf, error) ||
+	    s32_arg(args, 2, &count, error) ||
+	    s32_arg(args, 3, &flags, error))
+		return STATUS_ERR;
+
+	data = alloc_buffer(null_buf, count, false);
+
+	begin_syscall(state, syscall);
+
+	if (state->so_instance == NULL)
+		rc = recv(live_fd, data, count, flags);
+	else
+		rc = state->so_instance->ifc.recv(
+			state->so_instance->ifc.userdata,
+			live_fd, data, count, flags);
+
+	status = end_syscall(state, syscall, CHECK_EXACT, rc, error);
+
+	free(data);
+	return status;
+}
+
+/* Run recvfrom(fd, buf, count, flags, ..., ...), where buf is "..."
+ * (a scratch buffer is allocated) or NULL and both address arguments
+ * must be ellipses. The source address returned by the live call is
+ * not examined.
+ */
+static int syscall_recvfrom(struct state *state, struct syscall_spec *syscall,
+			    struct expression_list *args, char **error)
+{
+	struct sockaddr_storage from_addr;
+	socklen_t from_addrlen = sizeof(from_addr);
+	int live_fd, script_fd, count, flags, rc, status;
+	char *data = NULL;
+	bool null_buf;
+
+	if (check_arg_count(args, 6, error) ||
+	    s32_arg(args, 0, &script_fd, error) ||
+	    to_live_fd(state, script_fd, &live_fd, error) ||
+	    buffer_arg(args, 1, &null_buf, error) ||
+	    s32_arg(args, 2, &count, error) ||
+	    s32_arg(args, 3, &flags, error) ||
+	    ellipsis_arg(args, 4, error) ||
+	    ellipsis_arg(args, 5, error))
+		return STATUS_ERR;
+
+	data = alloc_buffer(null_buf, count, false);
+
+	begin_syscall(state, syscall);
+
+	if (state->so_instance == NULL)
+		rc = recvfrom(live_fd, data, count, flags,
+			      (struct sockaddr *)&from_addr, &from_addrlen);
+	else
+		rc = state->so_instance->ifc.recvfrom(
+			state->so_instance->ifc.userdata,
+			live_fd, data, count, flags,
+			(struct sockaddr *)&from_addr, &from_addrlen);
+
+	status = end_syscall(state, syscall, CHECK_EXACT, rc, error);
+
+	free(data);
+	return status;
+}
+
+/* Run recvmsg(fd, msg, flags). Two msghdrs are built from the same
+ * script expression: one is passed to the live call, the other keeps
+ * the expected values. After the result check, msg_flags and the
+ * control messages of the two are compared. Both msghdrs are released
+ * on every exit path.
+ */
+static int syscall_recvmsg(struct state *state, struct syscall_spec *syscall,
+			   struct expression_list *args, char **error)
+{
+	struct expression *msg_expr = NULL;
+	struct msghdr *live_msg = NULL, *want_msg = NULL;
+	size_t iov_len = 0;
+	int live_fd, script_fd, flags, rc;
+	int status = STATUS_ERR;
+
+	if (check_arg_count(args, 3, error) ||
+	    s32_arg(args, 0, &script_fd, error) ||
+	    to_live_fd(state, script_fd, &live_fd, error))
+		goto out;
+
+	msg_expr = get_arg(args, 1, error);
+	if (msg_expr == NULL ||
+	    msghdr_new(msg_expr, &live_msg, &iov_len, error) ||
+	    msghdr_new(msg_expr, &want_msg, &iov_len, error) ||
+	    s32_arg(args, 2, &flags, error))
+		goto out;
+
+	begin_syscall(state, syscall);
+
+	if (state->so_instance == NULL)
+		rc = recvmsg(live_fd, live_msg, flags);
+	else
+		rc = state->so_instance->ifc.recvmsg(
+			state->so_instance->ifc.userdata,
+			live_fd, live_msg, flags);
+
+	if (end_syscall(state, syscall, CHECK_EXACT, rc, error))
+		goto out;
+
+	if (live_msg->msg_flags != want_msg->msg_flags) {
+		asprintf(error, "Expected msg_flags 0x%08X but got 0x%08X",
+			 want_msg->msg_flags, live_msg->msg_flags);
+		goto out;
+	}
+
+	if (cmsg_expect_eq(state, want_msg, live_msg, error))
+		status = STATUS_OK;
+
+out:
+	msghdr_free(live_msg, iov_len);
+	msghdr_free(want_msg, iov_len);
+	return status;
+}
+
+/* Run write(fd, buf, count), where buf is "..." (a zero-filled
+ * buffer is allocated) or NULL. The buffer is released after the
+ * result check.
+ */
+static int syscall_write(struct state *state, struct syscall_spec *syscall,
+			 struct expression_list *args, char **error)
+{
+	int live_fd, script_fd, count, rc, status;
+	char *data = NULL;
+	bool null_buf;
+
+	if (check_arg_count(args, 3, error) ||
+	    s32_arg(args, 0, &script_fd, error) ||
+	    to_live_fd(state, script_fd, &live_fd, error) ||
+	    buffer_arg(args, 1, &null_buf, error) ||
+	    s32_arg(args, 2, &count, error))
+		return STATUS_ERR;
+
+	data = alloc_buffer(null_buf, count, true);
+
+	begin_syscall(state, syscall);
+
+	if (state->so_instance == NULL)
+		rc = write(live_fd, data, count);
+	else
+		rc = state->so_instance->ifc.write(
+			state->so_instance->ifc.userdata,
+			live_fd, data, count);
+
+	status = end_syscall(state, syscall, CHECK_EXACT, rc, error);
+
+	free(data);
+	return status;
+}
+
+/* Run writev(fd, iov, iov_count). The iovec array is built from the
+ * script expression, and iov_count must equal its element count.
+ * The iovec is released on every exit path.
+ */
+static int syscall_writev(struct state *state, struct syscall_spec *syscall,
+			  struct expression_list *args, char **error)
+{
+	struct expression *vec_expr = NULL;
+	struct iovec *vec = NULL;
+	size_t vec_len = 0;
+	int live_fd, script_fd, vec_count, rc;
+	int status = STATUS_ERR;
+
+	if (check_arg_count(args, 3, error) ||
+	    s32_arg(args, 0, &script_fd, error) ||
+	    to_live_fd(state, script_fd, &live_fd, error))
+		goto out;
+
+	vec_expr = get_arg(args, 1, error);
+	if (vec_expr == NULL ||
+	    iovec_new(vec_expr, &vec, &vec_len, error) ||
+	    s32_arg(args, 2, &vec_count, error))
+		goto out;
+
+	if (vec_count != vec_len) {
+		asprintf(error,
+			 "iov_count %d does not match %d-element iovec array",
+			 vec_count, (int)vec_len);
+		goto out;
+	}
+
+	begin_syscall(state, syscall);
+
+	if (state->so_instance == NULL)
+		rc = writev(live_fd, vec, vec_count);
+	else
+		rc = state->so_instance->ifc.writev(
+			state->so_instance->ifc.userdata,
+			live_fd, vec, vec_count);
+
+	status = end_syscall(state, syscall, CHECK_EXACT, rc, error);
+
+out:
+	iovec_free(vec, vec_len);
+	return status;
+}
+
+/* Run send(fd, buf, count, flags), where buf is "..." (a zero-filled
+ * buffer is allocated) or NULL. The buffer is handed to
+ * sendcall_free() after the result check.
+ */
+static int syscall_send(struct state *state, struct syscall_spec *syscall,
+			struct expression_list *args, char **error)
+{
+	int live_fd, script_fd, count, flags, rc, status;
+	char *data = NULL;
+	bool null_buf;
+
+	if (check_arg_count(args, 4, error) ||
+	    s32_arg(args, 0, &script_fd, error) ||
+	    to_live_fd(state, script_fd, &live_fd, error) ||
+	    buffer_arg(args, 1, &null_buf, error) ||
+	    s32_arg(args, 2, &count, error) ||
+	    s32_arg(args, 3, &flags, error))
+		return STATUS_ERR;
+
+	data = alloc_buffer(null_buf, count, true);
+
+	begin_syscall(state, syscall);
+
+	if (state->so_instance == NULL)
+		rc = send(live_fd, data, count, flags);
+	else
+		rc = state->so_instance->ifc.send(
+			state->so_instance->ifc.userdata,
+			live_fd, data, count, flags);
+
+	status = end_syscall(state, syscall, CHECK_EXACT, rc, error);
+
+	sendcall_free(state, data);
+
+	return status;
+}
+
+/* Run sendto(fd, buf, count, flags, addr, ...). buf is "..." (a
+ * zero-filled buffer is allocated) or NULL; addr is either an ellipsis
+ * (use the configured live connect address) or an integer address
+ * family; the length argument must be an ellipsis. SOCK_DGRAM sockets
+ * using ICMP over IPv4 or ICMPv6 over IPv6 are delegated to
+ * syscall_icmp_sendto().
+ */
+static int syscall_sendto(struct state *state, struct syscall_spec *syscall,
+			  struct expression_list *args, char **error)
+{
+	struct sockaddr_storage dest_addr;
+	socklen_t dest_addrlen = sizeof(dest_addr);
+	struct socket *sk = NULL;
+	int live_fd, script_fd, count, flags, rc, status;
+	int family = -1;
+	char *data = NULL;
+	bool null_buf;
+
+	if (check_arg_count(args, 6, error) ||
+	    s32_arg(args, 0, &script_fd, error) ||
+	    to_live_fd(state, script_fd, &live_fd, error) ||
+	    buffer_arg(args, 1, &null_buf, error) ||
+	    s32_arg(args, 2, &count, error) ||
+	    s32_arg(args, 3, &flags, error) ||
+	    (ellipsis_arg(args, 4, error) &&
+	     s32_arg(args, 4, &family, error)) ||
+	    ellipsis_arg(args, 5, error))
+		return STATUS_ERR;
+
+	/* ICMP sockets need special handling. */
+	sk = fd_to_socket(find_by_script_fd(state, script_fd));
+	if (sk != NULL && sk->type == SOCK_DGRAM) {
+		bool icmp4 = sk->address_family == AF_INET &&
+			     sk->protocol == IPPROTO_ICMP;
+		bool icmp6 = sk->address_family == AF_INET6 &&
+			     sk->protocol == IPPROTO_ICMPV6;
+
+		if (icmp4 || icmp6)
+			return syscall_icmp_sendto(state, syscall, args,
+						   error);
+	}
+
+	if (run_syscall_connect(state, script_fd, false,
+				(struct sockaddr *)&dest_addr, &dest_addrlen,
+				family, error))
+		return STATUS_ERR;
+
+	data = alloc_buffer(null_buf, count, true);
+
+	begin_syscall(state, syscall);
+
+	if (state->so_instance == NULL)
+		rc = sendto(live_fd, data, count, flags,
+			    (struct sockaddr *)&dest_addr, dest_addrlen);
+	else
+		rc = state->so_instance->ifc.sendto(
+			state->so_instance->ifc.userdata,
+			live_fd, data, count, flags,
+			(struct sockaddr *)&dest_addr, dest_addrlen);
+
+	status = end_syscall(state, syscall, CHECK_EXACT, rc, error);
+
+	sendcall_free(state, data);
+
+	return status;
+}
+
+/* Run sendmsg(fd, msg, flags). The msghdr is built from the script
+ * expression. When it carries a destination (msg_name), the address is
+ * filled in through run_syscall_connect(). A non-zero msg_flags in the
+ * script is rejected. The msghdr is released on exit only when
+ * sendcall_may_free() allows it.
+ */
+static int syscall_sendmsg(struct state *state, struct syscall_spec *syscall,
+			   struct expression_list *args, char **error)
+{
+	struct expression *msg_expr = NULL;
+	struct msghdr *out_msg = NULL;
+	size_t iov_len = 0;
+	int live_fd, script_fd, flags, rc;
+	int status = STATUS_ERR;
+
+	if (check_arg_count(args, 3, error) ||
+	    s32_arg(args, 0, &script_fd, error) ||
+	    to_live_fd(state, script_fd, &live_fd, error))
+		goto out;
+
+	msg_expr = get_arg(args, 1, error);
+	if (msg_expr == NULL ||
+	    msghdr_new(msg_expr, &out_msg, &iov_len, error) ||
+	    s32_arg(args, 2, &flags, error))
+		goto out;
+
+	if (out_msg->msg_name != NULL &&
+	    run_syscall_connect(state, script_fd, false,
+				out_msg->msg_name, &out_msg->msg_namelen,
+				-1, error))
+		goto out;
+
+	if (out_msg->msg_flags != 0) {
+		asprintf(error, "sendmsg ignores msg_flags field in msghdr");
+		goto out;
+	}
+
+	begin_syscall(state, syscall);
+
+	if (state->so_instance == NULL)
+		rc = sendmsg(live_fd, out_msg, flags);
+	else
+		rc = state->so_instance->ifc.sendmsg(
+			state->so_instance->ifc.userdata,
+			live_fd, out_msg, flags);
+
+	status = end_syscall(state, syscall, CHECK_EXACT, rc, error);
+
+out:
+	if (sendcall_may_free(state))
+		msghdr_free(out_msg, iov_len);
+	return status;
+}
+
+/*
+ * Send echo request using ICMP socket.
+ * Note: Kernel will reject and fail the sendto() call if the data sent does not
+ * have room for a proper ICMP header. And ICMP type must be 8 (ICMP_ECHO) and
+ * ICMP code must be 0.
+ *
+ * Script form: sendto(fd, buf, count, flags, ..., ...), where buf is
+ * "..." (a zero-filled buffer is allocated) or NULL. The destination is
+ * the configured live connect address. When a buffer exists and is
+ * large enough for the ICMP header of the configured wire protocol, its
+ * type field is set to the echo-request type.
+ * Returns STATUS_OK on success; on failure returns STATUS_ERR and sets
+ * error message.
+ */
+static int syscall_icmp_sendto(struct state *state, struct syscall_spec *syscall,
+			       struct expression_list *args, char **error)
+{
+	int live_fd, script_fd, count, flags, result;
+	struct sockaddr_storage live_addr;
+	socklen_t live_addrlen = sizeof(live_addr);
+	char *buf = NULL;
+	bool is_null;
+
+	if (check_arg_count(args, 6, error))
+		return STATUS_ERR;
+	if (s32_arg(args, 0, &script_fd, error))
+		return STATUS_ERR;
+	if (to_live_fd(state, script_fd, &live_fd, error))
+		return STATUS_ERR;
+	if (buffer_arg(args, 1, &is_null, error))
+		return STATUS_ERR;
+	if (s32_arg(args, 2, &count, error))
+		return STATUS_ERR;
+	if (s32_arg(args, 3, &flags, error))
+		return STATUS_ERR;
+	if (ellipsis_arg(args, 4, error))
+		return STATUS_ERR;
+	if (ellipsis_arg(args, 5, error))
+		return STATUS_ERR;
+
+	if (run_syscall_connect(
+		    state, script_fd, false,
+		    (struct sockaddr *)&live_addr, &live_addrlen, -1, error))
+		return STATUS_ERR;
+
+	buf = alloc_buffer(is_null, count, true);
+	/* alloc_buffer() returns NULL when the script passed NULL for the
+	 * buffer; in that case there is no header to fill in, and writing
+	 * the type would dereference a null pointer.
+	 */
+	if (buf != NULL) {
+		if (state->config->wire_protocol == AF_INET &&
+		    count >= sizeof(struct icmpv4)) {
+			struct icmpv4 *icmp = (struct icmpv4 *)buf;
+			icmp->type = ICMP_ECHO;
+		} else if (state->config->wire_protocol == AF_INET6 &&
+			   count >= sizeof(struct icmpv6)) {
+			struct icmpv6 *icmpv6 = (struct icmpv6 *)buf;
+			icmpv6->type = ICMPV6_ECHO_REQUEST;
+		}
+	}
+
+	begin_syscall(state, syscall);
+
+	if (state->so_instance) {
+		result = state->so_instance->ifc.sendto(
+			state->so_instance->ifc.userdata,
+			live_fd, buf, count, flags,
+			(struct sockaddr *)&live_addr, live_addrlen);
+	} else {
+		result = sendto(live_fd, buf, count, flags,
+				(struct sockaddr *)&live_addr, live_addrlen);
+	}
+
+	int status = end_syscall(state, syscall, CHECK_EXACT, result, error);
+
+	free(buf);
+	return status;
+}
+
+/* Run fcntl() on the live fd that corresponds to the script's fd.
+ *
+ * The script may give either (fd, cmd) or (fd, cmd, arg); the call is
+ * forwarded with the same number of arguments. Returns the status from
+ * end_syscall(), or STATUS_ERR when the arguments cannot be decoded.
+ */
+static int syscall_fcntl(struct state *state, struct syscall_spec *syscall,
+			 struct expression_list *args, char **error)
+{
+	int live_fd, script_fd, command, result;
+	s32 arg = 0;
+	/* fcntl is an odd system call - it can take either 2 or 3 args. */
+	const int num_args = get_arg_count(args);
+
+	if (num_args < 2 || num_args > 3) {
+		asprintf(error, "fcntl expected 2-3 args but got %d",
+			 num_args);
+		return STATUS_ERR;
+	}
+
+	if (s32_arg(args, 0, &script_fd, error) ||
+	    to_live_fd(state, script_fd, &live_fd, error) ||
+	    s32_arg(args, 1, &command, error))
+		return STATUS_ERR;
+
+	/* Decode the optional third argument before the call begins. */
+	if (num_args == 3 && s32_arg(args, 2, &arg, error))
+		return STATUS_ERR;
+
+	begin_syscall(state, syscall);
+
+	if (num_args == 2) {
+		if (state->so_instance)
+			result = state->so_instance->ifc.fcntl(
+					state->so_instance->ifc.userdata,
+					live_fd, command);
+		else
+			result = fcntl(live_fd, command);
+	} else {
+		if (state->so_instance)
+			result = state->so_instance->ifc.fcntl(
+					state->so_instance->ifc.userdata,
+					live_fd, command, arg);
+		else
+			result = fcntl(live_fd, command, arg);
+	}
+
+	return end_syscall(state, syscall, CHECK_EXACT, result, error);
+}
+
+/* Run ioctl() on the live fd that corresponds to the script's fd.
+ *
+ * Two forms are accepted:
+ *   ioctl(fd, cmd)          - the call is made with no third argument.
+ *   ioctl(fd, cmd, [value]) - the call is given a pointer to an s32
+ *                             and, when the call succeeds, the value
+ *                             written there must equal the script's
+ *                             bracketed value.
+ *
+ * Returns STATUS_OK on a match with the script, STATUS_ERR otherwise
+ * (with *error describing the problem where this function sets it).
+ */
+static int syscall_ioctl(struct state *state, struct syscall_spec *syscall,
+			 struct expression_list *args, char **error)
+{
+	int live_fd, script_fd, command, result;
+
+	/* ioctl is an odd system call - it can take either 2 or 3 args. */
+	int actual_arg_count = get_arg_count(args);
+	if ((actual_arg_count != 2) && (actual_arg_count != 3)) {
+		asprintf(error, "ioctl expected 2-3 args but got %d",
+			 actual_arg_count);
+		return STATUS_ERR;
+	}
+
+	if (s32_arg(args, 0, &script_fd, error))
+		return STATUS_ERR;
+	if (to_live_fd(state, script_fd, &live_fd, error))
+		return STATUS_ERR;
+	if (s32_arg(args, 1, &command, error))
+		return STATUS_ERR;
+
+	if (actual_arg_count == 2) {
+		begin_syscall(state, syscall);
+
+		if (state->so_instance) {
+			result = state->so_instance->ifc.ioctl(
+					state->so_instance->ifc.userdata,
+					live_fd, command);
+		} else {
+			result = ioctl(live_fd, command);
+		}
+
+		return end_syscall(state, syscall, CHECK_EXACT, result, error);
+	} else {
+		/* Zero-initialized so that it is never read while
+		 * indeterminate, even if the call writes nothing.
+		 */
+		s32 script_optval, live_optval = 0;
+
+		if (s32_bracketed_arg(args, 2, &script_optval, error))
+			return STATUS_ERR;
+
+		begin_syscall(state, syscall);
+
+		if (state->so_instance) {
+			result = state->so_instance->ifc.ioctl(
+					state->so_instance->ifc.userdata,
+					live_fd, command, &live_optval);
+		} else {
+			result = ioctl(live_fd, command, &live_optval);
+		}
+
+		if (end_syscall(state, syscall, CHECK_EXACT, result, error))
+			return STATUS_ERR;
+
+		/* A failed call that the script expected to fail has no
+		 * output value to verify, so only compare on success.
+		 */
+		if (result >= 0 && live_optval != script_optval) {
+			asprintf(error,
+				 "Bad ioctl optval: expected: %d actual: %d",
+				 (int)script_optval, (int)live_optval);
+			return STATUS_ERR;
+		}
+
+		return STATUS_OK;
+	}
+}
+
+/* Close the live fd matching the script's fd, then let
+ * run_syscall_close() update the bookkeeping for that fd pair.
+ * Takes exactly one script argument: the fd.
+ */
+static int syscall_close(struct state *state, struct syscall_spec *syscall,
+			 struct expression_list *args, char **error)
+{
+	int live_fd, script_fd, result;
+
+	if (check_arg_count(args, 1, error) ||
+	    s32_arg(args, 0, &script_fd, error) ||
+	    to_live_fd(state, script_fd, &live_fd, error))
+		return STATUS_ERR;
+
+	begin_syscall(state, syscall);
+
+	if (state->so_instance)
+		result = state->so_instance->ifc.close(
+				state->so_instance->ifc.userdata,
+				live_fd);
+	else
+		result = close(live_fd);
+
+	if (end_syscall(state, syscall, CHECK_EXACT, result, error) ||
+	    run_syscall_close(state, script_fd, live_fd, error))
+		return STATUS_ERR;
+
+	return STATUS_OK;
+}
+
+/* Run shutdown(fd, how) on the live fd matching the script's fd.
+ * Takes exactly two script arguments.
+ */
+static int syscall_shutdown(struct state *state, struct syscall_spec *syscall,
+			    struct expression_list *args, char **error)
+{
+	int live_fd, script_fd, how, result;
+
+	if (check_arg_count(args, 2, error) ||
+	    s32_arg(args, 0, &script_fd, error) ||
+	    to_live_fd(state, script_fd, &live_fd, error) ||
+	    s32_arg(args, 1, &how, error))
+		return STATUS_ERR;
+
+	begin_syscall(state, syscall);
+
+	if (state->so_instance)
+		result = state->so_instance->ifc.shutdown(
+				state->so_instance->ifc.userdata,
+				live_fd, how);
+	else
+		result = shutdown(live_fd, how);
+
+	return end_syscall(state, syscall, CHECK_EXACT, result, error) ?
+		STATUS_ERR : STATUS_OK;
+}
+
+/* Run getsockopt() and compare the returned option against the script.
+ *
+ * Script arguments: (fd, level, optname, optval, [optlen]).
+ * The expected optval may be a string, a bracketed s32 list, a GRE
+ * header, or an IPv6 address; any other expression type is an error.
+ * The option length reported by the call must equal the script's
+ * optlen.
+ *
+ * Returns STATUS_OK when the call and its output match the script,
+ * STATUS_ERR otherwise.
+ */
+static int syscall_getsockopt(struct state *state, struct syscall_spec *syscall,
+			      struct expression_list *args, char **error)
+{
+	int script_fd, live_fd, level, optname, result;
+	void *live_optval = NULL, *script_optval = NULL;
+	s32 script_optlen, script_optval_s32;
+	socklen_t live_optlen;
+	struct expression *val_expression = NULL;
+	int status = STATUS_ERR;
+
+	if (check_arg_count(args, 5, error))
+		return STATUS_ERR;
+	if (s32_arg(args, 0, &script_fd, error))
+		return STATUS_ERR;
+	if (to_live_fd(state, script_fd, &live_fd, error))
+		return STATUS_ERR;
+	if (s32_arg(args, 1, &level, error))
+		return STATUS_ERR;
+	if (s32_arg(args, 2, &optname, error))
+		return STATUS_ERR;
+	val_expression = get_arg(args, 3, error);
+	if (val_expression == NULL)
+		return STATUS_ERR;
+	if (s32_bracketed_arg(args, 4, &script_optlen, error))
+		return STATUS_ERR;
+
+	/* A negative length would convert to a huge socklen_t; for -1 the
+	 * "+ 1" below would even wrap to a zero-byte allocation while the
+	 * call is told the buffer is ~4 GiB. Refuse it up front.
+	 */
+	if (script_optlen < 0) {
+		asprintf(error, "Unsupported getsockopt optlen: %d",
+			 (int)script_optlen);
+		return STATUS_ERR;
+	}
+
+	/* Allocate space for getsockopt output. The extra zeroed byte keeps
+	 * a full-length string result NUL-terminated for strcmp() below.
+	 */
+	live_optlen = script_optlen;
+	live_optval = calloc(1, (size_t)live_optlen + 1);
+	assert(live_optval != NULL);
+
+	begin_syscall(state, syscall);
+
+	if (state->so_instance) {
+		result = state->so_instance->ifc.getsockopt(
+				state->so_instance->ifc.userdata,
+				live_fd, level, optname,
+				live_optval, &live_optlen);
+	} else {
+		result = getsockopt(live_fd, level, optname,
+				    live_optval, &live_optlen);
+	}
+
+	if (end_syscall(state, syscall, CHECK_EXACT, result, error))
+		goto error_out;
+
+	if ((int)live_optlen != script_optlen) {
+		asprintf(error,
+			 "Bad getsockopt optlen: expected: %d actual: %d",
+			 (int)script_optlen, (int)live_optlen);
+		goto error_out;
+	}
+
+	if (val_expression->type == EXPR_STRING) {
+		script_optval = val_expression->value.string;
+
+		if (strcmp(live_optval, script_optval) != 0) {
+			asprintf(error,
+				 "Bad getsockopt optval: "
+				 "expected: '%s' actual: '%s'",
+				 (char *)script_optval, (char *)live_optval);
+			goto error_out;
+		}
+	} else if (val_expression->type == EXPR_LIST) {
+		/* Only a single 4-byte integer is supported for lists. */
+		if (script_optlen != 4) {
+			asprintf(error, "Unsupported getsockopt optlen: %d",
+				 (int)script_optlen);
+			goto error_out;
+		}
+
+		if (s32_bracketed_arg(args, 3, &script_optval_s32, error))
+			goto error_out;
+
+		if (*(s32 *)live_optval != script_optval_s32) {
+			asprintf(error,
+				 "Bad getsockopt optval: "
+				 "expected: %d actual: %d",
+				 script_optval_s32, *(s32 *)live_optval);
+			goto error_out;
+		}
+	} else if (val_expression->type == EXPR_GRE) {
+		struct gre *live_gre = (struct gre *)live_optval;
+		struct gre *script_gre = &val_expression->value.gre;
+
+		if (script_optlen != sizeof(struct gre)) {
+			asprintf(error, "Unsupported getsockopt optlen: %d",
+				 (int)script_optlen);
+			goto error_out;
+		}
+
+		if (live_gre->flags != script_gre->flags ||
+		    live_gre->be16[0] != script_gre->be16[0] ||
+		    live_gre->be16[1] != script_gre->be16[1] ||
+		    live_gre->be32[1] != script_gre->be32[1] ||
+		    live_gre->be32[2] != script_gre->be32[2]) {
+			asprintf(error, "Bad getsockopt optval.");
+			/* TODO: Populate this with a GRE header dump. */
+			goto error_out;
+		}
+	} else if (val_expression->type == EXPR_IN6_ADDR) {
+		struct in6_addr *live_ipv6 = (struct in6_addr *)live_optval;
+		struct in6_addr *script_ipv6 = &val_expression->value.address_ipv6;
+
+		if (script_optlen != sizeof(struct in6_addr)) { // != 16
+			asprintf(error, "Unsupported getsockopt optlen: %d",
+				 (int)script_optlen);
+			goto error_out;
+		}
+
+		if (memcmp(live_ipv6, script_ipv6, sizeof(struct in6_addr))) {
+			char live_buf[INET6_ADDRSTRLEN];
+			char script_buf[INET6_ADDRSTRLEN];
+			inet_ntop(AF_INET6, live_ipv6, live_buf, sizeof(live_buf));
+			inet_ntop(AF_INET6, script_ipv6, script_buf, sizeof(script_buf));
+			asprintf(error,
+				 "Bad getsockopt optval: "
+				 "expected: %s "
+				 "actual: %s ",
+				 script_buf, live_buf);
+			goto error_out;
+		}
+	} else {
+		asprintf(error, "unsupported getsockopt value type: %s",
+			 expression_type_to_string(
+				 val_expression->type));
+		goto error_out;
+	}
+
+	status = STATUS_OK;
+
+error_out:
+	free(live_optval);
+	return status;
+}
+
+/* Run setsockopt() with an option value taken from the script.
+ *
+ * Script arguments: (fd, level, optname, optval, optlen). The optval
+ * expression may be a linger struct, GRE header, IPv6 address, MPLS
+ * stack, string, or a bracketed s32; other types are rejected.
+ */
+static int syscall_setsockopt(struct state *state, struct syscall_spec *syscall,
+			      struct expression_list *args, char **error)
+{
+	int script_fd, live_fd, level, optname, optval_s32, optlen, result;
+	struct expression *value;
+	void *optval = NULL;
+
+	if (check_arg_count(args, 5, error) ||
+	    s32_arg(args, 0, &script_fd, error) ||
+	    to_live_fd(state, script_fd, &live_fd, error) ||
+	    s32_arg(args, 1, &level, error) ||
+	    s32_arg(args, 2, &optname, error) ||
+	    s32_arg(args, 4, &optlen, error))
+		return STATUS_ERR;
+
+	value = get_arg(args, 3, error);
+	if (value == NULL)
+		return STATUS_ERR;
+
+	/* Point optval at the storage that matches the expression type. */
+	switch (value->type) {
+	case EXPR_LINGER:
+		optval = &value->value.linger;
+		break;
+	case EXPR_GRE:
+		optval = &value->value.gre;
+		break;
+	case EXPR_IN6_ADDR:
+		optval = &value->value.address_ipv6;
+		break;
+	case EXPR_MPLS_STACK:
+		optval = value->value.mpls_stack;
+		break;
+	case EXPR_STRING:
+		optval = value->value.string;
+		break;
+	case EXPR_LIST:
+		if (s32_bracketed_arg(args, 3, &optval_s32, error))
+			return STATUS_ERR;
+		optval = &optval_s32;
+		break;
+	default:
+		asprintf(error, "unsupported setsockopt value type: %s",
+			 expression_type_to_string(value->type));
+		return STATUS_ERR;
+	}
+
+	begin_syscall(state, syscall);
+
+	if (state->so_instance)
+		result = state->so_instance->ifc.setsockopt(
+				state->so_instance->ifc.userdata,
+				live_fd, level, optname, optval, optlen);
+	else
+		result = setsockopt(live_fd, level, optname, optval, optlen);
+
+	return end_syscall(state, syscall, CHECK_EXACT, result, error);
+}
+
+/* Run poll() on a pollfd array built from the script, then check the
+ * resulting array against the script with pollfds_check().
+ *
+ * Script arguments: (fds, nfds, timeout); nfds must equal the number
+ * of elements in the fds expression.
+ */
+static int syscall_poll(struct state *state, struct syscall_spec *syscall,
+			struct expression_list *args, char **error)
+{
+	struct expression *fds_expression;
+	struct pollfd *live_fds = NULL;
+	size_t live_fds_len;
+	int nfds, timeout, result;
+	int status = STATUS_ERR;
+
+	/* Nothing has been allocated yet, so these can return directly. */
+	if (check_arg_count(args, 3, error))
+		return STATUS_ERR;
+	fds_expression = get_arg(args, 0, error);
+	if (fds_expression == NULL)
+		return STATUS_ERR;
+
+	if (pollfds_new(state, fds_expression, &live_fds, &live_fds_len,
+			error))
+		goto out;
+
+	if (s32_arg(args, 1, &nfds, error) ||
+	    s32_arg(args, 2, &timeout, error))
+		goto out;
+
+	if (nfds != live_fds_len) {
+		asprintf(error,
+			 "nfds %d does not match %d-element pollfd array",
+			 nfds, (int)live_fds_len);
+		goto out;
+	}
+
+	begin_syscall(state, syscall);
+
+	if (state->so_instance)
+		result = state->so_instance->ifc.poll(
+				state->so_instance->ifc.userdata,
+				live_fds, nfds, timeout);
+	else
+		result = poll(live_fds, nfds, timeout);
+
+	if (end_syscall(state, syscall, CHECK_EXACT, result, error) ||
+	    pollfds_check(fds_expression, live_fds, live_fds_len, error))
+		goto out;
+
+	status = STATUS_OK;
+
+out:
+	free(live_fds);
+	return status;
+}
+
+/* Change one capability flag of the current process.
+ *
+ * Script arguments: (cap_flag, cap_value, cap_op). The current
+ * capability state is fetched with cap_get_proc(), the single value is
+ * updated with cap_set_flag(), and the result is applied with
+ * cap_set_proc(), whose return value is checked against the script.
+ *
+ * Failures of cap_get_proc(), cap_set_flag() or cap_free() are fatal.
+ * Returns STATUS_OK or STATUS_ERR according to end_syscall().
+ */
+static int syscall_cap_set(struct state *state, struct syscall_spec *syscall,
+			   struct expression_list *args, char **error)
+{
+	int cap_flag, cap_value, cap_op;
+	int result, status;
+	cap_t caps;
+
+	if (check_arg_count(args, 3, error))
+		return STATUS_ERR;
+	if (s32_arg(args, 0, &cap_flag, error))
+		return STATUS_ERR;
+	if (s32_arg(args, 1, &cap_value, error))
+		return STATUS_ERR;
+	if (s32_arg(args, 2, &cap_op, error))
+		return STATUS_ERR;
+
+	caps = cap_get_proc();
+	if (caps == NULL)
+		die("Error in cap_get_proc()\n");
+
+	if (cap_set_flag(caps, cap_flag, 1, &cap_value,
+			 cap_op) == -1)
+		die("Error in cap_set_flag()\n");
+
+	begin_syscall(state, syscall);
+
+	result = cap_set_proc(caps);
+
+	status = end_syscall(state, syscall, CHECK_FD, result, error);
+
+	/* Release the capability state on every path, including when the
+	 * result did not match the script.
+	 */
+	if (cap_free(caps) == -1)
+		die("Error in cap_free()\n");
+
+	return status ? STATUS_ERR : STATUS_OK;
+}
+
+/* Run open(name, flags) and, if it yields an fd, record the mapping
+ * from the script's result fd to the live fd via run_syscall_open().
+ *
+ * Script arguments: (name, flags), where name must be a string.
+ */
+static int syscall_open(struct state *state, struct syscall_spec *syscall,
+			struct expression_list *args, char **error)
+{
+	int script_fd, live_fd, result;
+	struct expression *name_expression;
+	char *name;
+	int flags;
+
+	if (check_arg_count(args, 2, error))
+		return STATUS_ERR;
+	name_expression = get_arg(args, 0, error);
+	/* Same guard as the other handlers that use get_arg(). */
+	if (name_expression == NULL)
+		return STATUS_ERR;
+	if (check_type(name_expression, EXPR_STRING, error))
+		return STATUS_ERR;
+	name = name_expression->value.string;
+	if (s32_arg(args, 1, &flags, error))
+		return STATUS_ERR;
+
+	begin_syscall(state, syscall);
+
+	result = open(name, flags);
+
+	if (end_syscall(state, syscall, CHECK_FD, result, error))
+		return STATUS_ERR;
+
+	if (result >= 0) {
+		live_fd = result;
+		if (get_s32(syscall->result, &script_fd, error))
+			return STATUS_ERR;
+		if (run_syscall_open(state, script_fd, live_fd, error))
+			return STATUS_ERR;
+	}
+
+	return STATUS_OK;
+}
+
+/* Run sendfile(out_fd, in_fd, &offset, count).
+ *
+ * Script arguments: (out_fd, in_fd, [offset], count). The bracketed
+ * script offset is copied into a local off_t that is passed by
+ * address; the value left there after the call is not checked.
+ */
+static int syscall_sendfile(struct state *state, struct syscall_spec *syscall,
+			    struct expression_list *args, char **error)
+{
+	int script_outfd, live_outfd;
+	int script_infd, live_infd;
+	s64 script_offset = 0;
+	off_t live_offset;
+	int count, result;
+
+	if (check_arg_count(args, 4, error) ||
+	    s32_arg(args, 0, &script_outfd, error) ||
+	    to_live_fd(state, script_outfd, &live_outfd, error) ||
+	    s32_arg(args, 1, &script_infd, error) ||
+	    to_live_fd(state, script_infd, &live_infd, error) ||
+	    s64_bracketed_arg(args, 2, &script_offset, error) ||
+	    s32_arg(args, 3, &count, error))
+		return STATUS_ERR;
+
+	live_offset = script_offset;
+
+	begin_syscall(state, syscall);
+	result = sendfile(live_outfd, live_infd, &live_offset, count);
+
+	return end_syscall(state, syscall, CHECK_EXACT, result, error);
+}
+
+/* Translate an epoll_event expression into a struct epoll_event.
+ *
+ * epollev:    the script expression; must be of type EXPR_EPOLLEV.
+ * event:      output; events and exactly one member of data are set.
+ * epoll_data: output; records which member of event->data was set.
+ * script_fd:  the script fd the event is expected to name, or -1 when
+ *             the fd should be taken from the expression itself.
+ * live_fd:    the live fd matching script_fd (ignored when script_fd
+ *             is -1, in which case it is looked up here).
+ *
+ * The data member is chosen by the first of ptr, fd, u32, u64 that is
+ * present in the expression. Returns STATUS_OK or STATUS_ERR.
+ */
+static int get_epoll_event_from_expr(struct state *state,
+				     struct expression *epollev,
+				     struct epoll_event *event,
+				     enum epoll_data_type_t *epoll_data,
+				     int script_fd,
+				     int live_fd,
+				     char **error)
+{
+	struct epollev_expr *epollev_expr = NULL;
+
+	/* NOTE(review): *error is left alone here; callers pass the result
+	 * of get_arg(), which presumably already set it - confirm.
+	 */
+	if (epollev == NULL)
+		return STATUS_ERR;
+	if (check_type(epollev, EXPR_EPOLLEV, error))
+		return STATUS_ERR;
+	epollev_expr = epollev->value.epollev;
+	if (!epollev_expr) {
+		/* Give the caller a message to report instead of a NULL. */
+		asprintf(error, "epoll_event specified incorrectly");
+		return STATUS_ERR;
+	}
+	if (check_type(epollev_expr->events, EXPR_INTEGER, error))
+		return STATUS_ERR;
+	event->events = epollev_expr->events->value.num;
+	if (epollev_expr->ptr) {
+		if (check_type(epollev_expr->ptr, EXPR_INTEGER, error))
+			return STATUS_ERR;
+		event->data.ptr = (void *)epollev_expr->ptr->value.num;
+		*epoll_data = EPOLL_DATA_PTR;
+	} else if (epollev_expr->fd) {
+		if (check_type(epollev_expr->fd, EXPR_INTEGER, error))
+			return STATUS_ERR;
+		/* script_fd = -1 means we don't have a specific socket fd
+		 * So we find live_fd directly from passed in event->data.fd
+		 */
+		if (script_fd == -1) {
+			script_fd = epollev_expr->fd->value.num;
+			if (to_live_fd(state, script_fd, &live_fd, error))
+				return STATUS_ERR;
+		} else {
+			if (epollev_expr->fd->value.num != script_fd) {
+				asprintf(error,
+					 "wrong fd specified in epoll_event\n");
+				return STATUS_ERR;
+			}
+		}
+		event->data.fd = live_fd;
+		*epoll_data = EPOLL_DATA_FD;
+	} else if (epollev_expr->u32) {
+		if (check_type(epollev_expr->u32, EXPR_INTEGER, error))
+			return STATUS_ERR;
+		event->data.u32 = epollev_expr->u32->value.num;
+		*epoll_data = EPOLL_DATA_U32;
+	} else if (epollev_expr->u64) {
+		if (check_type(epollev_expr->u64, EXPR_INTEGER, error))
+			return STATUS_ERR;
+		event->data.u64 = epollev_expr->u64->value.num;
+		*epoll_data = EPOLL_DATA_U64;
+	} else {
+		asprintf(error, "epoll_event specified incorrectly");
+		return STATUS_ERR;
+	}
+
+	return STATUS_OK;
+}
+
+/* Run epoll_create(size) and, when an fd comes back, register the
+ * script-fd/live-fd pair through run_syscall_epoll_create().
+ */
+static int syscall_epoll_create(struct state *state, struct syscall_spec *syscall,
+				struct expression_list *args, char **error)
+{
+	int size, result, script_fd;
+
+	if (check_arg_count(args, 1, error) ||
+	    s32_arg(args, 0, &size, error))
+		return STATUS_ERR;
+
+	begin_syscall(state, syscall);
+
+	if (state->so_instance)
+		result = state->so_instance->ifc.epoll_create(
+				state->so_instance->ifc.userdata,
+				size);
+	else
+		result = epoll_create(size);
+
+	if (end_syscall(state, syscall, CHECK_FD, result, error))
+		return STATUS_ERR;
+
+	/* No fd was produced, so there is nothing to register. */
+	if (result < 0)
+		return STATUS_OK;
+
+	if (get_s32(syscall->result, &script_fd, error) ||
+	    run_syscall_epoll_create(state, script_fd, result, error))
+		return STATUS_ERR;
+
+	return STATUS_OK;
+}
+
+/* Run epoll_ctl(epfd, op, fd, event).
+ *
+ * Script arguments: (epfd, op, fd, event). Both fds are translated to
+ * live fds, and the event expression is converted with
+ * get_epoll_event_from_expr(), which also checks any fd named inside
+ * the event against the fd argument.
+ */
+static int syscall_epoll_ctl(struct state *state, struct syscall_spec *syscall,
+			     struct expression_list *args, char **error)
+{
+	int epfd_script, epfd_live, op, script_fd, live_fd, result;
+	struct epoll_event live_event;
+	enum epoll_data_type_t data_type;
+	struct expression *event_expr;
+
+	if (check_arg_count(args, 4, error) ||
+	    s32_arg(args, 0, &epfd_script, error) ||
+	    to_live_fd(state, epfd_script, &epfd_live, error) ||
+	    s32_arg(args, 1, &op, error) ||
+	    s32_arg(args, 2, &script_fd, error) ||
+	    to_live_fd(state, script_fd, &live_fd, error))
+		return STATUS_ERR;
+
+	event_expr = get_arg(args, 3, error);
+	if (get_epoll_event_from_expr(state, event_expr, &live_event,
+				      &data_type, script_fd, live_fd, error))
+		return STATUS_ERR;
+
+	begin_syscall(state, syscall);
+
+	if (state->so_instance)
+		result = state->so_instance->ifc.epoll_ctl(
+				state->so_instance->ifc.userdata,
+				epfd_live, op, live_fd, &live_event);
+	else
+		result = epoll_ctl(epfd_live, op, live_fd, &live_event);
+
+	return end_syscall(state, syscall, CHECK_EXACT, result, error) ?
+		STATUS_ERR : STATUS_OK;
+}
+
+/* Run epoll_wait() and compare the first returned event with the script.
+ *
+ * Script arguments: (epfd, event, maxevents, timeout). The script's
+ * event is converted with get_epoll_event_from_expr(); only the first
+ * element of the live event array is compared, and only when the call
+ * reported at least one event.
+ *
+ * Returns STATUS_OK when the call and its first event match the script,
+ * STATUS_ERR otherwise.
+ */
+static int syscall_epoll_wait(struct state *state, struct syscall_spec *syscall,
+			      struct expression_list *args, char **error)
+{
+	int epfd_script, epfd_live, maxevents, timeout;
+	struct expression *epollev = NULL;
+	struct epoll_event event_script = {0};
+	struct epoll_event *event_live;
+	enum epoll_data_type_t epoll_data;
+	int status = STATUS_ERR;
+	int result;
+
+	if (check_arg_count(args, 4, error))
+		return STATUS_ERR;
+	if (s32_arg(args, 0, &epfd_script, error))
+		return STATUS_ERR;
+	if (to_live_fd(state, epfd_script, &epfd_live, error))
+		return STATUS_ERR;
+	epollev = get_arg(args, 1, error);
+	if (get_epoll_event_from_expr(state, epollev, &event_script,
+				      &epoll_data, -1, -1, error))
+		return STATUS_ERR;
+	if (s32_arg(args, 2, &maxevents, error))
+		return STATUS_ERR;
+	if (s32_arg(args, 3, &timeout, error))
+		return STATUS_ERR;
+
+	/* Always allocate at least one element: a script may pass a zero or
+	 * negative maxevents on purpose, and the buffer must still be a
+	 * valid object. maxevents itself is forwarded unchanged.
+	 */
+	event_live = calloc(maxevents > 0 ? (size_t)maxevents : 1,
+			    sizeof(struct epoll_event));
+	if (!event_live) {
+		asprintf(error, "Failed to calloc %d struct epoll_event\n",
+			 maxevents);
+		goto error_out;
+	}
+
+	begin_syscall(state, syscall);
+
+	if (state->so_instance) {
+		result = state->so_instance->ifc.epoll_wait(
+				state->so_instance->ifc.userdata,
+				epfd_live, event_live, maxevents, timeout);
+	} else {
+		result = epoll_wait(epfd_live, event_live, maxevents, timeout);
+	}
+
+	if (end_syscall(state, syscall, CHECK_EXACT, result, error))
+		goto error_out;
+
+	/* With no event returned (timeout or an expected failure) there is
+	 * nothing in the buffer to compare against the script.
+	 */
+	if (result <= 0) {
+		status = STATUS_OK;
+		goto error_out;
+	}
+
+	if (event_script.events != event_live->events) {
+		asprintf(error,
+			 "epoll_event->events does not match script: "
+			 "expected: 0x%x "
+			 "actual: 0x%x\n",
+			 event_script.events, event_live->events);
+		goto error_out;
+	}
+
+	/* Compare only the data member that the script specified. */
+	switch (epoll_data) {
+	case EPOLL_DATA_PTR:
+		if (event_script.data.ptr != event_live->data.ptr) {
+			asprintf(error,
+				 "epoll_event->data does not match script: "
+				 "expected: %p "
+				 "actual: %p\n",
+				 event_script.data.ptr,
+				 event_live->data.ptr);
+			goto error_out;
+		}
+		break;
+	case EPOLL_DATA_FD:
+		if (event_script.data.fd != event_live->data.fd) {
+			asprintf(error,
+				 "epoll_event->data does not match script: "
+				 "expected: %d "
+				 "actual: %d\n",
+				 event_script.data.fd,
+				 event_live->data.fd);
+			goto error_out;
+		}
+		break;
+	case EPOLL_DATA_U32:
+		if (event_script.data.u32 != event_live->data.u32) {
+			asprintf(error,
+				 "epoll_event->data does not match script: "
+				 "expected: %u "
+				 "actual: %u\n",
+				 event_script.data.u32,
+				 event_live->data.u32);
+			goto error_out;
+		}
+		break;
+	case EPOLL_DATA_U64:
+		if (event_script.data.u64 != event_live->data.u64) {
+			/* Cast so the format matches on every ABI. */
+			asprintf(error,
+				 "epoll_event->data does not match script: "
+				 "expected: %llu "
+				 "actual: %llu\n",
+				 (unsigned long long)event_script.data.u64,
+				 (unsigned long long)event_live->data.u64);
+			goto error_out;
+		}
+		break;
+	default:
+		asprintf(error, "wrong event->data type\n");
+		goto error_out;
+	}
+
+	status = STATUS_OK;
+
+error_out:
+	free(event_live);
+	return status;
+}
+
+/* Extract the two script fds from a pipe() argument expression.
+ *
+ * pipe_expr must be a list of exactly two integer expressions; their
+ * values are stored in pipefd_script[0] and pipefd_script[1].
+ * The state argument is not used here.
+ */
+static int get_pipe_expression(struct state *state,
+			       struct expression *pipe_expr,
+			       int *pipefd_script,
+			       char **error)
+{
+	struct expression_list *node;
+	int count;
+	int idx;
+
+	if (check_type(pipe_expr, EXPR_LIST, error))
+		return STATUS_ERR;
+
+	node = pipe_expr->value.list;
+	count = list_length(node);
+	if (count != 2) {
+		asprintf(error, "%d pipe file descriptors instead of 2\n",
+			 count);
+		return STATUS_ERR;
+	}
+
+	for (idx = 0; idx < 2; idx++, node = node->next) {
+		if (check_type(node->expression, EXPR_INTEGER, error))
+			return STATUS_ERR;
+		pipefd_script[idx] = node->expression->value.num;
+	}
+
+	return STATUS_OK;
+}
+
+/* Run pipe() and, when the result is non-negative, hand both fd pairs
+ * (script and live) to run_syscall_pipe().
+ *
+ * Script argument: a two-element list naming the script's pipe fds.
+ */
+static int syscall_pipe(struct state *state, struct syscall_spec *syscall,
+			struct expression_list *args, char **error)
+{
+	int pipefd_script[2];
+	int pipefd_live[2];
+	struct expression *fds_expr;
+	int result;
+
+	if (check_arg_count(args, 1, error))
+		return STATUS_ERR;
+
+	fds_expr = get_arg(args, 0, error);
+	if (fds_expr == NULL ||
+	    get_pipe_expression(state, fds_expr, pipefd_script, error))
+		return STATUS_ERR;
+
+	begin_syscall(state, syscall);
+
+	if (state->so_instance)
+		result = state->so_instance->ifc.pipe(
+				state->so_instance->ifc.userdata,
+				pipefd_live);
+	else
+		result = pipe(pipefd_live);
+
+	if (end_syscall(state, syscall, CHECK_EXACT, result, error))
+		return STATUS_ERR;
+
+	if (result >= 0 &&
+	    run_syscall_pipe(state, pipefd_script, pipefd_live, error))
+		return STATUS_ERR;
+
+	return STATUS_OK;
+}
+
+/* Run splice(fd_in, off_in, fd_out, off_out, len, flags).
+ *
+ * Script arguments: (fd_in, off_in, fd_out, off_out, len, flags); both
+ * fds are translated to live fds.
+ *
+ * NOTE(review): the two paths treat the script's offsets differently.
+ * The so_instance path passes the ADDRESS of the decoded value, while
+ * the native path casts the decoded VALUE to a pointer (so only a
+ * script value of 0, i.e. a NULL offset, is meaningful there). This is
+ * kept as found; confirm which behavior is intended.
+ */
+static int syscall_splice(struct state *state, struct syscall_spec *syscall,
+			  struct expression_list *args, char **error)
+{
+	int fd_in_script, fd_in_live;
+	int fd_out_script, fd_out_live;
+	s64 off_in, off_out;
+	int len, flags;
+	int result;
+
+	if (check_arg_count(args, 6, error))
+		return STATUS_ERR;
+	if (s32_arg(args, 0, &fd_in_script, error))
+		return STATUS_ERR;
+	if (to_live_fd(state, fd_in_script, &fd_in_live, error))
+		return STATUS_ERR;
+	if (s64_arg(args, 1, &off_in, error))
+		return STATUS_ERR;
+	if (s32_arg(args, 2, &fd_out_script, error))
+		return STATUS_ERR;
+	if (to_live_fd(state, fd_out_script, &fd_out_live, error))
+		return STATUS_ERR;
+	if (s64_arg(args, 3, &off_out, error))
+		return STATUS_ERR;
+	if (s32_arg(args, 4, &len, error))
+		return STATUS_ERR;
+	if (s32_arg(args, 5, &flags, error))
+		return STATUS_ERR;
+
+	/* Every end_syscall() must be paired with a begin_syscall(), as in
+	 * all the other handlers in this file.
+	 */
+	begin_syscall(state, syscall);
+
+	if (state->so_instance) {
+		result = state->so_instance->ifc.splice(
+				state->so_instance->ifc.userdata,
+				fd_in_live, (loff_t *) &off_in,
+				fd_out_live, (loff_t *) &off_out,
+				len, flags);
+	} else {
+		result = splice(fd_in_live, (loff_t *) off_in, fd_out_live,
+				(loff_t *) off_out, len, flags);
+	}
+	if (end_syscall(state, syscall, CHECK_EXACT, result, error))
+		return STATUS_ERR;
+
+	return STATUS_OK;
+}
+
+/* Dispatch table mapping each supported system call name to the handler
+ * that decodes its script arguments and runs it. invoke_system_call()
+ * searches this table by name.
+ */
+struct system_call_entry {
+	const char *name;	/* name as written in the script */
+	int (*function) (struct state *state,
+			 struct syscall_spec *syscall,
+			 struct expression_list *args,
+			 char **error);	/* handler for that call */
+};
+struct system_call_entry system_call_table[] = {
+	{ .name = "socket",       .function = syscall_socket },
+	{ .name = "bind",         .function = syscall_bind },
+	{ .name = "listen",       .function = syscall_listen },
+	{ .name = "accept",       .function = syscall_accept },
+	{ .name = "connect",      .function = syscall_connect },
+	{ .name = "read",         .function = syscall_read },
+	{ .name = "readv",        .function = syscall_readv },
+	{ .name = "recv",         .function = syscall_recv },
+	{ .name = "recvfrom",     .function = syscall_recvfrom },
+	{ .name = "recvmsg",      .function = syscall_recvmsg },
+	{ .name = "write",        .function = syscall_write },
+	{ .name = "writev",       .function = syscall_writev },
+	{ .name = "send",         .function = syscall_send },
+	{ .name = "sendto",       .function = syscall_sendto },
+	{ .name = "sendmsg",      .function = syscall_sendmsg },
+	{ .name = "fcntl",        .function = syscall_fcntl },
+	{ .name = "ioctl",        .function = syscall_ioctl },
+	{ .name = "close",        .function = syscall_close },
+	{ .name = "shutdown",     .function = syscall_shutdown },
+	{ .name = "getsockopt",   .function = syscall_getsockopt },
+	{ .name = "setsockopt",   .function = syscall_setsockopt },
+	{ .name = "poll",         .function = syscall_poll },
+	{ .name = "cap_set",      .function = syscall_cap_set },
+	{ .name = "open",         .function = syscall_open },
+	{ .name = "sendfile",     .function = syscall_sendfile },
+	{ .name = "epoll_create", .function = syscall_epoll_create },
+	{ .name = "epoll_ctl",    .function = syscall_epoll_ctl },
+	{ .name = "epoll_wait",   .function = syscall_epoll_wait },
+	{ .name = "pipe",         .function = syscall_pipe },
+	{ .name = "splice",       .function = syscall_splice },
+};
+
+/* Evaluate the system call arguments and invoke the system call.
+ *
+ * Waits for the event's scheduled time, looks the call up by name in
+ * system_call_table, evaluates the script's argument expressions and
+ * runs the handler. Any failure is reported through die() together
+ * with the script path and line number.
+ */
+static void invoke_system_call(
+	struct state *state, struct event *event, struct syscall_spec *syscall)
+{
+	DEBUGP("%d: invoke call: %s\n", event->line_number, syscall->name);
+
+	const char *name = syscall->name;
+	struct expression_list *args = NULL;
+	char *error = NULL;
+	int idx;
+	int result;
+
+	/* Wait for the right time before firing off this event. */
+	wait_for_event(state);
+
+	/* Find the handler for this system call. */
+	idx = 0;
+	while (idx < ARRAY_SIZE(system_call_table) &&
+	       strcmp(name, system_call_table[idx].name) != 0)
+		++idx;
+
+	if (idx == ARRAY_SIZE(system_call_table)) {
+		asprintf(&error, "Unknown system call: '%s'", name);
+		goto error_out;
+	}
+
+	/* Evaluate script symbolic expressions to get live numeric args for
+	 * system calls.
+	 */
+	if (evaluate_expression_list(syscall->arguments, &args, &error))
+		goto error_out;
+
+	/* Run the system call, then drop the evaluated arguments. */
+	result = system_call_table[idx].function(state, syscall, args, &error);
+	free_expression_list(args);
+
+	if (result != STATUS_ERR)
+		return;
+
+error_out:
+	die("%s:%d: runtime error in %s call: %s\n",
+	    state->config->script_path, event->line_number,
+	    syscall->name, error);
+	free(error);
+}
+
+/* Wait for the system call thread to go idle. To avoid mystifying
+ * hangs when scripts specify overlapping time ranges for blocking
+ * system calls, we limit the duration of our waiting to 1 second.
+ *
+ * Returns STATUS_OK once the thread state is SYSCALL_IDLE, or
+ * STATUS_ERR if the wait timed out first.
+ */
+static int await_idle_thread(struct state *state)
+{
+	const int MAX_WAIT_SECS = 1;
+	/* tv_sec == 0 doubles as "deadline not computed yet". */
+	struct timespec deadline = { .tv_sec = 0, .tv_nsec = 0 };
+
+	while (state->syscalls->state != SYSCALL_IDLE) {
+		int rc;
+
+		/* Compute the deadline once, on the first iteration. */
+		if (deadline.tv_sec == 0) {
+			if (clock_gettime(CLOCK_REALTIME, &deadline) != 0)
+				die_perror("clock_gettime");
+			deadline.tv_sec += MAX_WAIT_SECS;
+		}
+
+		/* Sleep until signaled or until the deadline passes. */
+		DEBUGP("main thread: awaiting idle syscall thread\n");
+		rc = pthread_cond_timedwait(&state->syscalls->idle,
+					    &state->mutex, &deadline);
+		if (rc == ETIMEDOUT)
+			return STATUS_ERR;
+		if (rc != 0)
+			die_perror("pthread_cond_timedwait");
+	}
+
+	return STATUS_OK;
+}
+
+/* Give up the CPU so another runnable thread can be scheduled.
+ *
+ * Returns 0 on success and non-zero on failure. Platforms without a
+ * dedicated branch fall back to POSIX sched_yield(), so the function
+ * returns a value on every build configuration.
+ */
+static int yield(void)
+{
+#if defined(linux) || defined(__linux__)
+	return pthread_yield();
+#elif defined(__FreeBSD__) || defined(__OpenBSD__)
+	/* pthread_yield() has no return value to propagate here. */
+	pthread_yield();
+	return 0;
+#else	/* NetBSD and any other POSIX system */
+	return sched_yield();
+#endif
+}
+
+/* Enqueue the system call for the syscall thread and wake up the thread.
+ *
+ * Used for blocking system calls (see run_system_call_event()). The
+ * sequence is:
+ *   1. wait (bounded by await_idle_thread()) for the syscall thread to
+ *      be idle; a timeout is a fatal script error;
+ *   2. mark the call SYSCALL_ENQUEUED and signal the "enqueued"
+ *      condition variable;
+ *   3. wait on the "dequeued" condition variable until the state is no
+ *      longer SYSCALL_ENQUEUED;
+ *   4. repeatedly unlock, yield and re-lock until either the syscall
+ *      thread is seen sleeping or its state is back to SYSCALL_IDLE.
+ *
+ * Does not return on error: failures end in die() / die_perror().
+ */
+static void enqueue_system_call(
+ struct state *state, struct event *event, struct syscall_spec *syscall)
+{
+ char *error = NULL;
+ bool done = false;
+
+ /* Wait if there are back-to-back blocking system calls. */
+ if (await_idle_thread(state)) {
+ asprintf(&error, "blocking system call while another blocking "
+ "system call is already in progress");
+ goto error_out;
+ }
+
+ /* Enqueue the system call info and wake up the syscall thread. */
+ DEBUGP("main thread: signal enqueued\n");
+ state->syscalls->state = SYSCALL_ENQUEUED;
+ if (pthread_cond_signal(&state->syscalls->enqueued) != 0)
+ die_perror("pthread_cond_signal");
+
+ /* Wait for the syscall thread to dequeue and start the system call. */
+ while (state->syscalls->state == SYSCALL_ENQUEUED) {
+ DEBUGP("main thread: waiting for dequeued signal; "
+ "state: %d\n", state->syscalls->state);
+ if (pthread_cond_wait(&state->syscalls->dequeued,
+ &state->mutex) != 0) {
+ die_perror("pthread_cond_wait");
+ }
+ }
+
+ /* Wait for the syscall thread to block or finish the call. */
+ while (!done) {
+ /* Unlock and yield so the system call thread can make
+ * the system call in a timely fashion.
+ */
+ DEBUGP("main thread: unlocking and yielding\n");
+ /* Read the thread id before run_unlock() releases the lock. */
+ pid_t thread_id = state->syscalls->thread_id;
+ run_unlock(state);
+ if (yield() != 0)
+ die_perror("yield");
+
+ /* First exit condition: the syscall thread is sleeping. */
+ DEBUGP("main thread: checking syscall thread state\n");
+ if (is_thread_sleeping(getpid(), thread_id))
+ done = true;
+
+ /* Grab the lock again and see if the thread is idle. */
+ DEBUGP("main thread: locking and reading state\n");
+ run_lock(state);
+ /* Second exit condition: the call has already finished. */
+ if (state->syscalls->state == SYSCALL_IDLE)
+ done = true;
+ }
+ DEBUGP("main thread: continuing after syscall\n");
+ return;
+
+error_out:
+ die("%s:%d: runtime error in %s call: %s\n",
+ state->config->script_path, event->line_number,
+ syscall->name, error);
+ /* NOTE(review): presumably unreachable if die() never returns. */
+ free(error);
+}
+
+/* Execute one system call event from the script.
+ *
+ * Blocking calls are handed to the syscall thread. Non-blocking calls
+ * run on the calling thread, after first giving the syscall thread a
+ * bounded chance to go idle (the outcome of that wait is not checked).
+ */
+void run_system_call_event(
+	struct state *state, struct event *event, struct syscall_spec *syscall)
+{
+	DEBUGP("%d: system call: %s\n", event->line_number, syscall->name);
+
+	if (!is_blocking_syscall(syscall)) {
+		await_idle_thread(state);
+		invoke_system_call(state, event, syscall);
+		return;
+	}
+
+	enqueue_system_call(state, event, syscall);
+}
+
+/* The code executed by our system call thread, which executes
+ * blocking system calls.
+ *
+ * arg is the struct state shared with the main thread. The thread takes
+ * the global lock, records its kernel thread id, and then loops on
+ * state->syscalls->state:
+ *   SYSCALL_IDLE     - sleep on the "enqueued" condition variable;
+ *   SYSCALL_ENQUEUED - run the system call of the current event, verify
+ *                      its end time, go back to SYSCALL_IDLE and signal
+ *                      the "idle" condition variable;
+ *   SYSCALL_EXITING  - leave the loop, unlock and return.
+ * SYSCALL_RUNNING and SYSCALL_DONE must never be observed at the top of
+ * the loop.
+ *
+ * Always returns NULL.
+ */
+static void *system_call_thread(void *arg)
+{
+ struct state *state = (struct state *)arg;
+ char *error = NULL;
+ struct event *event = NULL;
+ struct syscall_spec *syscall = NULL;
+ bool done = false;
+
+ DEBUGP("syscall thread: starting and locking\n");
+ run_lock(state);
+
+ /* Published so the main thread can check whether we are sleeping. */
+ state->syscalls->thread_id = gettid();
+ if (state->syscalls->thread_id < 0)
+ die_perror("gettid");
+
+ while (!done) {
+ DEBUGP("syscall thread: in state %d\n",
+ state->syscalls->state);
+
+ switch (state->syscalls->state) {
+ case SYSCALL_IDLE:
+ DEBUGP("syscall thread: waiting\n");
+ if (pthread_cond_wait(&state->syscalls->enqueued,
+ &state->mutex)) {
+ die_perror("pthread_cond_wait");
+ }
+ break;
+
+ case SYSCALL_RUNNING:
+ case SYSCALL_DONE:
+ assert(0); /* should not be reached */
+ break;
+
+ case SYSCALL_ENQUEUED:
+ DEBUGP("syscall thread: invoking syscall\n");
+ /* Remember the syscall event, since below we
+ * release the global lock and the main thread
+ * will move on to other, later events.
+ */
+ event = state->event;
+ syscall = event->event.syscall;
+ assert(event->type == SYSCALL_EVENT);
+ state->syscalls->event = event;
+ /* -1 marks "no end time recorded yet"; checked below. */
+ state->syscalls->live_end_usecs = -1;
+
+ /* Make the system call. Note that our callees
+ * here will release the global lock before
+ * making the actual system call and then
+ * re-acquire it after the system call returns
+ * and before returning to us.
+ */
+ invoke_system_call(state, event, syscall);
+
+ /* Check end time for the blocking system call. */
+ assert(state->syscalls->live_end_usecs >= 0);
+ if (verify_time(state,
+ event->time_type,
+ syscall->end_usecs, 0,
+ state->syscalls->live_end_usecs,
+ "system call return", &error)) {
+ die("%s:%d: %s\n",
+ state->config->script_path,
+ event->line_number,
+ error);
+ }
+
+ /* Mark our thread idle and wake the main
+ * thread if it's waiting for this call to
+ * finish.
+ */
+ assert(state->syscalls->state == SYSCALL_DONE);
+ state->syscalls->state = SYSCALL_IDLE;
+ state->syscalls->event = NULL;
+ state->syscalls->live_end_usecs = -1;
+ DEBUGP("syscall thread: now idle\n");
+ if (pthread_cond_signal(&state->syscalls->idle) != 0)
+ die_perror("pthread_cond_signal");
+ break;
+
+ case SYSCALL_EXITING:
+ done = true;
+ break;
+ /* omitting default so compiler will catch missing cases */
+ }
+ }
+ DEBUGP("syscall thread: unlocking and exiting\n");
+ run_unlock(state);
+
+ return NULL;
+}
+
+/* Allocate the system call module state and start the syscall thread.
+ *
+ * The returned object starts in SYSCALL_IDLE with its three condition
+ * variables initialized and its thread running system_call_thread().
+ * Never returns NULL: allocation, condition-variable or thread-creation
+ * failures are fatal. Release with syscalls_free().
+ */
+struct syscalls *syscalls_new(struct state *state)
+{
+	struct syscalls *syscalls = calloc(1, sizeof(*syscalls));
+
+	if (syscalls == NULL)
+		die_perror("calloc");
+
+	syscalls->state = SYSCALL_IDLE;
+
+	/* Set up the condition variables before the thread exists, so the
+	 * thread can never wait on one that has not been initialized.
+	 */
+	if ((pthread_cond_init(&syscalls->idle, NULL) != 0) ||
+	    (pthread_cond_init(&syscalls->enqueued, NULL) != 0) ||
+	    (pthread_cond_init(&syscalls->dequeued, NULL) != 0)) {
+		die_perror("pthread_cond_init");
+	}
+
+	if (pthread_create(&syscalls->thread, NULL, system_call_thread,
+			   state) != 0) {
+		die_perror("pthread_create");
+	}
+
+	return syscalls;
+}
+
+/* Stop the syscall thread and release the system call module state.
+ *
+ * Waits (bounded) for the thread to go idle; a blocking call that is
+ * still in progress is a fatal error. Then asks the thread to exit,
+ * drops the lock while joining it, re-takes the lock, destroys the
+ * condition variables and frees the object.
+ */
+void syscalls_free(struct state *state, struct syscalls *syscalls)
+{
+	/* Wait a bit for the thread to go idle. */
+	if (await_idle_thread(state)) {
+		die("%s:%d: runtime error: exiting while "
+		    "a blocking system call is in progress\n",
+		    state->config->script_path,
+		    syscalls->event->line_number);
+	}
+
+	/* Send a request to terminate the thread. */
+	DEBUGP("main thread: signaling syscall thread to exit\n");
+	syscalls->state = SYSCALL_EXITING;
+	if (pthread_cond_signal(&syscalls->enqueued) != 0)
+		die_perror("pthread_cond_signal");
+
+	/* Release the lock briefly and wait for syscall thread to finish. */
+	run_unlock(state);
+	DEBUGP("main thread: unlocking, waiting for syscall thread exit\n");
+	void *thread_result = NULL;
+	/* Name the call that actually failed in the diagnostic. */
+	if (pthread_join(syscalls->thread, &thread_result) != 0)
+		die_perror("pthread_join");
+	DEBUGP("main thread: joined syscall thread; relocking\n");
+	run_lock(state);
+
+	if ((pthread_cond_destroy(&syscalls->idle) != 0) ||
+	    (pthread_cond_destroy(&syscalls->enqueued) != 0) ||
+	    (pthread_cond_destroy(&syscalls->dequeued) != 0)) {
+		die_perror("pthread_cond_destroy");
+	}
+
+	memset(syscalls, 0, sizeof(*syscalls));  /* to help catch bugs */
+	free(syscalls);
+}
diff --git a/test/packetdrill/run_system_call.h b/test/packetdrill/run_system_call.h
new file mode 100644
index 0000000..03a3d27
--- /dev/null
+++ b/test/packetdrill/run_system_call.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Interface for a module to execute a system call from a test script.
+ */
+
+#ifndef __RUN_SYSTEM_CALL_H__
+#define __RUN_SYSTEM_CALL_H__
+
+#include "types.h"
+
+#include <pthread.h>
+#include "script.h"
+
+struct state;
+
+/* States in which the system call thread can be.
+ * syscalls_new() starts the module in SYSCALL_IDLE; syscalls_free()
+ * sets SYSCALL_EXITING to ask the thread to terminate.
+ */
+enum syscall_state_t {
+ SYSCALL_IDLE, /* system call thread is idle */
+ SYSCALL_ENQUEUED, /* blocking system call is ready to execute */
+ SYSCALL_RUNNING, /* system call is running */
+ SYSCALL_DONE, /* system call is done running */
+ SYSCALL_EXITING, /* process is exiting */
+};
+
+/* Internal state for the system call module, including the "syscall
+ * thread", which handles blocking system calls.
+ *
+ * Allocated zero-filled by syscalls_new() and released by
+ * syscalls_free(), which also destroys the three condition variables.
+ */
+struct syscalls {
+ enum syscall_state_t state; /* current state of syscall thread */
+ /* syscalls_free() reads event->line_number to report a blocking
+  * call that is still in progress at exit.
+  */
+ struct event *event; /* current system call it's running */
+ s64 live_end_usecs; /* time of last system call return */
+
+ /* Handles for the syscall thread, for blocking system calls. */
+ pthread_t thread; /* pthread thread handle */
+ pid_t thread_id; /* kernel thread ID */
+
+ /* The main thread waits on this condition variable. The
+ * system call thread signals this when it has finished
+ * executing a blocking system call and is now idle and ready
+ * to execute another blocking system call.
+ */
+ pthread_cond_t idle;
+
+ /* The system call thread waits on this condition
+ * variable. The main thread signals this when it has enqueued
+ * a blocking system call to execute, and thus the system call
+ * thread should wake up and execute that system call. The
+ * main thread also signals this when it's time to exit.
+ */
+ pthread_cond_t enqueued;
+
+ /* The main thread waits on this condition variable. The
+ * system call thread signals this after it has dequeued the
+ * system call and just before it invokes the system call, at
+ * which point the main thread should wake up to continue test
+ * execution.
+ */
+ pthread_cond_t dequeued;
+};
+
+/* Info for a nla type: a name paired with a length.
+ * NOTE(review): "nla" presumably means netlink attribute, and 'length'
+ * presumably the payload size in bytes -- confirm against the users of
+ * this struct, which are not visible in this header.
+ */
+struct nla_type_info {
+ const char* name;
+ int length;
+};
+
+/* Allocate and return internal state for the system call module. */
+extern struct syscalls *syscalls_new(struct state *state);
+
+/* Tear down a syscalls and free up the resources it has allocated. */
+extern void syscalls_free(struct state *state,
+ struct syscalls *syscalls);
+
+/* Execute the given system call event. The system call may be
+ * expected to block for a while, or it may be expected to return
+ * immediately. To keep things simple, currently we only support
+ * at most one blocking system call at a time; if a script attempts to
+ * start a second blocking call before the first blocking call has
+ * returned then this second call raises a runtime error.
+ */
+void run_system_call_event(struct state *state,
+ struct event *event,
+ struct syscall_spec *syscall);
+
+#endif /* __RUN_SYSTEM_CALL_H__ */
diff --git a/test/packetdrill/script.c b/test/packetdrill/script.c
new file mode 100644
index 0000000..253e9f7
--- /dev/null
+++ b/test/packetdrill/script.c
@@ -0,0 +1,745 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Implementation of functions to help interpret a test script.
+ */
+
+#include "script.h"
+
+#include <poll.h>
+#include <stdlib.h>
+
+#include "assert.h"
+#include "symbols.h"
+#include "gre.h"
+
+/* Fill in a value representing the given expression in
+ * fully-evaluated form (e.g. symbols resolved to ints). On success,
+ * returns STATUS_OK. On error return STATUS_ERR and fill in *error.
+ */
+static int evaluate(struct expression *in,
+ struct expression **out_ptr, char **error);
+
+/* Initialize script object */
+/* Reset 'script' to an empty state: every byte is cleared and the
+ * option list, init command and event list heads are explicitly NULL.
+ */
+void init_script(struct script *script)
+{
+	memset(script, 0, sizeof(struct script));
+
+	/* Spell out the empty lists rather than rely on the zero fill. */
+	script->event_list = NULL;
+	script->init_command = NULL;
+	script->option_list = NULL;
+}
+
+/* This table maps expression types to human-readable strings.
+ * Rows are looked up by their 'type' field, not by index, so row order
+ * does not need to follow the enum order. The last row (NULL name) is
+ * the end-of-table sentinel; expression_type_to_string() asserts that
+ * the table has exactly NUM_EXPR_TYPES + 1 rows.
+ */
+struct expression_type_entry {
+ enum expression_t type;
+ const char *name;
+};
+struct expression_type_entry expression_type_table[] = {
+ { EXPR_ELLIPSIS, "ellipsis" },
+ { EXPR_INTEGER, "integer" },
+ { EXPR_WORD, "word" },
+ { EXPR_STRING, "string" },
+ { EXPR_GRE, "gre" },
+ { EXPR_IN6_ADDR, "in6_addr" },
+ { EXPR_SOCKET_ADDRESS_IPV4, "sockaddr_in" },
+ { EXPR_SOCKET_ADDRESS_IPV6, "sockaddr_in6" },
+ { EXPR_LINGER, "linger" },
+ { EXPR_BINARY, "binary_expression" },
+ { EXPR_LIST, "list" },
+ { EXPR_IOVEC, "iovec" },
+ { EXPR_MSGHDR, "msghdr" },
+ { EXPR_CMSG, "cmsg" },
+ { EXPR_POLLFD, "pollfd" },
+ { EXPR_MPLS_STACK, "mpls_stack" },
+ { EXPR_SCM_TIMESTAMPING, "scm_timestamping"},
+ { EXPR_SOCK_EXTENDED_ERR, "sock_extended_err"},
+ { EXPR_EPOLLEV, "epollev" },
+ {-1, NULL}
+};
+
+/* Return the human-readable name of an expression type, found by
+ * scanning expression_type_table, or "UNKNOWN_TYPE" if no row matches.
+ */
+const char *expression_type_to_string(enum expression_t type)
+{
+	struct expression_type_entry *entry = expression_type_table;
+
+	/* One row per expression type, plus the sentinel row. */
+	assert(ARRAY_SIZE(expression_type_table) == NUM_EXPR_TYPES + 1);
+
+	while (entry->name != NULL) {
+		if (entry->type == type)
+			return entry->name;
+		++entry;
+	}
+	return "UNKNOWN_TYPE";
+}
+
+/* Cross-platform symbols.
+ * symbol_to_int() searches this table first and only then falls back to
+ * platform_symbols(). Lookup is by exact string match on the name.
+ */
+struct int_symbol cross_platform_symbols[] = {
+ { AF_INET, "AF_INET" },
+ { AF_INET6, "AF_INET6" },
+
+ { PF_INET, "PF_INET" },
+ { PF_INET6, "PF_INET6" },
+
+ { SOCK_STREAM, "SOCK_STREAM" },
+ { SOCK_DGRAM, "SOCK_DGRAM" },
+ { SOCK_NONBLOCK, "SOCK_NONBLOCK" },
+
+ { IPPROTO_IP, "IPPROTO_IP" },
+ { IPPROTO_IPV6, "IPPROTO_IPV6" },
+ { IPPROTO_ICMP, "IPPROTO_ICMP" },
+ { IPPROTO_ICMPV6, "IPPROTO_ICMPV6" },
+ { IPPROTO_TCP, "IPPROTO_TCP" },
+ { IPPROTO_UDP, "IPPROTO_UDP" },
+
+ { SHUT_RD, "SHUT_RD" },
+ { SHUT_WR, "SHUT_WR" },
+ { SHUT_RDWR, "SHUT_RDWR" },
+
+ { SOL_SOCKET, "SOL_SOCKET" },
+
+ /* The word NULL in a script evaluates to the integer 0. */
+ { 0, "NULL" },
+
+ /* Sentinel marking the end of the table. */
+ { 0, NULL },
+};
+
+/* Search the NULL-name-terminated table 'symbols' for an entry whose
+ * name equals 'input_symbol'. On a hit, store the entry's value in
+ * *output_integer and return true; otherwise return false and leave
+ * *output_integer untouched.
+ */
+static bool lookup_int_symbol(const char *input_symbol, s64 *output_integer,
+			      struct int_symbol *symbols)
+{
+	struct int_symbol *cur;
+
+	for (cur = symbols; cur->name != NULL; cur++) {
+		if (strcmp(input_symbol, cur->name) != 0)
+			continue;
+		*output_integer = cur->value;
+		return true;
+	}
+	return false;
+}
+
+/* Resolve a symbolic name to its integer value. The cross-platform
+ * table is consulted first; platform_symbols() is only queried when
+ * that search misses. Returns STATUS_OK with *output_integer filled in,
+ * or STATUS_ERR with a malloc-allocated message in *error.
+ */
+int symbol_to_int(const char *input_symbol, s64 *output_integer,
+		  char **error)
+{
+	bool found = lookup_int_symbol(input_symbol, output_integer,
+				       cross_platform_symbols);
+
+	if (!found)
+		found = lookup_int_symbol(input_symbol, output_integer,
+					  platform_symbols());
+
+	if (!found) {
+		asprintf(error, "unknown symbol: '%s'", input_symbol);
+		return STATUS_ERR;
+	}
+	return STATUS_OK;
+}
+
+/* Names for the events and revents bit mask flags for poll() system call.
+ * Flags that not every platform defines are guarded by #ifdef.
+ * The table ends with a NULL name, as required of every flag_name table,
+ * so that a lookup of an unlisted bit stops at the terminator instead of
+ * running off the end of the array.
+ */
+struct flag_name poll_flags[] = {
+
+	{ POLLIN,	"POLLIN" },
+	{ POLLPRI,	"POLLPRI" },
+	{ POLLOUT,	"POLLOUT" },
+
+#ifdef POLLRDNORM
+	{ POLLRDNORM,	"POLLRDNORM" },
+#endif
+#ifdef POLLRDBAND
+	{ POLLRDBAND,	"POLLRDBAND" },
+#endif
+#ifdef POLLWRNORM
+	{ POLLWRNORM,	"POLLWRNORM" },
+#endif
+#ifdef POLLWRBAND
+	{ POLLWRBAND,	"POLLWRBAND" },
+#endif
+
+#ifdef POLLMSG
+	{ POLLMSG,	"POLLMSG" },
+#endif
+#ifdef POLLREMOVE
+	{ POLLREMOVE,	"POLLREMOVE" },
+#endif
+#ifdef POLLRDHUP
+	{ POLLRDHUP,	"POLLRDHUP" },
+#endif
+
+#ifdef POLLINIGNEOF
+	{ POLLINIGNEOF,	"POLLINIGNEOF" },
+#endif
+
+	{ POLLERR,	"POLLERR" },
+	{ POLLHUP,	"POLLHUP" },
+	{ POLLNVAL,	"POLLNVAL" },
+
+	/* Sentinel marking the end of the table. */
+	{ 0, NULL },
+};
+
+/* Return the human-readable ASCII string corresponding to a given
+ * flag value, or "???" if none matches.
+ *
+ * The scan stops at the first entry whose name is NULL or empty: either
+ * form is accepted as the end-of-table marker, so an unlisted flag can
+ * never make the loop read past the end of the table, and the marker's
+ * own name is never returned.
+ */
+static const char *flag_name(struct flag_name *flags_array, u64 flag)
+{
+	for (; flags_array->name != NULL && flags_array->name[0] != '\0';
+	     flags_array++) {
+		if (flags_array->flag == flag)
+			return flags_array->name;
+	}
+	return "???";
+}
+
+/* Render the set bits of 'flags' as a '|'-separated list of names taken
+ * from 'flags_array', lowest bit first. Bits without a table entry show
+ * up as whatever flag_name() returns for them. The result is
+ * heap-allocated (an empty string when no bit is set) and must be
+ * free()d by the caller.
+ */
+char *flags_to_string(struct flag_name *flags_array, u64 flags)
+{
+	char *out = strdup("");
+	int bit;
+
+	for (bit = 0; bit < 64; bit++) {
+		const u64 bit_mask = (u64)1 << bit;
+		char *joined = NULL;
+
+		if ((flags & bit_mask) == 0)
+			continue;
+
+		/* Append "|NAME", or just "NAME" for the first hit. */
+		asprintf(&joined, "%s%s%s",
+			 out,
+			 out[0] ? "|" : "",
+			 flag_name(flags_array, bit_mask));
+		free(out);
+		out = joined;
+	}
+	return out;
+}
+
+/* Fill in 'out' with an unescaped version of the input string. On
+ * success, return STATUS_OK; on error, return STATUS_ERR and store
+ * an error message in *error.
+ *
+ * Supported escapes: \\ \" \f \n \r \t \v and \xhh (exactly two hex
+ * digits). Any other character after a backslash is an error. The
+ * output buffer is sized from the input length, which is always enough
+ * because every escape sequence shrinks to a single byte. On error
+ * 'out' keeps its partially filled string, which the caller can release
+ * with free_expression().
+ */
+static int unescape_cstring_expression(const char *input_string,
+				       struct expression *out, char **error)
+{
+	int bytes = strlen(input_string);
+	out->type = EXPR_STRING;
+	out->value.string = (char *)calloc(1, bytes + 1);
+	const char *c_in = input_string;
+	char *c_out = out->value.string;
+	while (*c_in != '\0') {
+		if (*c_in == '\\') {
+			++c_in;
+			switch (*c_in) {
+			case '\\':
+				*c_out = '\\';
+				break;	/* must not fall through to '"' */
+			case '"':
+				*c_out = '"';
+				break;	/* must not fall through to 'f' */
+			case 'f':
+				*c_out = '\f';
+				break;
+			case 'n':
+				*c_out = '\n';
+				break;
+			case 'r':
+				*c_out = '\r';
+				break;
+			case 't':
+				*c_out = '\t';
+				break;
+			case 'v':
+				*c_out = '\v';
+				break;
+			case 'x': {
+				++c_in;
+				if (strlen(c_in) >= 2) {
+					char s[] = { c_in[0], c_in[1], 0 };
+					char *end = NULL;
+
+					*c_out = strtol(s, &end, 16);
+					if (s[0] != '\0' && *end == '\0') {
+						/* Skip the first hex digit;
+						 * the shared ++c_in below
+						 * skips the second.
+						 */
+						++c_in;
+						break;
+					}
+				}
+				asprintf(error,
+					 "invalid hex escape (\\xhh): '\\x%s'",
+					 c_in);
+				return STATUS_ERR;
+			}
+			default:
+				asprintf(error, "unsupported escape code: '%c'",
+					 *c_in);
+				return STATUS_ERR;
+			}
+		} else {
+			*c_out = *c_in;
+		}
+		++c_in;
+		++c_out;
+	}
+	return STATUS_OK;
+}
+
+/* Deep-free a heap-allocated expression: its sub-expressions, any
+ * heap-allocated payload hanging off 'value', and the expression node
+ * itself. A NULL argument is a no-op.
+ *
+ * For the struct-tree types (iovec, msghdr, cmsg, pollfd, epollev) the
+ * container struct is released along with its members, matching what is
+ * done for binary and sock_extended_err expressions; otherwise every
+ * such container would leak.
+ */
+void free_expression(struct expression *expression)
+{
+	if (expression == NULL)
+		return;
+	if (expression->type >= NUM_EXPR_TYPES)
+		assert(!"bad expression type");
+	switch (expression->type) {
+	/* These types keep their payload inline in the union. */
+	case EXPR_ELLIPSIS:
+	case EXPR_INTEGER:
+	case EXPR_GRE:
+	case EXPR_IN6_ADDR:
+	case EXPR_LINGER:
+		break;
+	case EXPR_WORD:
+		assert(expression->value.string);
+		free(expression->value.string);
+		break;
+	case EXPR_STRING:
+		assert(expression->value.string);
+		free(expression->value.string);
+		break;
+	case EXPR_SOCKET_ADDRESS_IPV4:
+		assert(expression->value.socket_address_ipv4);
+		free(expression->value.socket_address_ipv4);
+		break;
+	case EXPR_SOCKET_ADDRESS_IPV6:
+		assert(expression->value.socket_address_ipv6);
+		free(expression->value.socket_address_ipv6);
+		break;
+	case EXPR_BINARY:
+		assert(expression->value.binary);
+		free(expression->value.binary->op);
+		free_expression(expression->value.binary->lhs);
+		free_expression(expression->value.binary->rhs);
+		free(expression->value.binary);
+		break;
+	case EXPR_LIST:
+		free_expression_list(expression->value.list);
+		break;
+	case EXPR_IOVEC:
+		assert(expression->value.iovec);
+		free_expression(expression->value.iovec->iov_base);
+		free_expression(expression->value.iovec->iov_len);
+		free(expression->value.iovec);
+		break;
+	case EXPR_MSGHDR:
+		assert(expression->value.msghdr);
+		free_expression(expression->value.msghdr->msg_name);
+		free_expression(expression->value.msghdr->msg_namelen);
+		free_expression(expression->value.msghdr->msg_iov);
+		free_expression(expression->value.msghdr->msg_iovlen);
+		free_expression(expression->value.msghdr->msg_control);
+		free_expression(expression->value.msghdr->msg_flags);
+		free(expression->value.msghdr);
+		break;
+	case EXPR_CMSG:
+		assert(expression->value.cmsg);
+		free_expression(expression->value.cmsg->cmsg_level);
+		free_expression(expression->value.cmsg->cmsg_type);
+		free_expression(expression->value.cmsg->cmsg_data);
+		free(expression->value.cmsg);
+		break;
+	case EXPR_POLLFD:
+		assert(expression->value.pollfd);
+		free_expression(expression->value.pollfd->fd);
+		free_expression(expression->value.pollfd->events);
+		free_expression(expression->value.pollfd->revents);
+		free(expression->value.pollfd);
+		break;
+	case EXPR_SCM_TIMESTAMPING:
+		assert(expression->value.scm_timestamping);
+		free(expression->value.scm_timestamping);
+		break;
+	case EXPR_SOCK_EXTENDED_ERR:
+		assert(expression->value.sock_extended_err);
+		free_expression(expression->value.sock_extended_err->ee_errno);
+		free_expression(expression->value.sock_extended_err->ee_origin);
+		free_expression(expression->value.sock_extended_err->ee_type);
+		free_expression(expression->value.sock_extended_err->ee_code);
+		free_expression(expression->value.sock_extended_err->ee_info);
+		free_expression(expression->value.sock_extended_err->ee_data);
+		free(expression->value.sock_extended_err);
+		break;
+	case EXPR_MPLS_STACK:
+		assert(expression->value.mpls_stack);
+		free(expression->value.mpls_stack);
+		break;
+	case NUM_EXPR_TYPES:
+		break;
+	case EXPR_EPOLLEV:
+		assert(expression->value.epollev);
+		/* Unused data members are NULL, which is a no-op here. */
+		free_expression(expression->value.epollev->events);
+		free_expression(expression->value.epollev->ptr);
+		free_expression(expression->value.epollev->fd);
+		free_expression(expression->value.epollev->u32);
+		free_expression(expression->value.epollev->u64);
+		free(expression->value.epollev);
+		break;
+
+	/* missing default case so compiler catches missing cases */
+	}
+	memset(expression, 0, sizeof(*expression));  /* paranoia */
+	free(expression);
+}
+
+/* Deep-free every node of an expression list, together with the
+ * expression each node holds. A NULL list is a no-op.
+ */
+void free_expression_list(struct expression_list *list)
+{
+	struct expression_list *next;
+
+	for (; list != NULL; list = next) {
+		/* Grab the link before the node itself is released. */
+		next = list->next;
+		free_expression(list->expression);
+		free(list);
+	}
+}
+
+/* Evaluate a binary expression into 'out'.
+ *
+ * Both operands are evaluated first. For "|" the operands must be
+ * integers and 'out' becomes their bitwise OR (EXPR_INTEGER); the
+ * evaluated operands are then freed. For "=" 'out' becomes a new
+ * EXPR_BINARY that takes ownership of the evaluated operands. Any other
+ * operator, or a failing operand, yields STATUS_ERR with *error set.
+ */
+static int evaluate_binary_expression(struct expression *in,
+				      struct expression *out, char **error)
+{
+	struct expression *left = NULL;
+	struct expression *right = NULL;
+	const char *op;
+	int status = STATUS_ERR;
+
+	assert(in->type == EXPR_BINARY);
+	assert(in->value.binary);
+	out->type = EXPR_INTEGER;
+
+	/* The right operand is only evaluated if the left one succeeded. */
+	if (evaluate(in->value.binary->lhs, &left, error) ||
+	    evaluate(in->value.binary->rhs, &right, error))
+		goto cleanup;
+
+	op = in->value.binary->op;
+
+	if (strcmp("=", op) == 0) {
+		struct binary_expression *copy =
+			calloc(1, sizeof(struct binary_expression));
+
+		/* The copy owns 'left' and 'right': skip the cleanup. */
+		copy->op = strdup(op);
+		copy->lhs = left;
+		copy->rhs = right;
+		out->value.binary = copy;
+		out->type = EXPR_BINARY;
+		return STATUS_OK;
+	}
+
+	if (strcmp("|", op) != 0) {
+		asprintf(error, "bad binary operator '%s'", op);
+	} else if (left->type != EXPR_INTEGER) {
+		asprintf(error, "left hand side of | not an integer");
+	} else if (right->type != EXPR_INTEGER) {
+		asprintf(error, "right hand side of | not an integer");
+	} else {
+		out->value.num = left->value.num | right->value.num;
+		status = STATUS_OK;
+	}
+
+cleanup:
+	free_expression(right);
+	free_expression(left);
+	return status;
+}
+
+/* Evaluate a list expression: 'out' receives an evaluated copy of the
+ * list held by 'in'. The result is that of evaluate_expression_list().
+ */
+static int evaluate_list_expression(struct expression *in,
+				    struct expression *out, char **error)
+{
+	struct expression_list **dst = &out->value.list;
+
+	assert(in->type == EXPR_LIST);
+	assert(out->type == EXPR_LIST);
+
+	*dst = NULL;
+	return evaluate_expression_list(in->value.list, dst, error);
+}
+
+/* Evaluate an iovec expression: allocate a fresh iovec_expr for 'out'
+ * and evaluate iov_base, then iov_len, into it. Stops at the first
+ * failure and returns STATUS_ERR; the partial result stays attached to
+ * 'out'.
+ */
+static int evaluate_iovec_expression(struct expression *in,
+				     struct expression *out, char **error)
+{
+	struct iovec_expr *src;
+	struct iovec_expr *dst;
+
+	assert(in->type == EXPR_IOVEC);
+	assert(in->value.iovec);
+	assert(out->type == EXPR_IOVEC);
+
+	dst = calloc(1, sizeof(struct iovec_expr));
+	out->value.iovec = dst;
+	src = in->value.iovec;
+
+	if (evaluate(src->iov_base, &dst->iov_base, error) ||
+	    evaluate(src->iov_len, &dst->iov_len, error))
+		return STATUS_ERR;
+
+	return STATUS_OK;
+}
+
+/* Evaluate a msghdr expression: allocate a fresh msghdr_expr for 'out'
+ * and evaluate its six members in declaration order. Stops at the first
+ * failure and returns STATUS_ERR; the partial result stays attached to
+ * 'out'.
+ */
+static int evaluate_msghdr_expression(struct expression *in,
+				      struct expression *out, char **error)
+{
+	struct msghdr_expr *src;
+	struct msghdr_expr *dst;
+
+	assert(in->type == EXPR_MSGHDR);
+	assert(in->value.msghdr);
+	assert(out->type == EXPR_MSGHDR);
+
+	dst = calloc(1, sizeof(struct msghdr_expr));
+	out->value.msghdr = dst;
+	src = in->value.msghdr;
+
+	if (evaluate(src->msg_name, &dst->msg_name, error) ||
+	    evaluate(src->msg_namelen, &dst->msg_namelen, error) ||
+	    evaluate(src->msg_iov, &dst->msg_iov, error) ||
+	    evaluate(src->msg_iovlen, &dst->msg_iovlen, error) ||
+	    evaluate(src->msg_control, &dst->msg_control, error) ||
+	    evaluate(src->msg_flags, &dst->msg_flags, error))
+		return STATUS_ERR;
+
+	return STATUS_OK;
+}
+
+/* Evaluate a cmsg expression: allocate a fresh cmsg_expr for 'out' and
+ * evaluate level, type and data in that order. Stops at the first
+ * failure and returns STATUS_ERR; the partial result stays attached to
+ * 'out'.
+ */
+static int evaluate_cmsg_expression(struct expression *in,
+				    struct expression *out, char **error)
+{
+	struct cmsg_expr *src;
+	struct cmsg_expr *dst;
+
+	assert(in->type == EXPR_CMSG);
+	assert(in->value.cmsg);
+	assert(out->type == EXPR_CMSG);
+
+	dst = calloc(1, sizeof(struct cmsg_expr));
+	out->value.cmsg = dst;
+	src = in->value.cmsg;
+
+	if (evaluate(src->cmsg_level, &dst->cmsg_level, error) ||
+	    evaluate(src->cmsg_type, &dst->cmsg_type, error) ||
+	    evaluate(src->cmsg_data, &dst->cmsg_data, error))
+		return STATUS_ERR;
+
+	return STATUS_OK;
+}
+
+/* Evaluate a sock_extended_err expression: allocate a fresh
+ * sock_extended_err_expr for 'out' and evaluate its six members in
+ * declaration order. Stops at the first failure and returns STATUS_ERR;
+ * the partial result stays attached to 'out'.
+ */
+static int evaluate_sock_extended_err(struct expression *in,
+				      struct expression *out, char **error)
+{
+	struct sock_extended_err_expr *src;
+	struct sock_extended_err_expr *dst;
+
+	assert(in->type == EXPR_SOCK_EXTENDED_ERR);
+	assert(in->value.sock_extended_err);
+	assert(out->type == EXPR_SOCK_EXTENDED_ERR);
+
+	dst = calloc(1, sizeof(struct sock_extended_err_expr));
+	out->value.sock_extended_err = dst;
+	src = in->value.sock_extended_err;
+
+	if (evaluate(src->ee_errno, &dst->ee_errno, error) ||
+	    evaluate(src->ee_origin, &dst->ee_origin, error) ||
+	    evaluate(src->ee_type, &dst->ee_type, error) ||
+	    evaluate(src->ee_code, &dst->ee_code, error) ||
+	    evaluate(src->ee_info, &dst->ee_info, error) ||
+	    evaluate(src->ee_data, &dst->ee_data, error))
+		return STATUS_ERR;
+
+	return STATUS_OK;
+}
+
+/* Evaluate a pollfd expression: allocate a fresh pollfd_expr for 'out'
+ * and evaluate fd, events and revents in that order. Stops at the first
+ * failure and returns STATUS_ERR; the partial result stays attached to
+ * 'out'.
+ */
+static int evaluate_pollfd_expression(struct expression *in,
+				      struct expression *out, char **error)
+{
+	struct pollfd_expr *src;
+	struct pollfd_expr *dst;
+
+	assert(in->type == EXPR_POLLFD);
+	assert(in->value.pollfd);
+	assert(out->type == EXPR_POLLFD);
+
+	dst = calloc(1, sizeof(struct pollfd_expr));
+	out->value.pollfd = dst;
+	src = in->value.pollfd;
+
+	if (evaluate(src->fd, &dst->fd, error) ||
+	    evaluate(src->events, &dst->events, error) ||
+	    evaluate(src->revents, &dst->revents, error))
+		return STATUS_ERR;
+
+	return STATUS_OK;
+}
+
+/* Evaluate an epoll_event expression: allocate a fresh epollev_expr for
+ * 'out', evaluate 'events', then evaluate the first data member that is
+ * present, checked in the order ptr, fd, u32, u64.
+ *
+ * Returns STATUS_OK on success. On failure returns STATUS_ERR with a
+ * message in *error; this includes the case where none of the four
+ * data members is present. The partial result stays attached to 'out'.
+ */
+static int evaluate_epollev_expression(struct expression *in,
+				       struct expression *out, char **error)
+{
+	struct epollev_expr *in_epollev;
+	struct epollev_expr *out_epollev;
+
+	assert(in->type == EXPR_EPOLLEV);
+	assert(in->value.epollev);
+	assert(out->type == EXPR_EPOLLEV);
+
+	out->value.epollev = calloc(1, sizeof(struct epollev_expr));
+	in_epollev = in->value.epollev;
+	out_epollev = out->value.epollev;
+
+	if (evaluate(in_epollev->events, &out_epollev->events, error))
+		return STATUS_ERR;
+
+	if (in_epollev->ptr) {
+		if (evaluate(in_epollev->ptr, &out_epollev->ptr, error))
+			return STATUS_ERR;
+	} else if (in_epollev->fd) {
+		if (evaluate(in_epollev->fd, &out_epollev->fd, error))
+			return STATUS_ERR;
+	} else if (in_epollev->u32) {
+		if (evaluate(in_epollev->u32, &out_epollev->u32, error))
+			return STATUS_ERR;
+	} else if (in_epollev->u64) {
+		if (evaluate(in_epollev->u64, &out_epollev->u64, error))
+			return STATUS_ERR;
+	} else {
+		/* Callers expect *error to be set whenever we fail. */
+		asprintf(error,
+			 "epoll_event has no data member "
+			 "(ptr, fd, u32 or u64)");
+		return STATUS_ERR;
+	}
+	return STATUS_OK;
+}
+
+/* Build a newly allocated, fully evaluated copy of 'in' and store it in
+ * *out_ptr. Words are resolved to integers via symbol_to_int(), strings
+ * are unescaped, compound expressions are evaluated recursively, and
+ * everything else is copied.
+ *
+ * Every heap-allocated payload is deep-copied, so the result shares no
+ * memory with 'in' and each of the two can be released with
+ * free_expression() independently. That includes scm_timestamping: it
+ * is a pointer member, so copying only the pointer would leave both
+ * expressions owning the same object.
+ *
+ * Returns STATUS_OK, or STATUS_ERR with a message in *error. *out_ptr is
+ * set even on failure, so the caller can free the partial result.
+ */
+static int evaluate(struct expression *in,
+		    struct expression **out_ptr, char **error)
+{
+	int result = STATUS_OK;
+	struct expression *out = calloc(1, sizeof(struct expression));
+	*out_ptr = out;
+	out->type = in->type;	/* most types of expression stay the same */
+
+	if (in->type >= NUM_EXPR_TYPES) {
+		asprintf(error, "bad expression type: %d", in->type);
+		return STATUS_ERR;
+	}
+	switch (in->type) {
+	case EXPR_ELLIPSIS:
+		break;
+	case EXPR_INTEGER:		/* copy as-is */
+		out->value.num = in->value.num;
+		break;
+	case EXPR_GRE:			/* copy as-is */
+		memcpy(&out->value.gre, &in->value.gre,
+		       gre_len(&in->value.gre));
+		break;
+	case EXPR_IN6_ADDR:		/* copy as-is */
+		memcpy(&out->value.address_ipv6, &in->value.address_ipv6,
+		       sizeof(in->value.address_ipv6));
+		break;
+	case EXPR_LINGER:		/* copy as-is */
+		memcpy(&out->value.linger, &in->value.linger,
+		       sizeof(in->value.linger));
+		break;
+	case EXPR_WORD:
+		/* A word evaluates to the integer its symbol stands for. */
+		out->type = EXPR_INTEGER;
+		if (symbol_to_int(in->value.string,
+				  &out->value.num, error))
+			return STATUS_ERR;
+		break;
+	case EXPR_STRING:
+		if (unescape_cstring_expression(in->value.string, out, error))
+			return STATUS_ERR;
+		break;
+	case EXPR_SOCKET_ADDRESS_IPV4:	/* copy as-is */
+		out->value.socket_address_ipv4 =
+			malloc(sizeof(struct sockaddr_in));
+		memcpy(out->value.socket_address_ipv4,
+		       in->value.socket_address_ipv4,
+		       sizeof(*(out->value.socket_address_ipv4)));
+		break;
+	case EXPR_SOCKET_ADDRESS_IPV6:	/* copy as-is */
+		out->value.socket_address_ipv6 =
+			malloc(sizeof(struct sockaddr_in6));
+		memcpy(out->value.socket_address_ipv6,
+		       in->value.socket_address_ipv6,
+		       sizeof(*(out->value.socket_address_ipv6)));
+		break;
+	case EXPR_BINARY:
+		result = evaluate_binary_expression(in, out, error);
+		break;
+	case EXPR_LIST:
+		result = evaluate_list_expression(in, out, error);
+		break;
+	case EXPR_IOVEC:
+		result = evaluate_iovec_expression(in, out, error);
+		break;
+	case EXPR_MSGHDR:
+		result = evaluate_msghdr_expression(in, out, error);
+		break;
+	case EXPR_CMSG:
+		result = evaluate_cmsg_expression(in, out, error);
+		break;
+	case EXPR_SCM_TIMESTAMPING:	/* copy as-is */
+		/* Deep copy: free_expression() frees this pointer, so the
+		 * copy must not alias the input's object.
+		 */
+		out->value.scm_timestamping =
+			malloc(sizeof(struct scm_timestamping_expr));
+		memcpy(out->value.scm_timestamping,
+		       in->value.scm_timestamping,
+		       sizeof(*(out->value.scm_timestamping)));
+		break;
+	case EXPR_SOCK_EXTENDED_ERR:
+		result = evaluate_sock_extended_err(in, out, error);
+		break;
+	case EXPR_POLLFD:
+		result = evaluate_pollfd_expression(in, out, error);
+		break;
+	case EXPR_MPLS_STACK:		/* copy as-is */
+		out->value.mpls_stack = malloc(sizeof(struct mpls_stack));
+		memcpy(out->value.mpls_stack,
+		       in->value.mpls_stack,
+		       sizeof(*out->value.mpls_stack));
+		break;
+	case EXPR_EPOLLEV:
+		result = evaluate_epollev_expression(in, out, error);
+		break;
+	case NUM_EXPR_TYPES:
+		break;
+	/* missing default case so compiler catches missing cases */
+	}
+
+	return result;
+}
+
+/* Build in *out_list a copy of 'in_list' with each expression evaluated
+ * (e.g. symbols resolved to ints). Returns STATUS_OK on success. On
+ * failure the partially built list is freed, *out_list is set to NULL,
+ * *error is filled in by the failing evaluation, and STATUS_ERR is
+ * returned. An empty input list leaves *out_list untouched.
+ */
+int evaluate_expression_list(struct expression_list *in_list,
+			     struct expression_list **out_list,
+			     char **error)
+{
+	struct expression_list **tail = out_list;
+	struct expression_list *src;
+
+	for (src = in_list; src != NULL; src = src->next) {
+		struct expression_list *node =
+			calloc(1, sizeof(struct expression_list));
+
+		/* Link the node in first so a failure can free it too. */
+		*tail = node;
+		if (evaluate(src->expression, &node->expression, error)) {
+			free_expression_list(*out_list);
+			*out_list = NULL;
+			return STATUS_ERR;
+		}
+		tail = &(node->next);
+	}
+	return STATUS_OK;
+}
diff --git a/test/packetdrill/script.h b/test/packetdrill/script.h
new file mode 100644
index 0000000..be4944a
--- /dev/null
+++ b/test/packetdrill/script.h
@@ -0,0 +1,308 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Type definitions for data structures to represent a parsed test script.
+ */
+
+#ifndef __SCRIPT_H__
+#define __SCRIPT_H__
+
+#include "types.h"
+
+#include <sys/time.h>
+#include "packet.h"
+
+#define MSGHDR_MAX_CONTROLLEN 1000 /* arbitrary maximum cmsg length */
+
+/* The types of expressions in a script.
+ * The comment on each enumerator names the member of
+ * 'struct expression.value' that carries the payload, where there is one.
+ */
+enum expression_t {
+ EXPR_ELLIPSIS, /* ... but no value */
+ EXPR_INTEGER, /* integer in 'num' */
+ EXPR_LINGER, /* struct linger for SO_LINGER */
+ EXPR_WORD, /* unquoted word in 'string' */
+ EXPR_STRING, /* double-quoted string in 'string' */
+ EXPR_GRE, /* GRE header */
+ EXPR_IN6_ADDR, /* in6_addr in 'address_ipv6' */
+ EXPR_SOCKET_ADDRESS_IPV4, /* sockaddr_in in 'socket_address_ipv4' */
+ EXPR_SOCKET_ADDRESS_IPV6, /* sockaddr_in6 in 'socket_address_ipv6' */
+ EXPR_BINARY, /* binary expression, 2 sub-expressions */
+ EXPR_LIST, /* list of expressions */
+ EXPR_IOVEC, /* expression tree for an iovec struct */
+ EXPR_MSGHDR, /* expression tree for a msghdr struct */
+ EXPR_CMSG, /* expression tree for a cmsg struct */
+ EXPR_POLLFD, /* expression tree for a pollfd struct */
+ EXPR_MPLS_STACK, /* MPLS label stack expression */
+ EXPR_SCM_TIMESTAMPING, /* scm_timestamping expression */
+ EXPR_SOCK_EXTENDED_ERR, /* scm_sock_extended_err expression */
+ EXPR_EPOLLEV, /* expression tree for a epoll_event struct */
+ NUM_EXPR_TYPES, /* count of the types above; not a valid type itself */
+};
+/* Convert an expression type to a human-readable string */
+const char *expression_type_to_string(enum expression_t type);
+
+/* An expression in a script.
+ * 'type' tells which member of the 'value' union is in use. The num,
+ * linger, gre and address_ipv6 members hold their data inline; all the
+ * other members are pointers to separately allocated data.
+ */
+struct expression {
+ enum expression_t type;
+ union {
+ s64 num;
+ char *string;
+ struct linger linger;
+ struct gre gre;
+ struct in6_addr address_ipv6;
+ struct sockaddr_in *socket_address_ipv4;
+ struct sockaddr_in6 *socket_address_ipv6;
+ struct binary_expression *binary;
+ struct expression_list *list;
+ struct iovec_expr *iovec;
+ struct msghdr_expr *msghdr;
+ struct cmsg_expr *cmsg;
+ struct pollfd_expr *pollfd;
+ struct mpls_stack *mpls_stack;
+ struct scm_timestamping_expr *scm_timestamping;
+ struct sock_extended_err_expr *sock_extended_err;
+ struct epollev_expr *epollev;
+ } value;
+ const char *format; /* the printf format for printing the value */
+};
+
+/* Two expressions combined via a binary operator */
+struct binary_expression {
+ char *op; /* binary operator */
+ struct expression *lhs; /* left hand side expression */
+ struct expression *rhs; /* right hand side expression */
+};
+
+/* A list of expressions, e.g. a list of actual parameters in function call,
+ * or list of elements in an array.
+ */
+struct expression_list {
+ struct expression *expression;
+ struct expression_list *next;
+};
+
+/* Parse tree for a iovec struct in a writev/readv/sendmsg/recvmsg syscall. */
+struct iovec_expr {
+ struct expression *iov_base;
+ struct expression *iov_len;
+};
+
+/* Parse tree for a msghdr struct in a sendmsg/recvmsg syscall. */
+struct msghdr_expr {
+ struct expression *msg_name;
+ struct expression *msg_namelen;
+ struct expression *msg_iov;
+ struct expression *msg_iovlen;
+ struct expression *msg_control;
+ struct expression *msg_flags;
+};
+
+/* Parse tree for a cmsg item in a sendmsg/recvmsg syscall. */
+struct cmsg_expr {
+ struct expression *cmsg_level;
+ struct expression *cmsg_type;
+ struct expression *cmsg_data;
+};
+
+/* A verbatim copy of Linux's struct scm_timestamping for portability. */
+struct scm_timestamping_expr {
+ struct timespec ts[3];
+};
+
+/* Parse tree for a sock_extended_err item in a recvmsg syscall. */
+struct sock_extended_err_expr {
+ struct expression *ee_errno;
+ struct expression *ee_origin;
+ struct expression *ee_type;
+ struct expression *ee_code;
+ struct expression *ee_info;
+ struct expression *ee_data;
+};
+
+/* Parse tree for a pollfd struct in a poll syscall. */
+struct pollfd_expr {
+ struct expression *fd; /* file descriptor */
+ struct expression *events; /* requested events */
+ struct expression *revents; /* returned events */
+};
+
+/* Parse tree for a epoll_event struct in an epoll syscall.
+ * Besides 'events', evaluation in script.c uses only the first non-NULL
+ * member among ptr, fd, u32 and u64 (checked in that order) and fails
+ * if all four are NULL.
+ */
+struct epollev_expr {
+ struct expression *events;
+ struct expression *ptr;
+ struct expression *fd;
+ struct expression *u32;
+ struct expression *u64;
+};
+
+/* The errno-related info from strace to summarize a system call error */
+struct errno_spec {
+ const char *errno_macro; /* errno symbol (C macro name) */
+ const char *strerror; /* strerror translation of errno */
+};
+
+/* A system call and its expected result. System calls that should
+ * return immediately have an end_usecs value of SYSCALL_NON_BLOCKING.
+ * System calls that block for some non-zero time have a non-negative
+ * end_usecs indicating the time at which the system call should
+ * return.
+ */
+struct syscall_spec {
+ const char *name; /* name of system call */
+ struct expression_list *arguments; /* arguments to system call */
+ struct expression *result; /* expected result from call */
+ struct errno_spec *error; /* errno symbol or NULL */
+ char *note; /* extra note from strace */
+ s64 end_usecs; /* finish time, if it blocks */
+};
+#define SYSCALL_NON_BLOCKING -1 /* end_usecs if non-blocking */
+
+/* True iff the system call is expected to block, i.e. its end_usecs is
+ * anything other than SYSCALL_NON_BLOCKING.
+ */
+static inline bool is_blocking_syscall(struct syscall_spec *syscall)
+{
+	const bool returns_immediately =
+		(syscall->end_usecs == SYSCALL_NON_BLOCKING);
+
+	return !returns_immediately;
+}
+
+/* A shell command line to execute using system(3) */
+struct command_spec {
+ const char *command_line; /* executed with /bin/sh */
+};
+
+/* An ASCII text snippet of code to insert in the post-processing
+ * output. This can be, for example, a snippet of Python to execute.
+ */
+struct code_spec {
+ const char *text; /* snippet of post-processing code */
+};
+
+/* Types of events in a script */
+enum event_t {
+ INVALID_EVENT = 0,
+ PACKET_EVENT,
+ SYSCALL_EVENT,
+ COMMAND_EVENT,
+ CODE_EVENT,
+ NUM_EVENT_TYPES,
+};
+
+/* Types of event times */
+enum event_time_t {
+ ABSOLUTE_TIME = 0,
+ RELATIVE_TIME,
+ ANY_TIME,
+ ABSOLUTE_RANGE_TIME,
+ RELATIVE_RANGE_TIME,
+ NUM_TIME_TYPES,
+};
+
+/* An event in a script */
+struct event {
+ int line_number; /* location in test script file */
+ s64 time_usecs; /* event time in microseconds */
+ s64 time_usecs_end; /* event time range end (or NO_TIME_RANGE) */
+ s64 offset_usecs; /* relative event time offset from script start
+ * (or NO_TIME_RANGE) */
+ enum event_time_t time_type; /* type of time */
+ enum event_t type; /* type of the event */
+ union {
+ struct packet *packet;
+ struct syscall_spec *syscall;
+ struct command_spec *command;
+ struct code_spec *code;
+ } event; /* pointer to the event */
+ struct event *next; /* next in linked list of events */
+};
+#define NO_TIME_RANGE -1 /* time_usecs_end if no range */
+
+/* True iff the event's time is one of the absolute kinds
+ * (ABSOLUTE_TIME or ABSOLUTE_RANGE_TIME).
+ */
+static inline bool is_event_time_absolute(struct event *event)
+{
+	switch (event->time_type) {
+	case ABSOLUTE_TIME:
+	case ABSOLUTE_RANGE_TIME:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/* A --name=value option in a script */
+struct option_list {
+ char *name;
+ char *value;
+ struct option_list *next;
+};
+
+/* A parsed script. The script owns all of the data to which
+ * it points. TODO: add a script_free() to free everything when we are
+ * done executing the script, instead of leaking all that memory.
+ */
+struct script {
+ struct option_list *option_list; /* linked list of options */
+ struct command_spec *init_command; /* untimed initialization command */
+ struct event *event_list; /* linked list of all events */
+ struct command_spec *cleanup_command; /* untimed cleanup command */
+ char *buffer; /* raw input text of the script */
+ int length; /* number of bytes in the script */
+};
+
+/* Global pointer for final command we always execute at end of script: */
+extern const char *cleanup_cmd;
+/* Path of currently-executing script, for use in cleanup command errors: */
+extern const char *script_path;
+
+/* A table entry mapping a bit mask to its human-readable name.
+ * A table of such mappings must be terminated with a struct with a
+ * NULL name.
+ */
+struct flag_name {
+ u64 flag; /* a flag with one bit set */
+ const char *name; /* human-readable ASCII name for this bit */
+};
+
+/* Initialize a script object */
+extern void init_script(struct script *script);
+
+/* Look up the value of the given symbol, and fill it in. On success,
+ * return STATUS_OK; if the symbol cannot be found, return
+ * STATUS_ERR and fill in an error message in *error.
+ */
+extern int symbol_to_int(const char *input_symbol, s64 *output_integer,
+ char **error);
+
+/* Convert the given bit flags to a human-readable ASCII bit-wise OR
+ * ('|') expression and return the resulting malloc-allocated
+ * string. Caller must free() the memory.
+ */
+extern struct flag_name poll_flags[];
+char *flags_to_string(struct flag_name *flags_array, u64 flags);
+
+/* Do a deep deallocation of a heap-allocated expression,
+ * including any other space that it points to.
+ */
+extern void free_expression(struct expression *expression);
+
+/* Do a deep deallocation of a heap-allocated expression list,
+ * including any other space that it points to.
+ */
+extern void free_expression_list(struct expression_list *list);
+
+/* Return a copy of the given expression list with each expression
+ * evaluated (e.g. symbols resolved to ints). On success, returns
+ * STATUS_OK. On error return STATUS_ERR and fill in *error.
+ */
+extern int evaluate_expression_list(struct expression_list *in_list,
+ struct expression_list **out_list,
+ char **error);
+
+#endif /* __SCRIPT_H__ */
diff --git a/test/packetdrill/sctp.h b/test/packetdrill/sctp.h
new file mode 100644
index 0000000..b831dd9
--- /dev/null
+++ b/test/packetdrill/sctp.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2013 Michael Tuexen.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: tuexen@fh-muenster.de (Michael Tuexen)
+ *
+ * Our own SCTP header declarations, so we have something that's
+ * portable and somewhat more readable than a typical system header
+ * file.
+ */
+
+#ifndef __SCTP_HEADERS_H__
+#define __SCTP_HEADERS_H__
+
+#include "types.h"
+
+/* SCTP common header. See RFC 4960, section 3.1. Fields are big-endian. */
+struct sctp_common_header {
+ __be16 src_port; /* source port number */
+ __be16 dst_port; /* destination port number */
+ __be32 v_tag; /* verification tag */
+ __be32 crc32c; /* checksum (CRC32c) */
+};
+
+#endif /* __SCTP_HEADERS_H__ */
diff --git a/test/packetdrill/so_testing.c b/test/packetdrill/so_testing.c
new file mode 100644
index 0000000..ee6a9ee
--- /dev/null
+++ b/test/packetdrill/so_testing.c
@@ -0,0 +1,169 @@
+/*
+ * Copyright 2015 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: xiaoj@google.com (Xiao Jia)
+ *
+ * Testing against a shared object (*.so) file.
+ */
+
+#include "so_testing.h"
+
+#include <dlfcn.h>
+
+#include "logging.h"
+#include "netdev.h"
+#include "packetdrill.h"
+#include "run.h"
+
+struct so_netdev {
+ struct netdev netdev; /* "inherit" from netdev; must stay first (see to_so_netdev) */
+ struct packetdrill_interface *ifc; /* NULL until so_instance_init() points it at the instance's ifc */
+};
+
+/* "Downcast" an abstract netdev to our flavor (netdev is so_netdev's first member). */
+static inline struct so_netdev *to_so_netdev(struct netdev *netdev)
+{
+ return (struct so_netdev *)netdev; /* no type check: caller must pass an so_netdev */
+}
+
+static void so_netdev_free(struct netdev *a_netdev)
+{
+	struct so_netdev *self = (struct so_netdev *)a_netdev; /* our flavor */
+
+	memset(self, 0, sizeof(struct so_netdev)); /* paranoia: scrub first */
+	free(self);
+}
+
+static int so_netdev_send(struct netdev *a_netdev, struct packet *packet)
+{
+	struct so_netdev *self = to_so_netdev(a_netdev);
+
+	/* Sanity-check the packet before passing it to ifc->netdev_send():
+	 * it must be a non-empty IPv4 or IPv6 datagram with TCP, UDP or ICMP.
+	 */
+	assert(packet->ip_bytes > 0);
+	assert(packet->ipv4 || packet->ipv6);
+	assert(packet->tcp || packet->udp || packet->icmpv4 || packet->icmpv6);
+
+	return self->ifc->netdev_send(self->ifc->userdata, packet_start(packet),
+				      packet->ip_bytes);
+}
+
+static int so_netdev_receive(struct netdev *a_netdev, struct packet **packet,
+ char **error)
+{
+ struct so_netdev *netdev = to_so_netdev(a_netdev);
+ enum packet_parse_result_t result;
+ enum packet_layer_t layer = PACKET_LAYER_3_IP; /* presumably: buffer starts at the IP header */
+ size_t in_bytes;
+
+ assert(*packet == NULL); /* should be no packet yet */
+
+ for (;;) { /* loop until a packet parses as PACKET_OK or PACKET_BAD */
+ *packet = packet_new(PACKET_READ_BYTES);
+ in_bytes = (*packet)->buffer_bytes; /* passed by address; presumably updated to the received length */
+
+ /* Sniff the next outbound packet from the stack under test. */
+ if (netdev->ifc->netdev_receive(netdev->ifc->userdata,
+ (*packet)->buffer, &in_bytes,
+ &((*packet)->time_usecs)))
+ goto next; /* nonzero return: discard this buffer and try again */
+
+ result = parse_packet(*packet, in_bytes, layer, error);
+
+ if (result == PACKET_OK)
+ return STATUS_OK; /* *packet is handed back to the caller */
+
+ if (result == PACKET_BAD)
+ return STATUS_ERR; /* note: *packet is not freed on this path */
+
+ DEBUGP("parse_result:%d; error parsing packet: %s\n",
+ result, *error);
+next: /* drop the current packet so each iteration starts with a fresh one */
+ packet_free(*packet);
+ *packet = NULL;
+ }
+
+ assert(!"should not be reached");
+ return STATUS_ERR; /* not reached */
+}
+
+static struct netdev_ops so_netdev_ops = { /* callback table installed by so_netdev_new() */
+ .free = so_netdev_free,
+ .send = so_netdev_send,
+ .receive = so_netdev_receive,
+};
+
+struct netdev *so_netdev_new(struct config *config)
+{
+	struct so_netdev *netdev = calloc(1, sizeof(*netdev)); /* ifc starts NULL */
+	if (netdev != NULL) /* on allocation failure return NULL, don't crash */
+		netdev->netdev.ops = &so_netdev_ops;
+	return (struct netdev *)netdev; /* valid: netdev is the first member */
+}
+
+struct so_instance *so_instance_new(void)
+{
+ return calloc(1, sizeof(struct so_instance)); /* zero-filled; NULL if allocation fails */
+}
+
+int so_instance_init(struct so_instance *instance,
+ const struct config *config,
+ const struct script *script, /* not referenced in this function */
+ const struct state *state)
+{
+#if 0 /* disabled: dlopen()-based loading of the shared object */
+ packetdrill_interface_init_t init;
+ char *error;
+
+ instance->handle = dlopen(config->so_filename,
+ RTLD_NOW | RTLD_LOCAL | RTLD_NODELETE |
+ RTLD_DEEPBIND);
+ if (!instance->handle)
+ die("%s\n", dlerror());
+ dlerror(); /* clear any existing error */
+
+ init = dlsym(instance->handle, "packetdrill_interface_init");
+ error = dlerror();
+ if (error)
+ die("%s\n", error);
+
+ init(config->so_flags, &instance->ifc);
+#else /* active: call a packetdrill_interface_init() resolved at link time */
+ extern void packetdrill_interface_init(const char *flags, struct packetdrill_interface *ifc);
+ instance->handle = (void *)1; /* dummy non-NULL marker; not a real dlopen() handle */
+ packetdrill_interface_init(config->so_flags, &instance->ifc);
+#endif
+ to_so_netdev(state->netdev)->ifc = &instance->ifc; /* netdev now points into *instance (not a copy) */
+ return STATUS_OK;
+}
+
+void so_instance_free(struct so_instance *instance)
+{
+	if (!instance)
+		return;
+
+	/* so_instance_new() zero-fills ifc, so the callback may still be unset. */
+	if (instance->ifc.free)
+		instance->ifc.free(instance->ifc.userdata);
+
+	/* No dlclose(): handle is only a placeholder, not a dlopen() result. */
+	memset(instance, 0, sizeof(*instance)); /* paranoia */
+	free(instance);
+}
diff --git a/test/packetdrill/so_testing.h b/test/packetdrill/so_testing.h
new file mode 100644
index 0000000..7008eba
--- /dev/null
+++ b/test/packetdrill/so_testing.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2015 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: xiaoj@google.com (Xiao Jia)
+ *
+ * Testing against a shared object (*.so) file.
+ */
+
+#ifndef __SO_TESTING_H__
+#define __SO_TESTING_H__
+
+#include "packetdrill.h"
+
+struct config;
+struct netdev;
+struct script;
+struct state;
+
+struct so_instance {
+ struct packetdrill_interface ifc; /* callbacks + userdata, filled in by so_instance_init() */
+ void *handle; /* shared-object handle; so_instance_init() currently stores a dummy (void *)1 */
+};
+
+/* Allocate and return a new netdev for SO testing. */
+struct netdev *so_netdev_new(struct config *config);
+
+/* Allocate a new so_instance. */
+struct so_instance *so_instance_new(void);
+
+/* Set up the callback functions (the dlopen() path is compiled out) and attach them to state->netdev. */
+int so_instance_init(struct so_instance *instance,
+ const struct config *config,
+ const struct script *script,
+ const struct state *state);
+
+/* Delete a so_instance and its associated objects. */
+void so_instance_free(struct so_instance *instance);
+
+#endif /* __SO_TESTING_H__ */
diff --git a/test/packetdrill/socket.c b/test/packetdrill/socket.c
new file mode 100644
index 0000000..74e3723
--- /dev/null
+++ b/test/packetdrill/socket.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Implementation for the socket-related state and logic.
+ */
+
+#include "socket.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include "run.h"
+
+void socket_close(struct state *state, struct fd_state *fd) /* installed as socket_ops.close */
+{
+ struct socket *socket = fd_to_socket(fd); /* NULL if fd is not a socket; not checked below */
+
+ if (fd->live_fd >= 0 && !socket->fd.is_closed) { /* skip if no live fd or already marked closed */
+ assert(fd->script_fd >= 0);
+ DEBUGP("closing struct state socket "
+ "live.fd:%d script.fd:%d\n",
+ fd->live_fd, fd->script_fd);
+ if (state->so_instance != NULL) { /* SO testing: close via the interface callback */
+ if (state->so_instance->ifc.close(state->so_instance->ifc.userdata, fd->live_fd))
+ die_perror("close");
+ } else { /* otherwise close the real OS file descriptor */
+ if (close(fd->live_fd))
+ die_perror("close");
+ }
+ }
+ if (socket->protocol == IPPROTO_TCP && /* TCP socket with both live ports known: */
+ socket->live.local.port != 0 &&
+ socket->live.remote.port != 0 &&
+ !state->config->is_wire_client &&
+ reset_connection(state, socket)) { /* nonzero return means the reset failed */
+ die("error reseting connection\n");
+ }
+
+ socket_free(socket); /* socket (the same object as fd) must not be used after this */
+}
+
+/* Global info about file descriptors that point to sockets; set on each socket by socket_new(). */
+struct fd_ops socket_ops = {
+ .type = FD_SOCKET, /* what fd_to_socket() checks before casting */
+ .close = socket_close,
+};
+
+struct socket *socket_new(struct state *state)
+{
+ struct socket *socket = calloc(1, sizeof(struct socket)); /* NOTE(review): used unchecked; NULL on OOM crashes below */
+
+ socket->fd.ops = &socket_ops; /* tags the fd as FD_SOCKET for fd_to_socket() */
+ socket->ts_val_map = hash_map_new(1); /* released by socket_free() */
+ state_add_fd(state, to_fd(socket)); /* presumably registers the new fd with state */
+ return socket;
+}
+
+void socket_free(struct socket *socket)
+{
+ hash_map_free(socket->ts_val_map); /* allocated in socket_new() */
+ memset(socket, 0, sizeof(*socket)); /* paranoia to help catch bugs */
+ free(socket); /* socket must be non-NULL: it is dereferenced above */
+}
diff --git a/test/packetdrill/socket.h b/test/packetdrill/socket.h
new file mode 100644
index 0000000..55e43b0
--- /dev/null
+++ b/test/packetdrill/socket.h
@@ -0,0 +1,311 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Interface for tracking sockets in the kernel under test.
+ */
+
+#ifndef __SOCKET_H__
+#define __SOCKET_H__
+
+#include "types.h"
+
+#include <netinet/in.h>
+#include <string.h>
+#include <sys/socket.h>
+#include "config.h"
+#include "fd_state.h"
+#include "hash_map.h"
+#include "logging.h"
+#include "packet.h"
+
+/* All possible states for a socket we're tracking. */
+enum socket_state_t {
+ SOCKET_INIT, /* uninitialized */
+ SOCKET_NEW, /* after socket() call */
+ SOCKET_PASSIVE_LISTENING, /* after listen() call */
+ SOCKET_PASSIVE_PACKET_RECEIVED, /* after receiving first packet */
+ SOCKET_PASSIVE_SYNACK_SENT, /* after sending SYNACK */
+ SOCKET_PASSIVE_SYNACK_ACKED, /* after server's SYN is ACKed */
+ SOCKET_ACTIVE_CONNECTING, /* after connect() call */
+ SOCKET_ACTIVE_SYN_SENT, /* after sending client's SYN */
+ SOCKET_ACTIVE_SYN_ACKED, /* after client's SYN is ACKed */
+};
+
+/* A TCP/UDP/IP address for an endpoint. */
+struct endpoint {
+ struct ip_address ip; /* IP address */
+ __be16 port; /* TCP/UDP port (network order) */
+};
+
+/* The 4-tuple for a TCP/UDP/IP packet; is_equal_tuple() compares it bytewise, so zero it before filling. */
+struct tuple {
+ struct endpoint src; /* sender's address and port */
+ struct endpoint dst; /* receiver's address and port */
+};
+
+/* The scripted or live aspects of socket state */
+struct socket_state {
+ struct endpoint local; /* local endpoint address */
+ u32 local_isn; /* initial TCP sequence (host order) */
+ struct endpoint remote; /* remote endpoint address */
+ u32 remote_isn; /* initial TCP sequence (host order) */
+};
+
+/* Flowlabel mapping between script and live */
+struct flowlabel_map {
+ u32 flowlabel_script;
+ u32 flowlabel_live;
+};
+
+/* The runtime state for a socket */
+struct socket {
+ /* NOTE: struct fd_state must be first field in all fd flavors. */
+ struct fd_state fd; /* info about fd for this socket */
+
+ enum socket_state_t state; /* current state of socket */
+ int address_family; /* AF_INET or AF_INET6 */
+ int type; /* e.g. SOCK_STREAM, SOCK_DGRAM */
+ int protocol; /* IPPROTO_UDP or IPPROTO_TCP */
+
+ /* The "canned" info from the test script */
+ struct socket_state script;
+
+ /* The "live" info at runtime while executing the test */
+ struct socket_state live;
+
+ /* We look at outgoing TCP timestamp values and learn the
+ * mapping between script values and live values. We store
+ * this mapping in a hash map mapping outgoing TCP timestamp
+ * values from scripted value to live value. Then we use this
+ * to map incoming TCP timestamp echo replies from their
+ * script value to their live value.
+ */
+ struct hash_map *ts_val_map;
+
+ /* Baseline to map TCP timestamp val from live to script space. */
+ bool found_first_tcp_ts;
+ u32 first_script_ts_val;
+ u32 first_actual_ts_val;
+ u32 first_script_ts_ecr;
+ u32 first_actual_ts_ecr;
+
+ /* We remember the last inbound/outbound TCP header so we can send a
+ * RST packet that the kernel will accept for this socket, in
+ * order to induce the kernel to free the socket.
+ */
+ struct tcp last_outbound_tcp_header;
+ struct tcp last_injected_tcp_header;
+ u32 last_injected_tcp_payload_len;
+
+ /* flowlabel mapping */
+ struct flowlabel_map flowlabel_map;
+};
+
+/* Return fd viewed as a socket, or NULL if fd is NULL or is not a socket. */
+static inline struct socket *fd_to_socket(struct fd_state *fd)
+{
+	const bool is_socket = (fd != NULL) && (fd->ops->type == FD_SOCKET);
+
+	/* The cast is valid because fd_state is the first member of socket. */
+	return is_socket ? (struct socket *)fd : NULL;
+}
+
+struct state;
+
+/* Allocate and return a new socket object. */
+extern struct socket *socket_new(struct state *state);
+
+/* Deallocate a socket. */
+extern void socket_free(struct socket *socket);
+
+/* Get the tuple we expect to see in outbound packets from this socket. */
+static inline void socket_get_outbound(
+ const struct socket_state *socket_state, struct tuple *tuple)
+{
+ memset(tuple, 0, sizeof(*tuple)); /* start from all-zero bytes for is_equal_tuple()'s memcmp() */
+ tuple->src = socket_state->local; /* outbound: the local endpoint is the source */
+ tuple->dst = socket_state->remote;
+}
+
+/* Get the tuple we expect to see in inbound packets from this socket. */
+static inline void socket_get_inbound(
+ const struct socket_state *socket_state, struct tuple *tuple)
+{
+ memset(tuple, 0, sizeof(*tuple)); /* start from all-zero bytes for is_equal_tuple()'s memcmp() */
+ tuple->src = socket_state->remote; /* inbound: the remote endpoint is the source */
+ tuple->dst = socket_state->local;
+}
+
+/* Return true iff the two tuples are byte-for-byte identical. */
+static inline bool is_equal_tuple(const struct tuple *a,
+				  const struct tuple *b)
+{
+	return !memcmp(a, b, sizeof(struct tuple));
+}
+
+/* Fill in *dst_tuple with *src_tuple reversed. It is zeroed first so that no
+ * stale bytes reach is_equal_tuple()'s memcmp(); the two must not alias. */
+static inline void reverse_tuple(const struct tuple *src_tuple,
+				 struct tuple *dst_tuple)
+{
+	memset(dst_tuple, 0, sizeof(*dst_tuple));
+	dst_tuple->src.ip = src_tuple->dst.ip;
+	dst_tuple->dst.ip = src_tuple->src.ip;
+	dst_tuple->src.port = src_tuple->dst.port;
+	dst_tuple->dst.port = src_tuple->src.port;
+}
+
+/* Get the tuple for a packet: addresses from its IPv4/IPv6 header, ports from TCP/UDP. */
+static inline void get_packet_tuple(const struct packet *packet,
+ struct tuple *tuple)
+{
+ memset(tuple, 0, sizeof(*tuple)); /* start from all-zero bytes for is_equal_tuple()'s memcmp() */
+ if (packet->ipv4 != NULL) {
+ ip_from_ipv4(&packet->ipv4->src_ip, &tuple->src.ip);
+ ip_from_ipv4(&packet->ipv4->dst_ip, &tuple->dst.ip);
+ } else if (packet->ipv6 != NULL) {
+ ip_from_ipv6(&packet->ipv6->src_ip, &tuple->src.ip);
+ ip_from_ipv6(&packet->ipv6->dst_ip, &tuple->dst.ip);
+ } else {
+ assert(!"bad IP version in packet"); /* packet has neither IP header */
+ }
+ if (packet->tcp != NULL) {
+ tuple->src.port = packet->tcp->src_port; /* copied as-is, no byte-order conversion */
+ tuple->dst.port = packet->tcp->dst_port;
+ } else if (packet->udp != NULL) {
+ tuple->src.port = packet->udp->src_port;
+ tuple->dst.port = packet->udp->dst_port;
+ } /* neither TCP nor UDP: both ports stay 0 from the memset */
+}
+
+/* Write the tuple into an IPv4 or IPv6 header and into a TCP or UDP header. */
+static inline void set_headers_tuple(struct ipv4 *ipv4,
+ struct ipv6 *ipv6,
+ struct tcp *tcp,
+ struct udp *udp,
+ const struct tuple *tuple)
+{
+ if (ipv4 != NULL) { /* ipv4 is used if both IP headers are given */
+ ip_to_ipv4(&tuple->src.ip, &ipv4->src_ip);
+ ip_to_ipv4(&tuple->dst.ip, &ipv4->dst_ip);
+ } else if (ipv6 != NULL) {
+ ip_to_ipv6(&tuple->src.ip, &ipv6->src_ip);
+ ip_to_ipv6(&tuple->dst.ip, &ipv6->dst_ip);
+ } else {
+ assert(!"bad IP version in packet"); /* one IP header is mandatory */
+ }
+ if (tcp != NULL) { /* likewise tcp is used if both are given */
+ tcp->src_port = tuple->src.port;
+ tcp->dst_port = tuple->dst.port;
+ } else if (udp != NULL) {
+ udp->src_port = tuple->src.port;
+ udp->dst_port = tuple->dst.port;
+ } /* with neither tcp nor udp, no ports are written */
+}
+
+/* Set the tuple for a packet header echoed inside an ICMPv4/ICMPv6 message. */
+static inline void set_icmp_echoed_tuple(struct packet *packet,
+ const struct tuple *tuple)
+{
+ /* All currently supported ICMP message types include a copy
+ * of the outbound IP header and the first few bytes inside,
+ * which so far always means the first ICMP_ECHO_BYTES of the
+ * TCP or UDP header (both echoed headers are passed on below).
+ */
+ DEBUGP("set_icmp_echoed_tuple"); /* NOTE(review): no trailing "\n", unlike other DEBUGP calls */
+
+ /* Flip the direction of the tuple, since the ICMP message is
+ * flowing in the direction opposite that of the echoed TCP/IP
+ * packet, and then fill in the fields of the echoed packet.
+ */
+ struct tuple echoed_tuple;
+ reverse_tuple(tuple, &echoed_tuple);
+ set_headers_tuple(packet_echoed_ipv4_header(packet),
+ packet_echoed_ipv6_header(packet),
+ packet_echoed_tcp_header(packet),
+ packet_echoed_udp_header(packet),
+ &echoed_tuple);
+}
+
+/* Set the tuple for a packet: its own headers and, for ICMP, the echoed headers too. */
+static inline void set_packet_tuple(struct packet *packet,
+ const struct tuple *tuple)
+{
+ set_headers_tuple(packet->ipv4, packet->ipv6, packet->tcp, packet->udp,
+ tuple);
+ if ((packet->icmpv4 != NULL) || (packet->icmpv6 != NULL))
+ set_icmp_echoed_tuple(packet, tuple); /* writes the reversed tuple into the echoed headers */
+}
+
+
+/* Helpers for translating between script and live sequence numbers.
+ *
+ * We try to interpret sequence numbers in scripts in
+ * a manner that is similar to tcpdump output: sequence numbers and
+ * ACK numbers in all packets with the SYN flag set are absolute, and
+ * for other packets the sequence numbers and ACK numbers are relative
+ * to the first SYN.
+ *
+ * Using this approach has several advantages:
+ *
+ * o tcpdump output may be more easily converted into packetdrill scripts.
+ *
+ * o we follow the principle of least surprise: it's basically what
+ * tcpdump does, so users should be more used to that and thus it
+ * should lead to fewer bugs and it should require less
+ * documentation.
+ *
+ * o it gives convenience and expressiveness in allowing arbitrary
+ * ISNs without requiring a command line argument, so tests can be
+ * more self-contained.
+ *
+ * The code below for remote and local cases are different because the
+ * packetdrill tool gets to pick the live ISN for remote packets but the
+ * local kernel under test always gets to pick its live ISN.
+ */
+
+static inline u32 remote_seq_script_to_live_offset(struct socket *socket,
+						    bool is_syn)
+{
+	return !is_syn ? socket->live.remote_isn : 0; /* SYN seqs are absolute */
+}
+
+static inline u32 remote_seq_live_to_script_offset(struct socket *socket,
+						    bool is_syn)
+{
+	return 0 - remote_seq_script_to_live_offset(socket, is_syn); /* mod 2^32 */
+}
+
+static inline u32 local_seq_script_to_live_offset(struct socket *socket,
+						   bool is_syn)
+{
+	/* Only SYNs carry an absolute script ISN that must be subtracted out. */
+	const u32 script_base = is_syn ? socket->script.local_isn : 0;
+	return socket->live.local_isn - script_base;
+}
+
+static inline u32 local_seq_live_to_script_offset(struct socket *socket,
+						   bool is_syn)
+{
+	return 0 - local_seq_script_to_live_offset(socket, is_syn); /* mod 2^32 */
+}
+
+#endif /* __SOCKET_H__ */
diff --git a/test/packetdrill/symbols.h b/test/packetdrill/symbols.h
new file mode 100644
index 0000000..64914dc
--- /dev/null
+++ b/test/packetdrill/symbols.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Definitions of strace-style symbols for our platform.
+ * Allows us to map from symbolic strings to integers for system call inputs.
+ */
+
+#ifndef __SYMBOLS_H__
+#define __SYMBOLS_H__
+
+#include "types.h"
+
+/* For tables mapping symbolic strace strings to the corresponding
+ * integer values.
+ */
+struct int_symbol {
+ s64 value;
+ const char *name;
+};
+
+/* Return a pointer to a table of platform-specific string->int mappings. */
+extern struct int_symbol *platform_symbols(void);
+
+#endif /* __SYMBOLS_H__ */
diff --git a/test/packetdrill/symbols_freebsd.c b/test/packetdrill/symbols_freebsd.c
new file mode 100644
index 0000000..9f630f5
--- /dev/null
+++ b/test/packetdrill/symbols_freebsd.c
@@ -0,0 +1,310 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Definitions of strace-style symbols for FreeBSD.
+ * Allows us to map from symbolic strings to integers for system call inputs.
+ */
+
+#if defined(__FreeBSD__)
+
+#include "symbols.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <netinet/in.h>
+#include <netinet/udp.h>
+#include <poll.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/unistd.h>
+
+#include "tcp.h"
+
+/* A table of platform-specific string->int mappings, terminated by a { 0, NULL } entry. */
+struct int_symbol platform_symbols_table[] = {
+
+ /* /usr/include/sys/socket.h */
+ { SO_DEBUG, "SO_DEBUG" },
+ { SO_ACCEPTCONN, "SO_ACCEPTCONN" },
+ { SO_REUSEADDR, "SO_REUSEADDR" },
+ { SO_KEEPALIVE, "SO_KEEPALIVE" },
+ { SO_DONTROUTE, "SO_DONTROUTE" },
+ { SO_BROADCAST, "SO_BROADCAST" },
+ { SO_USELOOPBACK, "SO_USELOOPBACK" },
+ { SO_LINGER, "SO_LINGER" },
+ { SO_OOBINLINE, "SO_OOBINLINE" },
+ { SO_REUSEPORT, "SO_REUSEPORT" },
+ { SO_TIMESTAMP, "SO_TIMESTAMP" },
+ { SO_NOSIGPIPE, "SO_NOSIGPIPE" },
+ { SO_ACCEPTFILTER, "SO_ACCEPTFILTER" },
+ { SO_BINTIME, "SO_BINTIME" },
+ { SO_NO_OFFLOAD, "SO_NO_OFFLOAD" },
+ { SO_NO_DDP, "SO_NO_DDP" },
+ { SO_SNDBUF, "SO_SNDBUF" },
+ { SO_RCVBUF, "SO_RCVBUF" },
+ { SO_SNDLOWAT, "SO_SNDLOWAT" },
+ { SO_RCVLOWAT, "SO_RCVLOWAT" },
+ { SO_SNDTIMEO, "SO_SNDTIMEO" },
+ { SO_RCVTIMEO, "SO_RCVTIMEO" },
+ { SO_ERROR, "SO_ERROR" },
+ { SO_TYPE, "SO_TYPE" },
+ { SO_LABEL, "SO_LABEL" },
+ { SO_PEERLABEL, "SO_PEERLABEL" },
+ { SO_LISTENQLIMIT, "SO_LISTENQLIMIT" },
+ { SO_LISTENQLEN, "SO_LISTENQLEN" },
+ { SO_LISTENINCQLEN, "SO_LISTENINCQLEN" },
+ { SO_SETFIB, "SO_SETFIB" },
+ { SO_USER_COOKIE, "SO_USER_COOKIE" },
+
+ /* /usr/include/netinet/tcp.h */
+ { TCP_NODELAY, "TCP_NODELAY" },
+ { TCP_MAXSEG, "TCP_MAXSEG" },
+ { TCP_NOPUSH, "TCP_NOPUSH" },
+ { TCP_NOOPT, "TCP_NOOPT" },
+ { TCP_MD5SIG, "TCP_MD5SIG" },
+ { TCP_INFO, "TCP_INFO" },
+ { TCP_CONGESTION, "TCP_CONGESTION" },
+
+ /* /usr/include/sys/fcntl.h */
+ { O_RDONLY, "O_RDONLY" },
+ { O_WRONLY, "O_WRONLY" },
+ { O_RDWR, "O_RDWR" },
+ { O_ACCMODE, "O_ACCMODE" },
+ { FREAD, "FREAD" },
+ { FWRITE, "FWRITE" },
+ { O_NONBLOCK, "O_NONBLOCK" },
+ { O_APPEND, "O_APPEND" },
+ { O_SHLOCK, "O_SHLOCK" },
+ { O_EXLOCK, "O_EXLOCK" },
+ { O_ASYNC, "O_ASYNC" },
+ { O_FSYNC, "O_FSYNC" },
+ { O_SYNC, "O_SYNC" },
+ { O_NOFOLLOW, "O_NOFOLLOW" },
+ { O_CREAT, "O_CREAT" },
+ { O_TRUNC, "O_TRUNC" },
+ { O_EXCL, "O_EXCL" },
+ { O_NOCTTY, "O_NOCTTY" },
+ { O_DIRECT, "O_DIRECT" },
+ { O_DIRECTORY, "O_DIRECTORY" },
+ { O_EXEC, "O_EXEC" },
+ { O_TTY_INIT, "O_TTY_INIT" },
+ { O_CLOEXEC, "O_CLOEXEC" },
+ { FAPPEND, "FAPPEND" },
+ { FASYNC, "FASYNC" },
+ { FFSYNC, "FFSYNC" },
+ { FNONBLOCK, "FNONBLOCK" },
+ { FNDELAY, "FNDELAY" },
+ { O_NDELAY, "O_NDELAY" },
+ { FRDAHEAD, "FRDAHEAD" },
+ { AT_FDCWD, "AT_FDCWD" },
+ { AT_EACCESS, "AT_EACCESS" },
+ { AT_SYMLINK_NOFOLLOW, "AT_SYMLINK_NOFOLLOW" },
+ { AT_SYMLINK_FOLLOW, "AT_SYMLINK_FOLLOW" },
+ { AT_REMOVEDIR, "AT_REMOVEDIR" },
+ { F_DUPFD, "F_DUPFD" },
+ { F_GETFD, "F_GETFD" },
+ { F_SETFD, "F_SETFD" },
+ { F_GETFL, "F_GETFL" },
+ { F_SETFL, "F_SETFL" },
+ { F_GETOWN, "F_GETOWN" },
+ { F_SETOWN, "F_SETOWN" },
+ { F_OGETLK, "F_OGETLK" },
+ { F_OSETLK, "F_OSETLK" },
+ { F_OSETLKW, "F_OSETLKW" },
+ { F_DUP2FD, "F_DUP2FD" },
+ { F_GETLK, "F_GETLK" },
+ { F_SETLK, "F_SETLK" },
+ { F_SETLKW, "F_SETLKW" },
+ { F_SETLK_REMOTE, "F_SETLK_REMOTE" },
+ { F_READAHEAD, "F_READAHEAD" },
+ { F_RDAHEAD, "F_RDAHEAD" },
+ { FD_CLOEXEC, "FD_CLOEXEC" },
+ { F_RDLCK, "F_RDLCK" },
+ { F_UNLCK, "F_UNLCK" },
+ { F_WRLCK, "F_WRLCK" },
+ { F_UNLCKSYS, "F_UNLCKSYS" },
+ { F_CANCEL, "F_CANCEL" },
+ { LOCK_SH, "LOCK_SH" },
+ { LOCK_EX, "LOCK_EX" },
+ { LOCK_NB, "LOCK_NB" },
+ { LOCK_UN, "LOCK_UN" },
+
+ /* /usr/include/sys/unistd.h */
+ { SEEK_SET, "SEEK_SET" },
+ { SEEK_CUR, "SEEK_CUR" },
+ { SEEK_END, "SEEK_END" },
+
+ /* /usr/include/sys/socket.h */
+ { MSG_OOB, "MSG_OOB" },
+ { MSG_PEEK, "MSG_PEEK" },
+ { MSG_DONTROUTE, "MSG_DONTROUTE" },
+ { MSG_EOR, "MSG_EOR" },
+ { MSG_TRUNC, "MSG_TRUNC" },
+ { MSG_CTRUNC, "MSG_CTRUNC" },
+ { MSG_WAITALL, "MSG_WAITALL" },
+ { MSG_NOTIFICATION, "MSG_NOTIFICATION" },
+ { MSG_DONTWAIT, "MSG_DONTWAIT" },
+ { MSG_EOF, "MSG_EOF" },
+ { MSG_NBIO, "MSG_NBIO" },
+ { MSG_COMPAT, "MSG_COMPAT" },
+ { MSG_NOSIGNAL, "MSG_NOSIGNAL" },
+
+ /* /usr/include/sys/filio.h */
+ { FIOCLEX, "FIOCLEX" },
+ { FIONCLEX, "FIONCLEX" },
+ { FIONREAD, "FIONREAD" },
+ { FIONBIO, "FIONBIO" },
+ { FIOASYNC, "FIOASYNC" },
+ { FIOSETOWN, "FIOSETOWN" },
+ { FIOGETOWN, "FIOGETOWN" },
+ { FIODTYPE, "FIODTYPE" },
+ { FIOGETLBA, "FIOGETLBA" },
+ { FIODGNAME, "FIODGNAME" },
+ { FIONWRITE, "FIONWRITE" },
+ { FIONSPACE, "FIONSPACE" },
+ { FIOSEEKDATA, "FIOSEEKDATA" },
+ { FIOSEEKHOLE, "FIOSEEKHOLE" },
+
+ /* /usr/include/sys/poll.h */
+ { POLLIN, "POLLIN" },
+ { POLLPRI, "POLLPRI" },
+ { POLLOUT, "POLLOUT" },
+ { POLLRDNORM, "POLLRDNORM" },
+ { POLLWRNORM, "POLLWRNORM" },
+ { POLLRDBAND, "POLLRDBAND" },
+ { POLLWRBAND, "POLLWRBAND" },
+ { POLLINIGNEOF, "POLLINIGNEOF" },
+ { POLLERR, "POLLERR" },
+ { POLLHUP, "POLLHUP" },
+ { POLLNVAL, "POLLNVAL" },
+
+ /* /usr/include/sys/errno.h */
+ { EPERM, "EPERM" },
+ { ENOENT, "ENOENT" },
+ { ESRCH, "ESRCH" },
+ { EINTR, "EINTR" },
+ { EIO, "EIO" },
+ { ENXIO, "ENXIO" },
+ { E2BIG, "E2BIG" },
+ { ENOEXEC, "ENOEXEC" },
+ { EBADF, "EBADF" },
+ { ECHILD, "ECHILD" },
+ { EDEADLK, "EDEADLK" },
+ { ENOMEM, "ENOMEM" },
+ { EACCES, "EACCES" },
+ { EFAULT, "EFAULT" },
+ { ENOTBLK, "ENOTBLK" },
+ { EBUSY, "EBUSY" },
+ { EEXIST, "EEXIST" },
+ { EXDEV, "EXDEV" },
+ { ENODEV, "ENODEV" },
+ { ENOTDIR, "ENOTDIR" },
+ { EISDIR, "EISDIR" },
+ { EINVAL, "EINVAL" },
+ { ENFILE, "ENFILE" },
+ { EMFILE, "EMFILE" },
+ { ENOTTY, "ENOTTY" },
+ { ETXTBSY, "ETXTBSY" },
+ { EFBIG, "EFBIG" },
+ { ENOSPC, "ENOSPC" },
+ { ESPIPE, "ESPIPE" },
+ { EROFS, "EROFS" },
+ { EMLINK, "EMLINK" },
+ { EPIPE, "EPIPE" },
+ { EDOM, "EDOM" },
+ { ERANGE, "ERANGE" },
+ { EAGAIN, "EAGAIN" },
+ { EWOULDBLOCK, "EWOULDBLOCK" },
+ { EINPROGRESS, "EINPROGRESS" },
+ { EALREADY, "EALREADY" },
+ { ENOTSOCK, "ENOTSOCK" },
+ { EDESTADDRREQ, "EDESTADDRREQ" },
+ { EMSGSIZE, "EMSGSIZE" },
+ { EPROTOTYPE, "EPROTOTYPE" },
+ { ENOPROTOOPT, "ENOPROTOOPT" },
+ { EPROTONOSUPPORT, "EPROTONOSUPPORT" },
+ { ESOCKTNOSUPPORT, "ESOCKTNOSUPPORT" },
+ { EOPNOTSUPP, "EOPNOTSUPP" },
+ { ENOTSUP, "ENOTSUP" },
+ { EPFNOSUPPORT, "EPFNOSUPPORT" },
+ { EAFNOSUPPORT, "EAFNOSUPPORT" },
+ { EADDRINUSE, "EADDRINUSE" },
+ { EADDRNOTAVAIL, "EADDRNOTAVAIL" },
+ { ENETDOWN, "ENETDOWN" },
+ { ENETUNREACH, "ENETUNREACH" },
+ { ENETRESET, "ENETRESET" },
+ { ECONNABORTED, "ECONNABORTED" },
+ { ECONNRESET, "ECONNRESET" },
+ { ENOBUFS, "ENOBUFS" },
+ { EISCONN, "EISCONN" },
+ { ENOTCONN, "ENOTCONN" },
+ { ESHUTDOWN, "ESHUTDOWN" },
+ { ETOOMANYREFS, "ETOOMANYREFS" },
+ { ETIMEDOUT, "ETIMEDOUT" },
+ { ECONNREFUSED, "ECONNREFUSED" },
+ { ELOOP, "ELOOP" },
+ { ENAMETOOLONG, "ENAMETOOLONG" },
+ { EHOSTDOWN, "EHOSTDOWN" },
+ { EHOSTUNREACH, "EHOSTUNREACH" },
+ { ENOTEMPTY, "ENOTEMPTY" },
+ { EPROCLIM, "EPROCLIM" },
+ { EUSERS, "EUSERS" },
+ { EDQUOT, "EDQUOT" },
+ { ESTALE, "ESTALE" },
+ { EREMOTE, "EREMOTE" },
+ { EBADRPC, "EBADRPC" },
+ { ERPCMISMATCH, "ERPCMISMATCH" },
+ { EPROGUNAVAIL, "EPROGUNAVAIL" },
+ { EPROGMISMATCH, "EPROGMISMATCH" },
+ { EPROCUNAVAIL, "EPROCUNAVAIL" },
+ { ENOLCK, "ENOLCK" },
+ { ENOSYS, "ENOSYS" },
+ { EFTYPE, "EFTYPE" },
+ { EAUTH, "EAUTH" },
+ { ENEEDAUTH, "ENEEDAUTH" },
+ { EIDRM, "EIDRM" },
+ { ENOMSG, "ENOMSG" },
+ { EOVERFLOW, "EOVERFLOW" },
+ { ECANCELED, "ECANCELED" },
+ { EILSEQ, "EILSEQ" },
+ { ENOATTR, "ENOATTR" },
+ { EDOOFUS, "EDOOFUS" },
+ { EBADMSG, "EBADMSG" },
+ { EMULTIHOP, "EMULTIHOP" },
+ { ENOLINK, "ENOLINK" },
+ { EPROTO, "EPROTO" },
+ { ENOTCAPABLE, "ENOTCAPABLE" },
+ { ECAPMODE, "ECAPMODE" },
+
+ /* Sentinel marking the end of the table. */
+ { 0, NULL },
+};
+
+struct int_symbol *platform_symbols(void)
+{
+	return &platform_symbols_table[0]; /* table ends with a NULL-name entry */
+}
+
+#endif /* __FreeBSD__ */
diff --git a/test/packetdrill/symbols_linux.c b/test/packetdrill/symbols_linux.c
new file mode 100644
index 0000000..bc684ff
--- /dev/null
+++ b/test/packetdrill/symbols_linux.c
@@ -0,0 +1,502 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Definitions of strace-style symbols for Linux.
+ * Allows us to map from symbolic strings to integers for system call inputs.
+ */
+
+#if linux
+
+#include "symbols.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <netinet/in.h>
+#include <netinet/udp.h>
+#include <poll.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/unistd.h>
+#include <sys/epoll.h>
+
+#include <linux/sockios.h>
+#include <linux/capability.h>
+
+#include "epoll.h"
+#include "ip.h"
+#include "tcp.h"
+#include "capability.h"
+
+/*
+ * A table of platform-specific string->int mappings for Linux.
+ *
+ * Each entry pairs an integer constant with its strace-style name, and the
+ * table is terminated by a { 0, NULL } sentinel.  Every name is listed
+ * once; where two names share a value the entries keep their original
+ * relative order.
+ */
+struct int_symbol platform_symbols_table[] = {
+	/* Socket option levels. */
+	{ SOL_IP, "SOL_IP" },
+	{ SOL_IPV6, "SOL_IPV6" },
+	{ SOL_TCP, "SOL_TCP" },
+	{ SOL_UDP, "SOL_UDP" },
+
+	/* SO_* socket options. */
+	{ SO_ACCEPTCONN, "SO_ACCEPTCONN" },
+	{ SO_ATTACH_FILTER, "SO_ATTACH_FILTER" },
+	{ SO_BINDTODEVICE, "SO_BINDTODEVICE" },
+	{ SO_BROADCAST, "SO_BROADCAST" },
+	{ SO_BSDCOMPAT, "SO_BSDCOMPAT" },
+	{ SO_DEBUG, "SO_DEBUG" },
+	{ SO_DETACH_FILTER, "SO_DETACH_FILTER" },
+	{ SO_DONTROUTE, "SO_DONTROUTE" },
+	{ SO_ERROR, "SO_ERROR" },
+	{ SO_KEEPALIVE, "SO_KEEPALIVE" },
+	{ SO_LINGER, "SO_LINGER" },
+	{ SO_MARK, "SO_MARK" },
+	{ SO_NO_CHECK, "SO_NO_CHECK" },
+	{ SO_OOBINLINE, "SO_OOBINLINE" },
+	{ SO_PASSCRED, "SO_PASSCRED" },
+	{ SO_PEERCRED, "SO_PEERCRED" },
+	{ SO_PEERNAME, "SO_PEERNAME" },
+	{ SO_PEERSEC, "SO_PEERSEC" },
+	{ SO_PRIORITY, "SO_PRIORITY" },
+	{ SO_RCVBUF, "SO_RCVBUF" },
+	{ SO_RCVLOWAT, "SO_RCVLOWAT" },
+	{ SO_RCVTIMEO, "SO_RCVTIMEO" },
+	{ SO_REUSEADDR, "SO_REUSEADDR" },
+	{ SO_REUSEPORT, "SO_REUSEPORT" },
+	{ SO_SECURITY_AUTHENTICATION, "SO_SECURITY_AUTHENTICATION" },
+	{ SO_SECURITY_ENCRYPTION_NETWORK, "SO_SECURITY_ENCRYPTION_NETWORK" },
+	{ SO_SECURITY_ENCRYPTION_TRANSPORT, "SO_SECURITY_ENCRYPTION_TRANSPORT" },
+	{ SO_SNDBUF, "SO_SNDBUF" },
+	{ SO_SNDLOWAT, "SO_SNDLOWAT" },
+	{ SO_SNDTIMEO, "SO_SNDTIMEO" },
+	{ SO_TIMESTAMP, "SO_TIMESTAMP" },
+	{ SO_TIMESTAMPING, "SO_TIMESTAMPING" },
+	{ SO_TYPE, "SO_TYPE" },
+	{ SO_MAX_PACING_RATE, "SO_MAX_PACING_RATE" },
+	{ SO_ZEROCOPY, "SO_ZEROCOPY" },
+
+	{ SO_EE_ORIGIN_TIMESTAMPING, "SO_EE_ORIGIN_TIMESTAMPING" },
+	{ SO_EE_ORIGIN_ZEROCOPY, "SO_EE_ORIGIN_ZEROCOPY" },
+
+	{ SO_EE_CODE_ZEROCOPY_COPIED, "SO_EE_CODE_ZEROCOPY_COPIED" },
+
+	/* Timestamping flags and control-message types. */
+	{ SOF_TIMESTAMPING_TX_HARDWARE, "SOF_TIMESTAMPING_TX_HARDWARE" },
+	{ SOF_TIMESTAMPING_TX_SOFTWARE, "SOF_TIMESTAMPING_TX_SOFTWARE" },
+	{ SOF_TIMESTAMPING_RX_HARDWARE, "SOF_TIMESTAMPING_RX_HARDWARE" },
+	{ SOF_TIMESTAMPING_RX_SOFTWARE, "SOF_TIMESTAMPING_RX_SOFTWARE" },
+	{ SOF_TIMESTAMPING_SOFTWARE, "SOF_TIMESTAMPING_SOFTWARE" },
+	{ SOF_TIMESTAMPING_SYS_HARDWARE, "SOF_TIMESTAMPING_SYS_HARDWARE" },
+	{ SOF_TIMESTAMPING_RAW_HARDWARE, "SOF_TIMESTAMPING_RAW_HARDWARE" },
+	{ SOF_TIMESTAMPING_OPT_ID, "SOF_TIMESTAMPING_OPT_ID" },
+	{ SOF_TIMESTAMPING_TX_SCHED, "SOF_TIMESTAMPING_TX_SCHED" },
+	{ SOF_TIMESTAMPING_TX_ACK, "SOF_TIMESTAMPING_TX_ACK" },
+	{ SOF_TIMESTAMPING_OPT_CMSG, "SOF_TIMESTAMPING_OPT_CMSG" },
+	{ SOF_TIMESTAMPING_OPT_TSONLY, "SOF_TIMESTAMPING_OPT_TSONLY" },
+	{ SOF_TIMESTAMPING_OPT_STATS, "SOF_TIMESTAMPING_OPT_STATS" },
+
+	{ SCM_TIMESTAMPING, "SCM_TIMESTAMPING" },
+	{ SCM_TSTAMP_SND, "SCM_TSTAMP_SND" },
+	{ SCM_TSTAMP_SCHED, "SCM_TSTAMP_SCHED" },
+	{ SCM_TSTAMP_ACK, "SCM_TSTAMP_ACK" },
+	{ SCM_TIMESTAMPING_OPT_STATS, "SCM_TIMESTAMPING_OPT_STATS" },
+
+	/*
+	 * The constants below carry a leading underscore while the script
+	 * names do not (presumably they come from the local "tcp.h" --
+	 * TODO confirm).
+	 */
+	{ _TCP_NLA_BUSY, "TCP_NLA_BUSY" },
+	{ _TCP_NLA_RWND_LIMITED, "TCP_NLA_RWND_LIMITED" },
+	{ _TCP_NLA_SNDBUF_LIMITED, "TCP_NLA_SNDBUF_LIMITED" },
+	{ _TCP_NLA_DATA_SEGS_OUT, "TCP_NLA_DATA_SEGS_OUT" },
+	{ _TCP_NLA_TOTAL_RETRANS, "TCP_NLA_TOTAL_RETRANS" },
+	{ _TCP_NLA_PACING_RATE, "TCP_NLA_PACING_RATE" },
+	{ _TCP_NLA_DELIVERY_RATE, "TCP_NLA_DELIVERY_RATE" },
+	{ _TCP_NLA_SND_CWND, "TCP_NLA_SND_CWND" },
+	{ _TCP_NLA_REORDERING, "TCP_NLA_REORDERING" },
+	{ _TCP_NLA_MIN_RTT, "TCP_NLA_MIN_RTT" },
+	{ _TCP_NLA_RECUR_RETRANS, "TCP_NLA_RECUR_RETRANS" },
+	{ _TCP_NLA_DELIVERY_RATE_APP_LMT, "TCP_NLA_DELIVERY_RATE_APP_LMT" },
+	{ _TCP_NLA_SNDQ_SIZE, "TCP_NLA_SNDQ_SIZE" },
+	{ _TCP_NLA_CA_STATE, "TCP_NLA_CA_STATE" },
+
+	{ _TCP_CA_Open, "TCP_CA_OPEN" },
+	{ _TCP_CA_Disorder, "TCP_CA_DISORDER" },
+	{ _TCP_CA_CWR, "TCP_CA_CWR" },
+	{ _TCP_CA_Recovery, "TCP_CA_RECOVERY" },
+	{ _TCP_CA_Loss, "TCP_CA_LOSS" },
+
+	/* IP_* and IPV6_* socket options. */
+	{ IP_TOS, "IP_TOS" },
+	{ IP_MTU_DISCOVER, "IP_MTU_DISCOVER" },
+	{ IPV6_MTU_DISCOVER, "IPV6_MTU_DISCOVER" },
+	{ IP_PMTUDISC_WANT, "IP_PMTUDISC_WANT" },
+	{ IP_PMTUDISC_DONT, "IP_PMTUDISC_DONT" },
+	{ IP_PMTUDISC_DO, "IP_PMTUDISC_DO" },
+	{ IP_PMTUDISC_PROBE, "IP_PMTUDISC_PROBE" },
+	{ IP_RECVERR, "IP_RECVERR" },
+	{ IPV6_RECVERR, "IPV6_RECVERR" },
+	{ IP_FREEBIND, "IP_FREEBIND" },
+	{ IP_TTL, "IP_TTL" },
+#ifdef IP_MTU
+	{ IP_MTU, "IP_MTU" },
+#endif
+#ifdef IPV6_MTU
+	{ IPV6_MTU, "IPV6_MTU" },
+#endif
+	{ IPV6_TCLASS, "IPV6_TCLASS" },
+	{ IPV6_HOPLIMIT, "IPV6_HOPLIMIT" },
+
+	/* TCP_* socket options. */
+	{ TCP_NODELAY, "TCP_NODELAY" },
+	{ TCP_MAXSEG, "TCP_MAXSEG" },
+	{ TCP_CORK, "TCP_CORK" },
+	{ TCP_KEEPIDLE, "TCP_KEEPIDLE" },
+	{ TCP_KEEPINTVL, "TCP_KEEPINTVL" },
+	{ TCP_KEEPCNT, "TCP_KEEPCNT" },
+	{ TCP_SYNCNT, "TCP_SYNCNT" },
+	{ TCP_LINGER2, "TCP_LINGER2" },
+	{ TCP_DEFER_ACCEPT, "TCP_DEFER_ACCEPT" },
+	{ TCP_INFO, "TCP_INFO" },
+	{ TCP_QUICKACK, "TCP_QUICKACK" },
+	{ TCP_CONGESTION, "TCP_CONGESTION" },
+	{ TCP_MD5SIG, "TCP_MD5SIG" },
+	{ TCP_COOKIE_TRANSACTIONS, "TCP_COOKIE_TRANSACTIONS" },
+	{ TCP_THIN_LINEAR_TIMEOUTS, "TCP_THIN_LINEAR_TIMEOUTS" },
+	{ TCP_THIN_DUPACK, "TCP_THIN_DUPACK" },
+	{ TCP_USER_TIMEOUT, "TCP_USER_TIMEOUT" },
+	{ TCP_FASTOPEN, "TCP_FASTOPEN" },
+	{ TCP_FASTOPEN_CONNECT, "TCP_FASTOPEN_CONNECT" },
+	{ TCP_TIMESTAMP, "TCP_TIMESTAMP" },
+	{ TCP_NOTSENT_LOWAT, "TCP_NOTSENT_LOWAT" },
+
+	/* open(2) flags. */
+	{ O_RDONLY, "O_RDONLY" },
+	{ O_WRONLY, "O_WRONLY" },
+	{ O_RDWR, "O_RDWR" },
+	{ O_ACCMODE, "O_ACCMODE" },
+	{ O_CREAT, "O_CREAT" },
+	{ O_EXCL, "O_EXCL" },
+	{ O_NOCTTY, "O_NOCTTY" },
+	{ O_TRUNC, "O_TRUNC" },
+	{ O_APPEND, "O_APPEND" },
+	{ O_NONBLOCK, "O_NONBLOCK" },
+
+	/* fcntl(2) commands and flags; each command is listed once. */
+	{ F_DUPFD, "F_DUPFD" },
+	{ F_GETFD, "F_GETFD" },
+	{ F_SETFD, "F_SETFD" },
+	{ F_GETFL, "F_GETFL" },
+	{ F_SETFL, "F_SETFL" },
+	{ F_GETLK, "F_GETLK" },
+	{ F_SETLK, "F_SETLK" },
+	{ F_SETLKW, "F_SETLKW" },
+	{ F_GETOWN, "F_GETOWN" },
+	{ F_SETOWN, "F_SETOWN" },
+	{ F_SETSIG, "F_SETSIG" },
+	{ F_GETSIG, "F_GETSIG" },
+	{ F_SETLK64, "F_SETLK64" },
+	{ F_SETLKW64, "F_SETLKW64" },
+	{ F_GETLK64, "F_GETLK64" },
+	{ F_SETLEASE, "F_SETLEASE" },
+	{ F_GETLEASE, "F_GETLEASE" },
+	{ F_NOTIFY, "F_NOTIFY" },
+	{ F_DUPFD_CLOEXEC, "F_DUPFD_CLOEXEC" },
+	{ FD_CLOEXEC, "FD_CLOEXEC" },
+
+	{ LOCK_SH, "LOCK_SH" },
+	{ LOCK_EX, "LOCK_EX" },
+	{ LOCK_NB, "LOCK_NB" },
+	{ LOCK_UN, "LOCK_UN" },
+
+	{ F_RDLCK, "F_RDLCK" },
+	{ F_WRLCK, "F_WRLCK" },
+	{ F_UNLCK, "F_UNLCK" },
+	{ F_EXLCK, "F_EXLCK" },
+	{ F_SHLCK, "F_SHLCK" },
+
+	{ SEEK_SET, "SEEK_SET" },
+	{ SEEK_CUR, "SEEK_CUR" },
+	{ SEEK_END, "SEEK_END" },
+
+	/* send/recv MSG_* flags; each flag is listed once. */
+	{ MSG_OOB, "MSG_OOB" },
+	{ MSG_DONTROUTE, "MSG_DONTROUTE" },
+	{ MSG_PEEK, "MSG_PEEK" },
+	{ MSG_CTRUNC, "MSG_CTRUNC" },
+	{ MSG_PROXY, "MSG_PROXY" },
+	{ MSG_EOR, "MSG_EOR" },
+	{ MSG_WAITALL, "MSG_WAITALL" },
+	{ MSG_TRUNC, "MSG_TRUNC" },
+	{ MSG_ERRQUEUE, "MSG_ERRQUEUE" },
+	{ MSG_DONTWAIT, "MSG_DONTWAIT" },
+	{ MSG_CONFIRM, "MSG_CONFIRM" },
+	{ MSG_FIN, "MSG_FIN" },
+	{ MSG_SYN, "MSG_SYN" },
+	{ MSG_RST, "MSG_RST" },
+	{ MSG_NOSIGNAL, "MSG_NOSIGNAL" },
+	{ MSG_MORE, "MSG_MORE" },
+	{ MSG_CMSG_CLOEXEC, "MSG_CMSG_CLOEXEC" },
+	{ MSG_FASTOPEN, "MSG_FASTOPEN" },
+	{ MSG_ZEROCOPY, "MSG_ZEROCOPY" },
+
+	/* ioctl(2) requests, present only when the headers define them. */
+#ifdef SIOCINQ
+	{ SIOCINQ, "SIOCINQ" },
+#endif
+
+#ifdef FIONREAD
+	{ FIONREAD, "FIONREAD" },
+#endif
+
+	/* poll(2) event bits; the optional ones are guarded individually. */
+	{ POLLIN, "POLLIN" },
+	{ POLLPRI, "POLLPRI" },
+	{ POLLOUT, "POLLOUT" },
+#ifdef POLLRDNORM
+	{ POLLRDNORM, "POLLRDNORM" },
+#endif
+#ifdef POLLRDBAND
+	{ POLLRDBAND, "POLLRDBAND" },
+#endif
+#ifdef POLLWRNORM
+	{ POLLWRNORM, "POLLWRNORM" },
+#endif
+#ifdef POLLWRBAND
+	{ POLLWRBAND, "POLLWRBAND" },
+#endif
+
+#ifdef POLLMSG
+	{ POLLMSG, "POLLMSG" },
+#endif
+#ifdef POLLREMOVE
+	{ POLLREMOVE, "POLLREMOVE" },
+#endif
+#ifdef POLLRDHUP
+	{ POLLRDHUP, "POLLRDHUP" },
+#endif
+	{ POLLERR, "POLLERR" },
+	{ POLLHUP, "POLLHUP" },
+	{ POLLNVAL, "POLLNVAL" },
+
+	/* errno values. */
+	{ EPERM, "EPERM" },
+	{ ENOENT, "ENOENT" },
+	{ ESRCH, "ESRCH" },
+	{ EINTR, "EINTR" },
+	{ EIO, "EIO" },
+	{ ENXIO, "ENXIO" },
+	{ E2BIG, "E2BIG" },
+	{ ENOEXEC, "ENOEXEC" },
+	{ EBADF, "EBADF" },
+	{ ECHILD, "ECHILD" },
+	{ EAGAIN, "EAGAIN" },
+	{ ENOMEM, "ENOMEM" },
+	{ EACCES, "EACCES" },
+	{ EFAULT, "EFAULT" },
+	{ ENOTBLK, "ENOTBLK" },
+	{ EBUSY, "EBUSY" },
+	{ EEXIST, "EEXIST" },
+	{ EXDEV, "EXDEV" },
+	{ ENODEV, "ENODEV" },
+	{ ENOTDIR, "ENOTDIR" },
+	{ EISDIR, "EISDIR" },
+	{ EINVAL, "EINVAL" },
+	{ ENFILE, "ENFILE" },
+	{ EMFILE, "EMFILE" },
+	{ ENOTTY, "ENOTTY" },
+	{ ETXTBSY, "ETXTBSY" },
+	{ EFBIG, "EFBIG" },
+	{ ENOSPC, "ENOSPC" },
+	{ ESPIPE, "ESPIPE" },
+	{ EROFS, "EROFS" },
+	{ EMLINK, "EMLINK" },
+	{ EPIPE, "EPIPE" },
+	{ EDOM, "EDOM" },
+	{ ERANGE, "ERANGE" },
+	{ EDEADLK, "EDEADLK" },
+	{ ENAMETOOLONG, "ENAMETOOLONG" },
+	{ ENOLCK, "ENOLCK" },
+	{ ENOSYS, "ENOSYS" },
+	{ ENOTEMPTY, "ENOTEMPTY" },
+	{ ELOOP, "ELOOP" },
+	{ EWOULDBLOCK, "EWOULDBLOCK" },
+	{ ENOMSG, "ENOMSG" },
+	{ EIDRM, "EIDRM" },
+	{ ECHRNG, "ECHRNG" },
+	{ EL2NSYNC, "EL2NSYNC" },
+	{ EL3HLT, "EL3HLT" },
+	{ EL3RST, "EL3RST" },
+	{ ELNRNG, "ELNRNG" },
+	{ EUNATCH, "EUNATCH" },
+	{ ENOCSI, "ENOCSI" },
+	{ EL2HLT, "EL2HLT" },
+	{ EBADE, "EBADE" },
+	{ EBADR, "EBADR" },
+	{ EXFULL, "EXFULL" },
+	{ ENOANO, "ENOANO" },
+	{ EBADRQC, "EBADRQC" },
+	{ EBADSLT, "EBADSLT" },
+	{ EDEADLOCK, "EDEADLOCK" },
+	{ EBFONT, "EBFONT" },
+	{ ENOSTR, "ENOSTR" },
+	{ ENODATA, "ENODATA" },
+	{ ETIME, "ETIME" },
+	{ ENOSR, "ENOSR" },
+	{ ENONET, "ENONET" },
+	{ ENOPKG, "ENOPKG" },
+	{ EREMOTE, "EREMOTE" },
+	{ ENOLINK, "ENOLINK" },
+	{ EADV, "EADV" },
+	{ ESRMNT, "ESRMNT" },
+	{ ECOMM, "ECOMM" },
+	{ EPROTO, "EPROTO" },
+	{ EMULTIHOP, "EMULTIHOP" },
+	{ EDOTDOT, "EDOTDOT" },
+	{ EBADMSG, "EBADMSG" },
+	{ EOVERFLOW, "EOVERFLOW" },
+	{ ENOTUNIQ, "ENOTUNIQ" },
+	{ EBADFD, "EBADFD" },
+	{ EREMCHG, "EREMCHG" },
+	{ ELIBACC, "ELIBACC" },
+	{ ELIBBAD, "ELIBBAD" },
+	{ ELIBSCN, "ELIBSCN" },
+	{ ELIBMAX, "ELIBMAX" },
+	{ ELIBEXEC, "ELIBEXEC" },
+	{ EILSEQ, "EILSEQ" },
+	{ ERESTART, "ERESTART" },
+	{ ESTRPIPE, "ESTRPIPE" },
+	{ EUSERS, "EUSERS" },
+	{ ENOTSOCK, "ENOTSOCK" },
+	{ EDESTADDRREQ, "EDESTADDRREQ" },
+	{ EMSGSIZE, "EMSGSIZE" },
+	{ EPROTOTYPE, "EPROTOTYPE" },
+	{ ENOPROTOOPT, "ENOPROTOOPT" },
+	{ EPROTONOSUPPORT, "EPROTONOSUPPORT" },
+	{ ESOCKTNOSUPPORT, "ESOCKTNOSUPPORT" },
+	{ EOPNOTSUPP, "EOPNOTSUPP" },
+	/* Kept after EOPNOTSUPP, as in the BSD tables; glibc aliases the two. */
+	{ ENOTSUP, "ENOTSUP" },
+	{ EPFNOSUPPORT, "EPFNOSUPPORT" },
+	{ EAFNOSUPPORT, "EAFNOSUPPORT" },
+	{ EADDRINUSE, "EADDRINUSE" },
+	{ EADDRNOTAVAIL, "EADDRNOTAVAIL" },
+	{ ENETDOWN, "ENETDOWN" },
+	{ ENETUNREACH, "ENETUNREACH" },
+	{ ENETRESET, "ENETRESET" },
+	{ ECONNABORTED, "ECONNABORTED" },
+	{ ECONNRESET, "ECONNRESET" },
+	{ ENOBUFS, "ENOBUFS" },
+	{ EISCONN, "EISCONN" },
+	{ ENOTCONN, "ENOTCONN" },
+	{ ESHUTDOWN, "ESHUTDOWN" },
+	{ ETOOMANYREFS, "ETOOMANYREFS" },
+	{ ETIMEDOUT, "ETIMEDOUT" },
+	{ ECONNREFUSED, "ECONNREFUSED" },
+	{ EHOSTDOWN, "EHOSTDOWN" },
+	{ EHOSTUNREACH, "EHOSTUNREACH" },
+	{ EALREADY, "EALREADY" },
+	{ EINPROGRESS, "EINPROGRESS" },
+	{ ESTALE, "ESTALE" },
+	{ EUCLEAN, "EUCLEAN" },
+	{ ENOTNAM, "ENOTNAM" },
+	{ ENAVAIL, "ENAVAIL" },
+	{ EISNAM, "EISNAM" },
+	{ EREMOTEIO, "EREMOTEIO" },
+	{ EDQUOT, "EDQUOT" },
+	{ ENOMEDIUM, "ENOMEDIUM" },
+	{ EMEDIUMTYPE, "EMEDIUMTYPE" },
+	{ ECANCELED, "ECANCELED" },
+	{ ENOKEY, "ENOKEY" },
+	{ EKEYEXPIRED, "EKEYEXPIRED" },
+	{ EKEYREVOKED, "EKEYREVOKED" },
+	{ EKEYREJECTED, "EKEYREJECTED" },
+	{ EOWNERDEAD, "EOWNERDEAD" },
+	{ ENOTRECOVERABLE, "ENOTRECOVERABLE" },
+	{ ERFKILL, "ERFKILL" },
+
+	/* cap_flag */
+	{ CAP_EFFECTIVE, "CAP_EFFECTIVE" },
+	{ CAP_PERMITTED, "CAP_PERMITTED" },
+	{ CAP_INHERITABLE, "CAP_INHERITABLE" },
+	/* cap_option */
+	{ CAP_SET, "CAP_SET" },
+	{ CAP_CLEAR, "CAP_CLEAR" },
+	/* linux capabilities */
+	{ CAP_CHOWN, "CAP_CHOWN" },
+	{ CAP_DAC_OVERRIDE, "CAP_DAC_OVERRIDE" },
+	{ CAP_DAC_READ_SEARCH, "CAP_DAC_READ_SEARCH" },
+	{ CAP_FOWNER, "CAP_FOWNER" },
+	{ CAP_FSETID, "CAP_FSETID" },
+	{ CAP_KILL, "CAP_KILL" },
+	{ CAP_SETGID, "CAP_SETGID" },
+	{ CAP_SETUID, "CAP_SETUID" },
+	{ CAP_SETPCAP, "CAP_SETPCAP" },
+	{ CAP_LINUX_IMMUTABLE, "CAP_LINUX_IMMUTABLE" },
+	{ CAP_NET_BIND_SERVICE, "CAP_NET_BIND_SERVICE" },
+	{ CAP_NET_BROADCAST, "CAP_NET_BROADCAST" },
+	{ CAP_NET_ADMIN, "CAP_NET_ADMIN" },
+	{ CAP_NET_RAW, "CAP_NET_RAW" },
+	{ CAP_IPC_LOCK, "CAP_IPC_LOCK" },
+	{ CAP_IPC_OWNER, "CAP_IPC_OWNER" },
+	{ CAP_SYS_MODULE, "CAP_SYS_MODULE" },
+	{ CAP_SYS_RAWIO, "CAP_SYS_RAWIO" },
+	{ CAP_SYS_CHROOT, "CAP_SYS_CHROOT" },
+	{ CAP_SYS_PTRACE, "CAP_SYS_PTRACE" },
+	{ CAP_SYS_PACCT, "CAP_SYS_PACCT" },
+	{ CAP_SYS_ADMIN, "CAP_SYS_ADMIN" },
+	{ CAP_SYS_BOOT, "CAP_SYS_BOOT" },
+	{ CAP_SYS_NICE, "CAP_SYS_NICE" },
+	{ CAP_SYS_RESOURCE, "CAP_SYS_RESOURCE" },
+	{ CAP_SYS_TIME, "CAP_SYS_TIME" },
+	{ CAP_SYS_TTY_CONFIG, "CAP_SYS_TTY_CONFIG" },
+	{ CAP_MKNOD, "CAP_MKNOD" },
+	{ CAP_LEASE, "CAP_LEASE" },
+	{ CAP_AUDIT_WRITE, "CAP_AUDIT_WRITE" },
+	{ CAP_AUDIT_CONTROL, "CAP_AUDIT_CONTROL" },
+	{ CAP_SETFCAP, "CAP_SETFCAP" },
+	{ CAP_MAC_OVERRIDE, "CAP_MAC_OVERRIDE" },
+	{ CAP_MAC_ADMIN, "CAP_MAC_ADMIN" },
+	{ CAP_SYSLOG, "CAP_SYSLOG" },
+	{ CAP_WAKE_ALARM, "CAP_WAKE_ALARM" },
+	{ CAP_BLOCK_SUSPEND, "CAP_BLOCK_SUSPEND" },
+
+	/* epoll event bits and epoll_ctl(2) operations. */
+	{ EPOLLIN, "EPOLLIN" },
+	{ EPOLLPRI, "EPOLLPRI" },
+	{ EPOLLOUT, "EPOLLOUT" },
+	{ EPOLLRDNORM, "EPOLLRDNORM" },
+	{ EPOLLRDBAND, "EPOLLRDBAND" },
+	{ EPOLLWRNORM, "EPOLLWRNORM" },
+	{ EPOLLWRBAND, "EPOLLWRBAND" },
+	{ EPOLLMSG, "EPOLLMSG" },
+	{ EPOLLERR, "EPOLLERR" },
+	{ EPOLLHUP, "EPOLLHUP" },
+	{ EPOLLRDHUP, "EPOLLRDHUP" },
+	{ EPOLLONESHOT, "EPOLLONESHOT" },
+	{ EPOLLET, "EPOLLET" },
+	{ EPOLLEXCLUSIVE, "EPOLLEXCLUSIVE" },
+	{ EPOLL_CTL_ADD, "EPOLL_CTL_ADD" },
+	{ EPOLL_CTL_MOD, "EPOLL_CTL_MOD" },
+	{ EPOLL_CTL_DEL, "EPOLL_CTL_DEL" },
+
+	/* splice(2) flags. */
+	{ SPLICE_F_MOVE, "SPLICE_F_MOVE" },
+	{ SPLICE_F_NONBLOCK, "SPLICE_F_NONBLOCK" },
+	{ SPLICE_F_MORE, "SPLICE_F_MORE" },
+	{ SPLICE_F_GIFT, "SPLICE_F_GIFT" },
+
+	{ AF_UNSPEC, "AF_UNSPEC" },
+
+	/* Sentinel marking the end of the table. */
+	{ 0, NULL },
+};
+
+/*
+ * Hand back the Linux symbol table defined above; its final entry is
+ * the { 0, NULL } sentinel.
+ */
+struct int_symbol *platform_symbols(void)
+{
+	struct int_symbol *table = platform_symbols_table;
+
+	return table;
+}
+
+#endif /* linux */
diff --git a/test/packetdrill/symbols_netbsd.c b/test/packetdrill/symbols_netbsd.c
new file mode 100644
index 0000000..2ae3a1b
--- /dev/null
+++ b/test/packetdrill/symbols_netbsd.c
@@ -0,0 +1,320 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Definitions of strace-style symbols for NetBSD.
+ * Allows us to map from symbolic strings to integers for system call inputs.
+ */
+
+#if defined(__NetBSD__)
+
+#include "symbols.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <netinet/in.h>
+#include <netinet/udp.h>
+#include <poll.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/unistd.h>
+
+#include "tcp.h"
+
+/*
+ * A table of platform-specific string->int mappings for NetBSD.
+ * Each entry pairs an integer constant with its strace-style name; the
+ * sections are labelled with the system header the names are taken from,
+ * and the table ends with a { 0, NULL } sentinel.
+ */
+struct int_symbol platform_symbols_table[] = {
+
+ /* SO_* socket options, from /usr/include/sys/socket.h */
+ { SO_DEBUG, "SO_DEBUG" },
+ { SO_ACCEPTCONN, "SO_ACCEPTCONN" },
+ { SO_REUSEADDR, "SO_REUSEADDR" },
+ { SO_KEEPALIVE, "SO_KEEPALIVE" },
+ { SO_DONTROUTE, "SO_DONTROUTE" },
+ { SO_BROADCAST, "SO_BROADCAST" },
+ { SO_USELOOPBACK, "SO_USELOOPBACK" },
+ { SO_LINGER, "SO_LINGER" },
+ { SO_OOBINLINE, "SO_OOBINLINE" },
+ { SO_REUSEPORT, "SO_REUSEPORT" },
+ { SO_NOSIGPIPE, "SO_NOSIGPIPE" },
+ { SO_ACCEPTFILTER, "SO_ACCEPTFILTER" },
+ { SO_TIMESTAMP, "SO_TIMESTAMP" },
+ { SO_SNDBUF, "SO_SNDBUF" },
+ { SO_RCVBUF, "SO_RCVBUF" },
+ { SO_SNDLOWAT, "SO_SNDLOWAT" },
+ { SO_RCVLOWAT, "SO_RCVLOWAT" },
+ { SO_ERROR, "SO_ERROR" },
+ { SO_TYPE, "SO_TYPE" },
+ { SO_OVERFLOWED, "SO_OVERFLOWED" },
+ { SO_NOHEADER, "SO_NOHEADER" },
+ { SO_SNDTIMEO, "SO_SNDTIMEO" },
+ { SO_RCVTIMEO, "SO_RCVTIMEO" },
+
+ /* TCP_* socket options, from /usr/include/netinet/tcp.h */
+ { TCP_NODELAY, "TCP_NODELAY" },
+ { TCP_MAXSEG, "TCP_MAXSEG" },
+ { TCP_KEEPIDLE, "TCP_KEEPIDLE" },
+ { TCP_KEEPINTVL, "TCP_KEEPINTVL" },
+ { TCP_KEEPCNT, "TCP_KEEPCNT" },
+ { TCP_KEEPINIT, "TCP_KEEPINIT" },
+ { TCP_MD5SIG, "TCP_MD5SIG" },
+ { TCP_CONGCTL, "TCP_CONGCTL" },
+
+ /* open/fcntl/flock/fadvise names, from /usr/include/sys/fcntl.h */
+ { O_RDONLY, "O_RDONLY" },
+ { O_WRONLY, "O_WRONLY" },
+ { O_RDWR, "O_RDWR" },
+ { O_ACCMODE, "O_ACCMODE" },
+ { FREAD, "FREAD" },
+ { FWRITE, "FWRITE" },
+ { O_NONBLOCK, "O_NONBLOCK" },
+ { O_APPEND, "O_APPEND" },
+ { O_SHLOCK, "O_SHLOCK" },
+ { O_EXLOCK, "O_EXLOCK" },
+ { O_ASYNC, "O_ASYNC" },
+ { O_SYNC, "O_SYNC" },
+ { O_NOFOLLOW, "O_NOFOLLOW" },
+ { O_CREAT, "O_CREAT" },
+ { O_TRUNC, "O_TRUNC" },
+ { O_EXCL, "O_EXCL" },
+ { O_NOCTTY, "O_NOCTTY" },
+ { O_DSYNC, "O_DSYNC" },
+ { O_RSYNC, "O_RSYNC" },
+ { O_ALT_IO, "O_ALT_IO" },
+ { O_DIRECT, "O_DIRECT" },
+ { O_DIRECTORY, "O_DIRECTORY" },
+ { O_CLOEXEC, "O_CLOEXEC" },
+ { O_NOSIGPIPE, "O_NOSIGPIPE" },
+ { FAPPEND, "FAPPEND" },
+ { FASYNC, "FASYNC" },
+ { O_FSYNC, "O_FSYNC" },
+ { FNDELAY, "FNDELAY" },
+ { O_NDELAY, "O_NDELAY" },
+ { F_DUPFD, "F_DUPFD" },
+ { F_GETFD, "F_GETFD" },
+ { F_SETFD, "F_SETFD" },
+ { F_GETFL, "F_GETFL" },
+ { F_SETFL, "F_SETFL" },
+ { F_GETOWN, "F_GETOWN" },
+ { F_SETOWN, "F_SETOWN" },
+ { F_GETLK, "F_GETLK" },
+ { F_SETLK, "F_SETLK" },
+ { F_SETLKW, "F_SETLKW" },
+ { F_CLOSEM, "F_CLOSEM" },
+ { F_MAXFD, "F_MAXFD" },
+ { F_DUPFD_CLOEXEC, "F_DUPFD_CLOEXEC" },
+ { F_GETNOSIGPIPE, "F_GETNOSIGPIPE" },
+ { F_SETNOSIGPIPE, "F_SETNOSIGPIPE" },
+ { FD_CLOEXEC, "FD_CLOEXEC" },
+ { F_RDLCK, "F_RDLCK" },
+ { F_UNLCK, "F_UNLCK" },
+ { F_WRLCK, "F_WRLCK" },
+ { F_PARAM_MASK, "F_PARAM_MASK" },
+ { F_PARAM_MAX, "F_PARAM_MAX" },
+ { F_FSCTL, "F_FSCTL" },
+ { F_FSVOID, "F_FSVOID" },
+ { F_FSOUT, "F_FSOUT" },
+ { F_FSIN, "F_FSIN" },
+ { F_FSINOUT, "F_FSINOUT" },
+ { F_FSDIRMASK, "F_FSDIRMASK" },
+ { F_FSPRIV, "F_FSPRIV" },
+ { LOCK_SH, "LOCK_SH" },
+ { LOCK_EX, "LOCK_EX" },
+ { LOCK_NB, "LOCK_NB" },
+ { LOCK_UN, "LOCK_UN" },
+ { SEEK_SET, "SEEK_SET" },
+ { SEEK_CUR, "SEEK_CUR" },
+ { SEEK_END, "SEEK_END" },
+ { POSIX_FADV_NORMAL, "POSIX_FADV_NORMAL" },
+ { POSIX_FADV_RANDOM, "POSIX_FADV_RANDOM" },
+ { POSIX_FADV_SEQUENTIAL, "POSIX_FADV_SEQUENTIAL" },
+ { POSIX_FADV_WILLNEED, "POSIX_FADV_WILLNEED" },
+ { POSIX_FADV_DONTNEED, "POSIX_FADV_DONTNEED" },
+ { POSIX_FADV_NOREUSE, "POSIX_FADV_NOREUSE" },
+
+ /* access/seek/sync names, from /usr/include/sys/unistd.h */
+ { F_OK, "F_OK" },
+ { X_OK, "X_OK" },
+ { W_OK, "W_OK" },
+ { R_OK, "R_OK" },
+ { SEEK_SET, "SEEK_SET" }, /* SEEK_* repeat the fcntl.h section above */
+ { SEEK_CUR, "SEEK_CUR" },
+ { SEEK_END, "SEEK_END" },
+ { L_SET, "L_SET" },
+ { L_INCR, "L_INCR" },
+ { L_XTND, "L_XTND" },
+ { FDATASYNC, "FDATASYNC" },
+ { FFILESYNC, "FFILESYNC" },
+ { FDISKSYNC, "FDISKSYNC" },
+
+ /* MSG_* send/recv flags, from /usr/include/sys/socket.h */
+ { MSG_OOB, "MSG_OOB" },
+ { MSG_PEEK, "MSG_PEEK" },
+ { MSG_DONTROUTE, "MSG_DONTROUTE" },
+ { MSG_EOR, "MSG_EOR" },
+ { MSG_TRUNC, "MSG_TRUNC" },
+ { MSG_CTRUNC, "MSG_CTRUNC" },
+ { MSG_WAITALL, "MSG_WAITALL" },
+ { MSG_DONTWAIT, "MSG_DONTWAIT" },
+ { MSG_BCAST, "MSG_BCAST" },
+ { MSG_MCAST, "MSG_MCAST" },
+ { MSG_NOSIGNAL, "MSG_NOSIGNAL" },
+ { MSG_CMSG_CLOEXEC, "MSG_CMSG_CLOEXEC" },
+ { MSG_NBIO, "MSG_NBIO" },
+
+ /* FIO* ioctl requests, from /usr/include/sys/filio.h */
+ { FIOCLEX, "FIOCLEX" },
+ { FIONCLEX, "FIONCLEX" },
+ { FIONREAD, "FIONREAD" },
+ { FIONBIO, "FIONBIO" },
+ { FIOASYNC, "FIOASYNC" },
+ { FIOSETOWN, "FIOSETOWN" },
+ { FIOGETOWN, "FIOGETOWN" },
+ { FIOGETBMAP, "FIOGETBMAP" },
+ { FIONWRITE, "FIONWRITE" },
+ { FIONSPACE, "FIONSPACE" },
+ { FIBMAP, "FIBMAP" },
+
+ /* poll event bits, from /usr/include/sys/poll.h */
+ { POLLIN, "POLLIN" },
+ { POLLPRI, "POLLPRI" },
+ { POLLOUT, "POLLOUT" },
+ { POLLRDNORM, "POLLRDNORM" },
+ { POLLWRNORM, "POLLWRNORM" },
+ { POLLRDBAND, "POLLRDBAND" },
+ { POLLWRBAND, "POLLWRBAND" },
+ { POLLERR, "POLLERR" },
+ { POLLHUP, "POLLHUP" },
+ { POLLNVAL, "POLLNVAL" },
+
+ /* errno values, from /usr/include/sys/errno.h */
+ { EPERM, "EPERM" },
+ { ENOENT, "ENOENT" },
+ { ESRCH, "ESRCH" },
+ { EINTR, "EINTR" },
+ { EIO, "EIO" },
+ { ENXIO, "ENXIO" },
+ { E2BIG, "E2BIG" },
+ { ENOEXEC, "ENOEXEC" },
+ { EBADF, "EBADF" },
+ { ECHILD, "ECHILD" },
+ { EDEADLK, "EDEADLK" },
+ { ENOMEM, "ENOMEM" },
+ { EACCES, "EACCES" },
+ { EFAULT, "EFAULT" },
+ { ENOTBLK, "ENOTBLK" },
+ { EBUSY, "EBUSY" },
+ { EEXIST, "EEXIST" },
+ { EXDEV, "EXDEV" },
+ { ENODEV, "ENODEV" },
+ { ENOTDIR, "ENOTDIR" },
+ { EISDIR, "EISDIR" },
+ { EINVAL, "EINVAL" },
+ { ENFILE, "ENFILE" },
+ { EMFILE, "EMFILE" },
+ { ENOTTY, "ENOTTY" },
+ { ETXTBSY, "ETXTBSY" },
+ { EFBIG, "EFBIG" },
+ { ENOSPC, "ENOSPC" },
+ { ESPIPE, "ESPIPE" },
+ { EROFS, "EROFS" },
+ { EMLINK, "EMLINK" },
+ { EPIPE, "EPIPE" },
+ { EDOM, "EDOM" },
+ { ERANGE, "ERANGE" },
+ { EAGAIN, "EAGAIN" },
+ { EWOULDBLOCK, "EWOULDBLOCK" },
+ { EINPROGRESS, "EINPROGRESS" },
+ { EALREADY, "EALREADY" },
+ { ENOTSOCK, "ENOTSOCK" },
+ { EDESTADDRREQ, "EDESTADDRREQ" },
+ { EMSGSIZE, "EMSGSIZE" },
+ { EPROTOTYPE, "EPROTOTYPE" },
+ { ENOPROTOOPT, "ENOPROTOOPT" },
+ { EPROTONOSUPPORT, "EPROTONOSUPPORT" },
+ { ESOCKTNOSUPPORT, "ESOCKTNOSUPPORT" },
+ { EOPNOTSUPP, "EOPNOTSUPP" },
+ { EPFNOSUPPORT, "EPFNOSUPPORT" },
+ { EAFNOSUPPORT, "EAFNOSUPPORT" },
+ { EADDRINUSE, "EADDRINUSE" },
+ { EADDRNOTAVAIL, "EADDRNOTAVAIL" },
+ { ENETDOWN, "ENETDOWN" },
+ { ENETUNREACH, "ENETUNREACH" },
+ { ENETRESET, "ENETRESET" },
+ { ECONNABORTED, "ECONNABORTED" },
+ { ECONNRESET, "ECONNRESET" },
+ { ENOBUFS, "ENOBUFS" },
+ { EISCONN, "EISCONN" },
+ { ENOTCONN, "ENOTCONN" },
+ { ESHUTDOWN, "ESHUTDOWN" },
+ { ETOOMANYREFS, "ETOOMANYREFS" },
+ { ETIMEDOUT, "ETIMEDOUT" },
+ { ECONNREFUSED, "ECONNREFUSED" },
+ { ELOOP, "ELOOP" },
+ { ENAMETOOLONG, "ENAMETOOLONG" },
+ { EHOSTDOWN, "EHOSTDOWN" },
+ { EHOSTUNREACH, "EHOSTUNREACH" },
+ { ENOTEMPTY, "ENOTEMPTY" },
+ { EPROCLIM, "EPROCLIM" },
+ { EUSERS, "EUSERS" },
+ { EDQUOT, "EDQUOT" },
+ { ESTALE, "ESTALE" },
+ { EREMOTE, "EREMOTE" },
+ { EBADRPC, "EBADRPC" },
+ { ERPCMISMATCH, "ERPCMISMATCH" },
+ { EPROGUNAVAIL, "EPROGUNAVAIL" },
+ { EPROGMISMATCH, "EPROGMISMATCH" },
+ { EPROCUNAVAIL, "EPROCUNAVAIL" },
+ { ENOLCK, "ENOLCK" },
+ { ENOSYS, "ENOSYS" },
+ { EFTYPE, "EFTYPE" },
+ { EAUTH, "EAUTH" },
+ { ENEEDAUTH, "ENEEDAUTH" },
+ { EIDRM, "EIDRM" },
+ { ENOMSG, "ENOMSG" },
+ { EOVERFLOW, "EOVERFLOW" },
+ { EILSEQ, "EILSEQ" },
+ { ENOTSUP, "ENOTSUP" },
+ { ECANCELED, "ECANCELED" },
+ { EBADMSG, "EBADMSG" },
+ { ENODATA, "ENODATA" },
+ { ENOSR, "ENOSR" },
+ { ENOSTR, "ENOSTR" },
+ { ETIME, "ETIME" },
+ { ENOATTR, "ENOATTR" },
+ { EMULTIHOP, "EMULTIHOP" },
+ { ENOLINK, "ENOLINK" },
+ { EPROTO, "EPROTO" },
+ { ELAST, "ELAST" },
+
+ /* Sentinel marking the end of the table: a NULL name with value 0. */
+ { 0, NULL },
+};
+
+/*
+ * Hand back the NetBSD symbol table defined above; its final entry is
+ * the { 0, NULL } sentinel.
+ */
+struct int_symbol *platform_symbols(void)
+{
+	struct int_symbol *table = platform_symbols_table;
+
+	return table;
+}
+
+#endif /* __NetBSD__ */
+
diff --git a/test/packetdrill/symbols_openbsd.c b/test/packetdrill/symbols_openbsd.c
new file mode 100644
index 0000000..4cf1f30
--- /dev/null
+++ b/test/packetdrill/symbols_openbsd.c
@@ -0,0 +1,281 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Definitions of strace-style symbols for OpenBSD.
+ * Allows us to map from symbolic strings to integers for system call inputs.
+ */
+
+#if defined(__OpenBSD__)
+
+#include "symbols.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <netinet/in.h>
+#include <netinet/udp.h>
+#include <poll.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/unistd.h>
+
+#include "tcp.h"
+
+/*
+ * A table of platform-specific string->int mappings for OpenBSD.
+ * Each entry pairs an integer constant with its strace-style name; the
+ * sections are labelled with the system header the names are taken from,
+ * and the table ends with a { 0, NULL } sentinel.
+ */
+struct int_symbol platform_symbols_table[] = {
+
+	/* /usr/include/sys/socket.h */
+	{ SO_DEBUG, "SO_DEBUG" },
+	{ SO_ACCEPTCONN, "SO_ACCEPTCONN" },
+	{ SO_REUSEADDR, "SO_REUSEADDR" },
+	{ SO_KEEPALIVE, "SO_KEEPALIVE" },
+	{ SO_DONTROUTE, "SO_DONTROUTE" },
+	{ SO_BROADCAST, "SO_BROADCAST" },
+	{ SO_USELOOPBACK, "SO_USELOOPBACK" },
+	{ SO_LINGER, "SO_LINGER" },
+	{ SO_OOBINLINE, "SO_OOBINLINE" },
+	{ SO_REUSEPORT, "SO_REUSEPORT" },
+	{ SO_TIMESTAMP, "SO_TIMESTAMP" },
+	{ SO_BINDANY, "SO_BINDANY" },
+	{ SO_SNDBUF, "SO_SNDBUF" },
+	{ SO_RCVBUF, "SO_RCVBUF" },
+	{ SO_SNDLOWAT, "SO_SNDLOWAT" },
+	{ SO_RCVLOWAT, "SO_RCVLOWAT" },
+	{ SO_SNDTIMEO, "SO_SNDTIMEO" },
+	{ SO_RCVTIMEO, "SO_RCVTIMEO" },
+	{ SO_ERROR, "SO_ERROR" },
+	{ SO_TYPE, "SO_TYPE" },
+	{ SO_NETPROC, "SO_NETPROC" },
+	{ SO_RTABLE, "SO_RTABLE" },
+	{ SO_PEERCRED, "SO_PEERCRED" },
+	{ SO_SPLICE, "SO_SPLICE" },
+
+	/* /usr/include/netinet/tcp.h */
+	{ TCP_NODELAY, "TCP_NODELAY" },
+	{ TCP_MAXSEG, "TCP_MAXSEG" },
+	{ TCP_MD5SIG, "TCP_MD5SIG" },
+	{ TCP_SACK_ENABLE, "TCP_SACK_ENABLE" },
+
+	/* /usr/include/sys/fcntl.h */
+	{ O_RDONLY, "O_RDONLY" },
+	{ O_WRONLY, "O_WRONLY" },
+	{ O_RDWR, "O_RDWR" },
+	{ O_ACCMODE, "O_ACCMODE" },
+	{ FREAD, "FREAD" },
+	{ FWRITE, "FWRITE" },
+	{ O_NONBLOCK, "O_NONBLOCK" },
+	{ O_APPEND, "O_APPEND" },
+	{ O_SHLOCK, "O_SHLOCK" },
+	{ O_EXLOCK, "O_EXLOCK" },
+	{ O_ASYNC, "O_ASYNC" },
+	{ O_FSYNC, "O_FSYNC" },
+	{ O_NOFOLLOW, "O_NOFOLLOW" },
+	{ O_SYNC, "O_SYNC" },
+	{ O_CREAT, "O_CREAT" },
+	{ O_TRUNC, "O_TRUNC" },
+	{ O_EXCL, "O_EXCL" },
+	{ O_DSYNC, "O_DSYNC" },
+	{ O_RSYNC, "O_RSYNC" },
+	{ O_NOCTTY, "O_NOCTTY" },
+	{ O_CLOEXEC, "O_CLOEXEC" },
+	{ O_DIRECTORY, "O_DIRECTORY" },
+	{ FAPPEND, "FAPPEND" },
+	{ FASYNC, "FASYNC" },
+	{ FFSYNC, "FFSYNC" },
+	{ FNONBLOCK, "FNONBLOCK" },
+	{ FNDELAY, "FNDELAY" },
+	{ O_NDELAY, "O_NDELAY" },
+	{ F_DUPFD, "F_DUPFD" },
+	{ F_GETFD, "F_GETFD" },
+	{ F_SETFD, "F_SETFD" },
+	{ F_GETFL, "F_GETFL" },
+	{ F_SETFL, "F_SETFL" },
+	{ F_GETOWN, "F_GETOWN" },
+	{ F_SETOWN, "F_SETOWN" },
+	{ F_GETLK, "F_GETLK" },
+	{ F_SETLK, "F_SETLK" },
+	{ F_SETLKW, "F_SETLKW" },
+	{ F_DUPFD_CLOEXEC, "F_DUPFD_CLOEXEC" },
+	{ FD_CLOEXEC, "FD_CLOEXEC" },
+	{ F_RDLCK, "F_RDLCK" },
+	{ F_UNLCK, "F_UNLCK" },
+	{ F_WRLCK, "F_WRLCK" },
+	{ LOCK_SH, "LOCK_SH" },
+	{ LOCK_EX, "LOCK_EX" },
+	{ LOCK_NB, "LOCK_NB" },
+	{ LOCK_UN, "LOCK_UN" },
+	{ AT_FDCWD, "AT_FDCWD" },
+	{ AT_EACCESS, "AT_EACCESS" },
+	{ AT_SYMLINK_NOFOLLOW, "AT_SYMLINK_NOFOLLOW" },
+	{ AT_SYMLINK_FOLLOW, "AT_SYMLINK_FOLLOW" },
+	{ AT_REMOVEDIR, "AT_REMOVEDIR" },
+
+	/* /usr/include/sys/unistd.h */
+	{ F_OK, "F_OK" },
+	{ X_OK, "X_OK" },
+	{ W_OK, "W_OK" },
+	{ R_OK, "R_OK" },
+	{ SEEK_SET, "SEEK_SET" },
+	{ SEEK_CUR, "SEEK_CUR" },
+	{ SEEK_END, "SEEK_END" },
+
+	/* /usr/include/sys/socket.h */
+	{ MSG_OOB, "MSG_OOB" },
+	{ MSG_PEEK, "MSG_PEEK" },
+	{ MSG_DONTROUTE, "MSG_DONTROUTE" },
+	{ MSG_EOR, "MSG_EOR" },
+	{ MSG_TRUNC, "MSG_TRUNC" },
+	{ MSG_CTRUNC, "MSG_CTRUNC" },
+	{ MSG_WAITALL, "MSG_WAITALL" },
+	{ MSG_DONTWAIT, "MSG_DONTWAIT" },
+	{ MSG_BCAST, "MSG_BCAST" },
+	{ MSG_MCAST, "MSG_MCAST" },
+	{ MSG_NOSIGNAL, "MSG_NOSIGNAL" },
+
+	/* /usr/include/sys/filio.h */
+	{ FIOCLEX, "FIOCLEX" },
+	{ FIONCLEX, "FIONCLEX" },
+	{ FIONREAD, "FIONREAD" },
+	{ FIONBIO, "FIONBIO" },
+	{ FIOASYNC, "FIOASYNC" },
+	{ FIOSETOWN, "FIOSETOWN" },
+	{ FIOGETOWN, "FIOGETOWN" },
+
+	/* /usr/include/sys/poll.h */
+	{ POLLIN, "POLLIN" },
+	{ POLLPRI, "POLLPRI" },
+	{ POLLOUT, "POLLOUT" },
+	{ POLLERR, "POLLERR" },
+	{ POLLHUP, "POLLHUP" },
+	{ POLLNVAL, "POLLNVAL" },
+	{ POLLRDNORM, "POLLRDNORM" },
+	{ POLLNORM, "POLLNORM" },
+	{ POLLWRNORM, "POLLWRNORM" },
+	{ POLLRDBAND, "POLLRDBAND" },
+	{ POLLWRBAND, "POLLWRBAND" },
+
+	/* /usr/include/sys/errno.h */
+	/* EPERM leads the list, as in the Linux and NetBSD tables. */
+	{ EPERM, "EPERM" },
+	{ ENOENT, "ENOENT" },
+	{ ESRCH, "ESRCH" },
+	{ EINTR, "EINTR" },
+	{ EIO, "EIO" },
+	{ ENXIO, "ENXIO" },
+	{ E2BIG, "E2BIG" },
+	{ ENOEXEC, "ENOEXEC" },
+	{ EBADF, "EBADF" },
+	{ ECHILD, "ECHILD" },
+	{ EDEADLK, "EDEADLK" },
+	{ ENOMEM, "ENOMEM" },
+	{ EACCES, "EACCES" },
+	{ EFAULT, "EFAULT" },
+	{ ENOTBLK, "ENOTBLK" },
+	{ EBUSY, "EBUSY" },
+	{ EEXIST, "EEXIST" },
+	{ EXDEV, "EXDEV" },
+	{ ENODEV, "ENODEV" },
+	{ ENOTDIR, "ENOTDIR" },
+	{ EISDIR, "EISDIR" },
+	{ EINVAL, "EINVAL" },
+	{ ENFILE, "ENFILE" },
+	{ EMFILE, "EMFILE" },
+	{ ENOTTY, "ENOTTY" },
+	{ ETXTBSY, "ETXTBSY" },
+	{ EFBIG, "EFBIG" },
+	{ ENOSPC, "ENOSPC" },
+	{ ESPIPE, "ESPIPE" },
+	{ EROFS, "EROFS" },
+	{ EMLINK, "EMLINK" },
+	{ EPIPE, "EPIPE" },
+	{ EDOM, "EDOM" },
+	{ ERANGE, "ERANGE" },
+	{ EAGAIN, "EAGAIN" },
+	{ EWOULDBLOCK, "EWOULDBLOCK" },
+	{ EINPROGRESS, "EINPROGRESS" },
+	{ EALREADY, "EALREADY" },
+	{ ENOTSOCK, "ENOTSOCK" },
+	{ EDESTADDRREQ, "EDESTADDRREQ" },
+	{ EMSGSIZE, "EMSGSIZE" },
+	{ EPROTOTYPE, "EPROTOTYPE" },
+	{ ENOPROTOOPT, "ENOPROTOOPT" },
+	{ EPROTONOSUPPORT, "EPROTONOSUPPORT" },
+	{ ESOCKTNOSUPPORT, "ESOCKTNOSUPPORT" },
+	{ EOPNOTSUPP, "EOPNOTSUPP" },
+	{ EPFNOSUPPORT, "EPFNOSUPPORT" },
+	{ EAFNOSUPPORT, "EAFNOSUPPORT" },
+	{ EADDRINUSE, "EADDRINUSE" },
+	{ EADDRNOTAVAIL, "EADDRNOTAVAIL" },
+	{ ENETDOWN, "ENETDOWN" },
+	{ ENETUNREACH, "ENETUNREACH" },
+	{ ENETRESET, "ENETRESET" },
+	{ ECONNABORTED, "ECONNABORTED" },
+	{ ECONNRESET, "ECONNRESET" },
+	{ ENOBUFS, "ENOBUFS" },
+	{ EISCONN, "EISCONN" },
+	{ ENOTCONN, "ENOTCONN" },
+	{ ESHUTDOWN, "ESHUTDOWN" },
+	{ ETOOMANYREFS, "ETOOMANYREFS" },
+	{ ETIMEDOUT, "ETIMEDOUT" },
+	{ ECONNREFUSED, "ECONNREFUSED" },
+	{ ELOOP, "ELOOP" },
+	{ ENAMETOOLONG, "ENAMETOOLONG" },
+	{ EHOSTDOWN, "EHOSTDOWN" },
+	{ EHOSTUNREACH, "EHOSTUNREACH" },
+	{ ENOTEMPTY, "ENOTEMPTY" },
+	{ EPROCLIM, "EPROCLIM" },
+	{ EUSERS, "EUSERS" },
+	{ EDQUOT, "EDQUOT" },
+	{ ESTALE, "ESTALE" },
+	{ EREMOTE, "EREMOTE" },
+	{ EBADRPC, "EBADRPC" },
+	{ ERPCMISMATCH, "ERPCMISMATCH" },
+	{ EPROGUNAVAIL, "EPROGUNAVAIL" },
+	{ EPROGMISMATCH, "EPROGMISMATCH" },
+	{ EPROCUNAVAIL, "EPROCUNAVAIL" },
+	{ ENOLCK, "ENOLCK" },
+	{ ENOSYS, "ENOSYS" },
+	{ EFTYPE, "EFTYPE" },
+	{ EAUTH, "EAUTH" },
+	{ ENEEDAUTH, "ENEEDAUTH" },
+	{ EIPSEC, "EIPSEC" },
+	{ ENOATTR, "ENOATTR" },
+	{ EILSEQ, "EILSEQ" },
+	{ ENOMEDIUM, "ENOMEDIUM" },
+	{ EMEDIUMTYPE, "EMEDIUMTYPE" },
+	{ EOVERFLOW, "EOVERFLOW" },
+	{ ECANCELED, "ECANCELED" },
+	{ EIDRM, "EIDRM" },
+	{ ENOMSG, "ENOMSG" },
+	{ ENOTSUP, "ENOTSUP" },
+
+	/* Sentinel marking the end of the table. */
+	{ 0, NULL },
+};
+
+/*
+ * Hand back the OpenBSD symbol table defined above; its final entry is
+ * the { 0, NULL } sentinel.
+ */
+struct int_symbol *platform_symbols(void)
+{
+	struct int_symbol *table = platform_symbols_table;
+
+	return table;
+}
+
+#endif /* __OpenBSD__ */
diff --git a/test/packetdrill/system.c b/test/packetdrill/system.c
new file mode 100644
index 0000000..1b17abe
--- /dev/null
+++ b/test/packetdrill/system.c
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * A module to execute a system(3) shell command and check the result.
+ */
+
+#include "system.h"
+
+#include <errno.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+int safe_system(const char *command, char **error)
+{
+ int status = system(command); /* wait status of the shell, or -1 */
+ if (status == -1) { /* system(3) itself failed; errno says why */
+ asprintf(error, "%s", strerror(errno));
+ return STATUS_ERR;
+ }
+ /* Any fatal signal fails: WEXITSTATUS() is meaningless if signaled. */
+ if (WIFSIGNALED(status)) {
+ asprintf(error, "got signal %d (%s)",
+ WTERMSIG(status), strsignal(WTERMSIG(status)));
+ return STATUS_ERR;
+ }
+ if (WEXITSTATUS(status) != 0) { /* normal exit with a failure code */
+ asprintf(error, "non-zero status %d", WEXITSTATUS(status));
+ return STATUS_ERR;
+ }
+ return STATUS_OK;
+}
diff --git a/test/packetdrill/system.h b/test/packetdrill/system.h
new file mode 100644
index 0000000..1f7564d
--- /dev/null
+++ b/test/packetdrill/system.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Interface to execute a system(3) shell command and check the result.
+ */
+
+#ifndef __SYSTEM_H__
+#define __SYSTEM_H__
+
+#include "types.h"
+
+/* Execute the given command with system(3). On success, returns
+ * STATUS_OK. On error returns STATUS_ERR and fills in *error.
+ */
+extern int safe_system(const char *command, char **error);
+
+#endif /* __SYSTEM_H__ */
diff --git a/test/packetdrill/tcp.h b/test/packetdrill/tcp.h
new file mode 100644
index 0000000..5a03f35
--- /dev/null
+++ b/test/packetdrill/tcp.h
@@ -0,0 +1,339 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Our own TCP header declarations, so we have something that's
+ * portable and somewhat more readable than a typical system header
+ * file.
+ *
+ * We cannot include the kernel's linux/tcp.h because this tool tries
+ * to compile and work for basically any Linux/BSD kernel version. So
+ * we declare our own version of various TCP-related definitions here.
+ */
+
+#ifndef __TCP_HEADERS_H__
+#define __TCP_HEADERS_H__
+
+#include "types.h"
+
+#include <netinet/tcp.h>
+
+#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
+#define SOL_TCP IPPROTO_TCP
+#endif /* defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) */
+
+#ifdef linux
+
+/* Socket-level options used by Linux kernels under test but not in
+ * standard Linux header files.
+ */
+#define SO_REUSEPORT 15
+
+/* TCP socket options used by Linux kernels under test but not in
+ * standard Linux header files.
+ */
+#define TCP_COOKIE_TRANSACTIONS 15 /* TCP Cookie Transactions */
+#define TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts for thin streams */
+#define TCP_THIN_DUPACK 17 /* Fast retrans. after 1 dupack */
+#define TCP_USER_TIMEOUT 18 /* How long to retry losses */
+#define TCP_FASTOPEN 23 /* TCP Fast Open: data in SYN */
+#define TCP_TIMESTAMP 24
+#define TCP_NOTSENT_LOWAT 25 /* limit unsent bytes in write queue */
+#define TCP_CC_INFO 26 /* Get Congestion Control (optional) info */
+#define TCP_SAVE_SYN 27 /* Record SYN headers for new connections */
+#define TCP_SAVED_SYN 28 /* Get SYN headers recorded for connection */
+#define TCP_REPAIR_WINDOW 29 /* Get/set window parameters */
+#define TCP_FASTOPEN_CONNECT 30 /* Attempt FastOpen with connect */
+
+/* TODO: remove these when netinet/tcp.h has them */
+#ifndef TCPI_OPT_ECN_SEEN
+#define TCPI_OPT_ECN_SEEN 16 /* received at least one packet with ECT */
+#endif
+#ifndef TCPI_OPT_SYN_DATA
+#define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */
+#endif
+
+#endif /* linux */
+
+/* New TCP flags for sendto(2)/sendmsg(2). */
+#ifndef MSG_FASTOPEN
+#define MSG_FASTOPEN 0x20000000 /* TCP Fast Open: data in SYN */
+#endif
+
+#ifndef MSG_ZEROCOPY
+#define MSG_ZEROCOPY 0x4000000
+#endif
+
+/* TCP option numbers and lengths. */
+#define TCPOPT_EOL 0
+#define TCPOPT_NOP 1
+#define TCPOPT_MAXSEG 2
+#define TCPOLEN_MAXSEG 4
+#define TCPOPT_WINDOW 3
+#define TCPOLEN_WINDOW 3
+#define TCPOPT_SACK_PERMITTED 4
+#define TCPOLEN_SACK_PERMITTED 2
+#define TCPOPT_SACK 5
+#define TCPOPT_TIMESTAMP 8
+#define TCPOLEN_TIMESTAMP 10
+#define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */
+#define TCPOLEN_MD5SIG 18
+#define TCPOLEN_MD5_BASE 2
+#define TCPOPT_FASTOPEN 34
+#define TCPOPT_EXP 254 /* Experimental */
+
+#define TCP_MD5_DIGEST_LEN 16 /* bytes in RFC2385 TCP MD5 digest */
+
+/* A portable TCP header definition (Linux and *BSD use different names). */
+struct tcp { /* fixed part of the header only; TCP options are not members */
+ __be16 src_port; /* presumably network byte order, per the __be16 type name */
+ __be16 dst_port;
+ __be32 seq;
+ __be32 ack_seq;
+# if __BYTE_ORDER == __LITTLE_ENDIAN
+ __u16 res1:4, /* same ten bit-fields as the big-endian branch, */
+ doff:4, /* with the order inside each byte reversed */
+ fin:1,
+ syn:1,
+ rst:1,
+ psh:1,
+ ack:1,
+ urg:1,
+ ece:1,
+ cwr:1;
+# elif __BYTE_ORDER == __BIG_ENDIAN
+ __u16 doff:4,
+ res1:4,
+ cwr:1,
+ ece:1,
+ urg:1,
+ ack:1,
+ psh:1,
+ rst:1,
+ syn:1,
+ fin:1;
+# else
+# error "Adjust your defines"
+# endif
+ __be16 window;
+ __sum16 check;
+ __be16 urg_ptr;
+};
+
+#ifdef linux
+
+/* Data returned by the TCP_INFO socket option. */
+struct _tcp_info {
+ __u8 tcpi_state;
+ __u8 tcpi_ca_state;
+ __u8 tcpi_retransmits;
+ __u8 tcpi_probes;
+ __u8 tcpi_backoff;
+ __u8 tcpi_options;
+ __u8 tcpi_snd_wscale:4, tcpi_rcv_wscale:4;
+ __u8 tcpi_delivery_rate_app_limited:1;
+
+ __u32 tcpi_rto;
+ __u32 tcpi_ato;
+ __u32 tcpi_snd_mss;
+ __u32 tcpi_rcv_mss;
+
+ __u32 tcpi_unacked;
+ __u32 tcpi_sacked;
+ __u32 tcpi_lost;
+ __u32 tcpi_retrans;
+ __u32 tcpi_fackets;
+
+ /* Times. */
+ __u32 tcpi_last_data_sent;
+ __u32 tcpi_last_ack_sent; /* Not remembered, sorry. */
+ __u32 tcpi_last_data_recv;
+ __u32 tcpi_last_ack_recv;
+
+ /* Metrics. */
+ __u32 tcpi_pmtu;
+ __u32 tcpi_rcv_ssthresh;
+ __u32 tcpi_rtt;
+ __u32 tcpi_rttvar;
+ __u32 tcpi_snd_ssthresh;
+ __u32 tcpi_snd_cwnd;
+ __u32 tcpi_advmss;
+ __u32 tcpi_reordering;
+
+ __u32 tcpi_rcv_rtt;
+ __u32 tcpi_rcv_space;
+
+ __u32 tcpi_total_retrans;
+
+ __u64 tcpi_pacing_rate;
+ __u64 tcpi_max_pacing_rate;
+ __u64 tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */
+ __u64 tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */
+ __u32 tcpi_segs_out; /* RFC4898 tcpEStatsPerfSegsOut */
+ __u32 tcpi_segs_in; /* RFC4898 tcpEStatsPerfSegsIn */
+
+ __u32 tcpi_notsent_bytes;
+ __u32 tcpi_min_rtt;
+ __u32 tcpi_data_segs_in; /* RFC4898 tcpEStatsDataSegsIn */
+ __u32 tcpi_data_segs_out; /* RFC4898 tcpEStatsDataSegsOut */
+ __u64 tcpi_delivery_rate;
+
+ __u64 tcpi_busy_time; /* Time (usec) busy sending data */
+ __u64 tcpi_rwnd_limited; /* Time (usec) limited by receive window */
+ __u64 tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */
+};
+
+/* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */
+enum {
+ _TCP_NLA_PAD,
+ _TCP_NLA_BUSY, /* Time (usec) busy sending data */
+ _TCP_NLA_RWND_LIMITED, /* Time (usec) limited by receive window */
+ _TCP_NLA_SNDBUF_LIMITED,/* Time (usec) limited by send buffer */
+ _TCP_NLA_DATA_SEGS_OUT, /* Data pkts sent including retransmission */
+ _TCP_NLA_TOTAL_RETRANS, /* Data pkts retransmitted */
+ _TCP_NLA_PACING_RATE, /* Pacing rate in bytes per second */
+ _TCP_NLA_DELIVERY_RATE, /* Delivery rate in bytes per second */
+ _TCP_NLA_SND_CWND, /* Sending congestion window */
+ _TCP_NLA_REORDERING, /* Reordering metric */
+ _TCP_NLA_MIN_RTT, /* minimum RTT */
+ _TCP_NLA_RECUR_RETRANS, /* Recurring retransmits for the current pkt */
+ _TCP_NLA_DELIVERY_RATE_APP_LMT, /* delivery rate application limited ? */
+ _TCP_NLA_SNDQ_SIZE, /* Data pending in send queue */
+ _TCP_NLA_CA_STATE, /* ca_state of socket */
+};
+
+/* TCP ca_state */
+enum {
+ _TCP_CA_Open,
+ _TCP_CA_Disorder,
+ _TCP_CA_CWR,
+ _TCP_CA_Recovery,
+ _TCP_CA_Loss,
+};
+
+enum { /* NOTE(review): presumably mirrors the kernel's SK_MEMINFO_* indices - confirm */
+ _SK_MEMINFO_RMEM_ALLOC,
+ _SK_MEMINFO_RCVBUF,
+ _SK_MEMINFO_WMEM_ALLOC,
+ _SK_MEMINFO_SNDBUF,
+ _SK_MEMINFO_FWD_ALLOC,
+ _SK_MEMINFO_WMEM_QUEUED,
+ _SK_MEMINFO_OPTMEM,
+ _SK_MEMINFO_BACKLOG,
+ _SK_MEMINFO_DROPS,
+
+ _SK_MEMINFO_VARS, /* not a slot: evaluates to the number of entries above (9) */
+};
+
+/* INET_DIAG_VEGASINFO */
+
+struct _tcpvegas_info {
+ __u32 tcpv_enabled;
+ __u32 tcpv_rttcnt;
+ __u32 tcpv_rtt;
+ __u32 tcpv_minrtt;
+};
+
+/* INET_DIAG_DCTCPINFO */
+
+struct _tcp_dctcp_info {
+ __u16 dctcp_enabled;
+ __u16 dctcp_ce_state;
+ __u32 dctcp_alpha;
+ __u32 dctcp_ab_ecn;
+ __u32 dctcp_ab_tot;
+};
+
+/* INET_DIAG_BBRINFO */
+
+struct _tcp_bbr_info {
+ /* u64 bw: max-filtered BW (app throughput) estimate in Byte per sec: */
+ __u32 bbr_bw_lo; /* lower 32 bits of bw */
+ __u32 bbr_bw_hi; /* upper 32 bits of bw */
+ __u32 bbr_min_rtt; /* min-filtered RTT in uSec */
+ __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */
+ __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */
+};
+
+union _tcp_cc_info { /* overlay of the vegas, dctcp and bbr info layouts */
+ struct _tcpvegas_info vegas;
+ struct _tcp_dctcp_info dctcp;
+ struct _tcp_bbr_info bbr;
+};
+#endif /* linux */
+
+#if defined(__FreeBSD__)
+
+/* Data returned by the TCP_INFO socket option on FreeBSD. */
+struct _tcp_info {
+ u_int8_t tcpi_state;
+ u_int8_t __tcpi_ca_state;
+ u_int8_t __tcpi_retransmits;
+ u_int8_t __tcpi_probes;
+ u_int8_t __tcpi_backoff;
+ u_int8_t tcpi_options;
+ u_int8_t tcpi_snd_wscale:4,
+ tcpi_rcv_wscale:4;
+
+ u_int32_t tcpi_rto;
+ u_int32_t __tcpi_ato;
+ u_int32_t tcpi_snd_mss;
+ u_int32_t tcpi_rcv_mss;
+
+ u_int32_t __tcpi_unacked;
+ u_int32_t __tcpi_sacked;
+ u_int32_t __tcpi_lost;
+ u_int32_t __tcpi_retrans;
+ u_int32_t __tcpi_fackets;
+
+ u_int32_t __tcpi_last_data_sent;
+ u_int32_t __tcpi_last_ack_sent;
+ u_int32_t tcpi_last_data_recv;
+ u_int32_t __tcpi_last_ack_recv;
+
+ u_int32_t __tcpi_pmtu;
+ u_int32_t __tcpi_rcv_ssthresh;
+ u_int32_t tcpi_rtt;
+ u_int32_t tcpi_rttvar;
+ u_int32_t tcpi_snd_ssthresh;
+ u_int32_t tcpi_snd_cwnd;
+ u_int32_t __tcpi_advmss;
+ u_int32_t __tcpi_reordering;
+
+ u_int32_t __tcpi_rcv_rtt;
+ u_int32_t tcpi_rcv_space;
+
+ /* FreeBSD extensions to tcp_info. */
+ u_int32_t tcpi_snd_wnd;
+ u_int32_t tcpi_snd_bwnd;
+ u_int32_t tcpi_snd_nxt;
+ u_int32_t tcpi_rcv_nxt;
+ u_int32_t tcpi_toe_tid;
+ u_int32_t tcpi_snd_rexmitpack;
+ u_int32_t tcpi_rcv_ooopack;
+ u_int32_t tcpi_snd_zerowin;
+
+ /* Padding to grow without breaking ABI. */
+ u_int32_t __tcpi_pad[26]; /* Padding. */
+};
+
+#endif /* __FreeBSD__ */
+
+#endif /* __TCP_HEADERS_H__ */
diff --git a/test/packetdrill/tcp_options.c b/test/packetdrill/tcp_options.c
new file mode 100644
index 0000000..b7def9d
--- /dev/null
+++ b/test/packetdrill/tcp_options.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Implementation for reading and writing TCP options in their wire format.
+ */
+
+#include "tcp_options.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include "packet.h"
+
+struct tcp_options *tcp_options_new(void)
+{
+ return calloc(1, sizeof(struct tcp_options)); /* zero-filled, so length == 0; NULL if calloc fails */
+}
+
+struct tcp_option *tcp_option_new(u8 kind, u8 length)
+{
+ struct tcp_option *option = calloc(1, sizeof(struct tcp_option));
+ option->kind = kind; /* NOTE(review): option is not NULL-checked after calloc */
+ option->length = length; /* the data union is left zero-filled */
+ return option;
+}
+
+int tcp_options_append(struct tcp_options *options,
+ struct tcp_option *option)
+{
+ if (options->length + option->length > sizeof(options->data))
+ return STATUS_ERR; /* would not fit; 'option' is NOT freed on this path */
+ memcpy(options->data + options->length, option, option->length); /* kind, length, then data bytes */
+ options->length += option->length;
+ assert(options->length <= sizeof(options->data));
+ free(option); /* the list holds its own copy now */
+ return STATUS_OK;
+}
+
+int num_sack_blocks(u8 opt_len, int *num_blocks, char **error)
+{
+ const size_t block_bytes = sizeof(struct sack_block);
+ const int payload = opt_len - 2; /* bytes after the kind and length bytes */
+ if (payload <= 0) {
+ asprintf(error, "TCP SACK option too short");
+ return STATUS_ERR;
+ }
+ if (payload % block_bytes != 0) {
+ asprintf(error, "TCP SACK option not a multiple of SACK block size");
+ return STATUS_ERR;
+ }
+ *num_blocks = payload / block_bytes; /* whole blocks only, checked above */
+ return STATUS_OK;
+}
diff --git a/test/packetdrill/tcp_options.h b/test/packetdrill/tcp_options.h
new file mode 100644
index 0000000..4967c2d
--- /dev/null
+++ b/test/packetdrill/tcp_options.h
@@ -0,0 +1,129 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Interfaces for reading and writing TCP options in their wire format.
+ */
+
+#ifndef __TCP_OPTIONS_H__
+#define __TCP_OPTIONS_H__
+
+#include "types.h"
+
+#include "packet.h"
+
+#define MAX_TCP_OPTION_BYTES (MAX_TCP_HEADER_BYTES - (int)sizeof(struct tcp))
+
+/* TCP Fast Open puts the following magic number right after the kind
+ * and length bytes when it shares the TCP experimental option kind.
+ *
+ * For a description of experimental options, see:
+ * http://tools.ietf.org/html/draft-ietf-tcpm-experimental-options-00
+ *
+ * For a description of TFO, see:
+ * http://tools.ietf.org/html/draft-cheng-tcpm-fastopen-02
+ */
+#define TCPOPT_FASTOPEN_MAGIC 0xF989
+
+/* Experimental TFO option must have:
+ * 1-byte kind, 1-byte length, and 2-byte magic: */
+#define TCPOLEN_EXP_FASTOPEN_BASE 4 /* smallest legal TFO option size */
+
+/* RFC7413 TFO option must have: 1-byte kind, 1-byte length: */
+#define TCPOLEN_FASTOPEN_BASE 2 /* smallest legal TFO option size */
+
+/* The TFO option base prefix leaves this amount of space: */
+#define MAX_TCP_FAST_OPEN_COOKIE_BYTES \
+ (MAX_TCP_OPTION_BYTES - TCPOLEN_FASTOPEN_BASE)
+#define MAX_TCP_FAST_OPEN_EXP_COOKIE_BYTES \
+ (MAX_TCP_OPTION_BYTES - TCPOLEN_EXP_FASTOPEN_BASE)
+
+/* Represents a list of TCP options in their wire format. */
+struct tcp_options {
+ u8 data[MAX_TCP_OPTION_BYTES]; /* The options data, in wire format */
+ u8 length; /* Bytes of data[] in use; the next option is appended at this offset */
+};
+
+/* Specification of a TCP SACK block (RFC 2018) */
+struct sack_block { /* edges are read with ntohl() when printed from a packet */
+ u32 left; /* left edge: 1st sequence number in block */
+ u32 right; /* right edge: 1st sequence number just past block */
+};
+
+/* Represents a single TCP option in its wire format. Note that for
+ * EOL and NOP options the length and data field are not included in
+ * the on-the-wire data. For other options, the length field describes
+ * the number of bytes of the struct that go on the wire. */
+struct tcp_option {
+ u8 kind;
+ u8 length; /* bytes on the wire; includes kind and length byte */
+ union {
+ struct {
+ u16 bytes; /* in network order */
+ } mss;
+ struct {
+ u32 val; /* in network order */
+ u32 ecr; /* in network order */
+ } time_stamp;
+ struct {
+ u8 shift_count;
+ } window_scale;
+ struct {
+ /* actual number of blocks will be 1..4 */
+ struct sack_block block[4];
+ } sack;
+ struct {
+ u8 digest[TCP_MD5_DIGEST_LEN];
+ } md5; /* TCP MD5 Signature Option: RFC 2385 */
+ struct {
+ /* The fast open cookie should be 4-16 bytes
+ * of cookie, multiple of 2 bytes, but we
+ * allow for larger sizes, so we can test what
+ * stacks do with illegal options.
+ */
+ u8 cookie[MAX_TCP_FAST_OPEN_COOKIE_BYTES];
+ } fast_open;
+ struct {
+ u16 magic; /* must be TCPOPT_FASTOPEN_MAGIC */
+ u8 cookie[MAX_TCP_FAST_OPEN_EXP_COOKIE_BYTES];
+ } fast_open_exp;
+ } data;
+} __packed; /* type only: a declarator here would define a global in every includer */
+
+/* Allocate a new options list. */
+extern struct tcp_options *tcp_options_new(void);
+
+/* Allocate a new option and initialize its kind and length fields. */
+extern struct tcp_option *tcp_option_new(u8 kind, u8 length);
+
+/* Appends the given option to the given list of options and frees it.
+ * Returns STATUS_OK on success; returns STATUS_ERR, leaving the list and
+ * the option untouched, if the option does not fit in the list.
+ */
+extern int tcp_options_append(struct tcp_options *options,
+ struct tcp_option *option);
+
+/* Calculate the number of SACK blocks in a SACK option of the given
+ * length and store it in *num_blocks. Returns STATUS_OK on success;
+ * on failure returns STATUS_ERR and sets error message.
+ */
+extern int num_sack_blocks(u8 opt_len, int *num_blocks, char **error);
+
+#endif /* __TCP_OPTIONS_H__ */
diff --git a/test/packetdrill/tcp_options_iterator.c b/test/packetdrill/tcp_options_iterator.c
new file mode 100644
index 0000000..6123387
--- /dev/null
+++ b/test/packetdrill/tcp_options_iterator.c
@@ -0,0 +1,169 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Implementation for module to allow iteration over TCP options in
+ * wire format.
+ */
+
+#include "tcp_options_iterator.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include "packet.h"
+#include "tcp.h"
+#include "tcp_options.h"
+
+/* Look up the on-the-wire length (in bytes) expected for a TCP option
+ * of the given kind and store it in *expected_length; 0 is stored for
+ * variable-length options. Returns STATUS_OK on success; for an
+ * unknown kind returns STATUS_ERR and sets error message.
+ */
+static int get_expected_tcp_option_length(u8 kind, u8 *expected_length,
+ char **error)
+{
+ u8 wire_len = 0; /* 0 stands for "variable-length" */
+ switch (kind) {
+ case TCPOPT_EOL:
+ case TCPOPT_NOP:
+ wire_len = 1; /* no length byte or data */
+ break;
+
+ case TCPOPT_MAXSEG:
+ wire_len = TCPOLEN_MAXSEG;
+ break;
+
+ case TCPOPT_WINDOW:
+ wire_len = TCPOLEN_WINDOW;
+ break;
+
+ case TCPOPT_SACK_PERMITTED:
+ wire_len = TCPOLEN_SACK_PERMITTED;
+ break;
+
+ case TCPOPT_TIMESTAMP:
+ wire_len = TCPOLEN_TIMESTAMP;
+ break;
+
+ case TCPOPT_SACK:
+ case TCPOPT_MD5SIG:
+ case TCPOPT_FASTOPEN:
+ case TCPOPT_EXP:
+ break; /* variable-length option: wire_len stays 0 */
+ default:
+ asprintf(error, "unexpected TCP option kind: %u", kind);
+ return STATUS_ERR;
+ }
+ *expected_length = wire_len; /* only written for a known kind */
+ return STATUS_OK;
+}
+
+/* Read and validate the length byte of the TCP option at 'option',
+ * in a block of TCP options that ends at 'end'. If 'expected_length'
+ * is non-zero, also require the length to match it. The length in
+ * bytes is stored in *length. Returns STATUS_OK on success; on
+ * failure returns STATUS_ERR and sets error message.
+ */
+static int get_tcp_option_length(const u8 *option, const u8 *end,
+ u8 expected_length, u8 *length, char **error)
+{
+ /* The length byte sits right after the kind byte. */
+ const u8 *length_byte = option + 1;
+
+ if (length_byte >= end) {
+ asprintf(error, "TCP option length byte extends too far");
+ return STATUS_ERR;
+ }
+ *length = *length_byte;
+
+ /* The length counts the kind and length bytes themselves. */
+ if (*length < 2) {
+ asprintf(error, "TCP option with length byte is too short");
+ return STATUS_ERR;
+ }
+ if (option + *length > end) {
+ asprintf(error, "TCP option data extends too far");
+ return STATUS_ERR;
+ }
+ if (expected_length != 0 && *length != expected_length) {
+ asprintf(error, "bad TCP option length: was %u but expected %u",
+ *length, expected_length);
+ return STATUS_ERR;
+ }
+
+ return STATUS_OK;
+}
+
+static struct tcp_option *get_current_option(
+ struct tcp_options_iterator *iter)
+{
+ assert(iter->current_option <= iter->options_end);
+ if (iter->current_option >= iter->options_end)
+ iter->current_option = NULL; /* cursor reached the end: nothing left to return */
+ return (struct tcp_option *)iter->current_option; /* points into the options bytes, or NULL */
+}
+
+struct tcp_option *tcp_options_begin(
+ struct packet *packet,
+ struct tcp_options_iterator *iter)
+{
+ memset(iter, 0, sizeof(*iter));
+ iter->current_option = packet_tcp_options(packet);
+ iter->options_end = packet_payload(packet); /* used as the exclusive end of the options */
+ return get_current_option(iter); /* NULL when the options region is empty */
+}
+
+struct tcp_option *tcp_options_next(
+ struct tcp_options_iterator *iter, char **error)
+{
+ /* Ensure we haven't hit the end: callers must stop at the first NULL. */
+ assert(iter->current_option < iter->options_end);
+ assert(iter->current_option != NULL);
+
+ /* Find the length we expect for the option we are stepping over. */
+ u8 length = 0; /* length of this option in bytes */
+ u8 expected_length = 0; /* expected length for this kind */
+ struct tcp_option *option = (struct tcp_option *)iter->current_option;
+ if (get_expected_tcp_option_length(
+ option->kind, &expected_length, error))
+ goto out; /* unknown option kind */
+
+ /* Calculate and validate the actual length of the option. */
+ if (expected_length == 1) {
+ /* 1 byte length means no length byte, so real length is 1. */
+ length = 1;
+ } else {
+ /* Parse and validate length byte. */
+ if (get_tcp_option_length(iter->current_option,
+ iter->options_end,
+ expected_length, &length, error))
+ goto out; /* malformed length */
+ }
+
+ /* Advance to the next TCP option. */
+ assert(length > 0);
+ iter->current_option += length;
+ assert(iter->current_option <= iter->options_end);
+ return get_current_option(iter); /* NULL here means a clean end of the options */
+
+out:
+ return NULL; /* failure: the helper that failed has set *error */
+
+}
diff --git a/test/packetdrill/tcp_options_iterator.h b/test/packetdrill/tcp_options_iterator.h
new file mode 100644
index 0000000..21f6c86
--- /dev/null
+++ b/test/packetdrill/tcp_options_iterator.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Interface for a module to allow iteration over TCP options in wire format.
+ */
+
+#ifndef __TCP_OPTIONS_ITERATOR_H__
+#define __TCP_OPTIONS_ITERATOR_H__
+
+#include "types.h"
+
+#include "packet.h"
+#include "tcp_options.h"
+
+/* Internal state for an iterator for TCP options in wire format. */
+struct tcp_options_iterator {
+ u8 *current_option; /* option the cursor is on; NULL once iteration is exhausted */
+ u8 *options_end; /* exclusive end: first byte past the options */
+};
+
+/* Initialize the iterator to iterate over the TCP options in the
+ * given packet. Return a pointer to the first option in the packet,
+ * or NULL if there are none.
+ */
+extern struct tcp_option *tcp_options_begin(
+ struct packet *packet,
+ struct tcp_options_iterator *iter);
+
+/* Return a pointer to the next option in the packet, or NULL if there
+ * are no more. On failure returns NULL and sets error message.
+ */
+extern struct tcp_option *tcp_options_next(
+ struct tcp_options_iterator *iter, char **error);
+
+#endif /* __TCP_OPTIONS_ITERATOR_H__ */
diff --git a/test/packetdrill/tcp_options_to_string.c b/test/packetdrill/tcp_options_to_string.c
new file mode 100644
index 0000000..09a5230
--- /dev/null
+++ b/test/packetdrill/tcp_options_to_string.c
@@ -0,0 +1,167 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Implementation for generating human-readable representations of TCP options.
+ */
+
+#include "tcp_options_to_string.h"
+
+#include "tcp_options_iterator.h"
+
+/* Print "md5" plus the digest bytes in hex and return STATUS_OK; return
+ * STATUS_ERR, printing nothing, unless TCPOLEN_MD5_BASE <= length <= TCPOLEN_MD5SIG.
+ */
+static int tcp_md5_option_to_string(FILE *s, struct tcp_option *option)
+{
+ const int wire_len = option->length;
+ const int digest_bytes = wire_len - TCPOLEN_MD5_BASE;
+ int pos;
+
+ /* Too short for the option header, or longer than a full digest. */
+ if (digest_bytes < 0 || wire_len > TCPOLEN_MD5SIG)
+ return STATUS_ERR;
+
+ fputs(digest_bytes > 0 ? "md5 " : "md5", s);
+ for (pos = 0; pos < digest_bytes; ++pos)
+ fprintf(s, "%02x", option->data.md5.digest[pos]);
+
+ return STATUS_OK;
+}
+
+/* Print a TFO option: "FO" (exp == false) or "FOEXP" (exp == true), then
+ * the cookie bytes in hex. Returns STATUS_ERR without printing when exp is
+ * set and the option is too short or lacks the TFO magic; else STATUS_OK.
+ */
+static int tcp_fast_open_option_to_string(FILE *s, struct tcp_option *option,
+ bool exp)
+{
+ if (exp && ((option->length < TCPOLEN_EXP_FASTOPEN_BASE) ||
+ (ntohs(option->data.fast_open_exp.magic) != TCPOPT_FASTOPEN_MAGIC)))
+ return STATUS_ERR; /* experimental option, but not a TFO one */
+
+ fprintf(s, exp ? "FOEXP" : "FO");
+ int cookie_bytes = option->length - (exp ? TCPOLEN_EXP_FASTOPEN_BASE :
+ TCPOLEN_FASTOPEN_BASE);
+ assert(cookie_bytes >= 0); /* for exp == false this relies on length >= 2 from the caller */
+ assert(cookie_bytes <= (exp ? MAX_TCP_FAST_OPEN_EXP_COOKIE_BYTES :
+ MAX_TCP_FAST_OPEN_COOKIE_BYTES));
+ if (cookie_bytes > 0)
+ fprintf(s, " ");
+ int i;
+ for (i = 0; i < cookie_bytes; ++i)
+ fprintf(s, "%02x", exp ? option->data.fast_open_exp.cookie[i] :
+ option->data.fast_open.cookie[i]);
+ return STATUS_OK;
+}
+
+int tcp_options_to_string(struct packet *packet,
+ char **ascii_string, char **error)
+{
+ int result = STATUS_ERR; /* return value */
+ size_t size = 0;
+ FILE *s = open_memstream(ascii_string, &size); /* output string; NOTE(review): not NULL-checked */
+
+ int index = 0; /* number of options seen so far */
+
+ struct tcp_options_iterator iter;
+ struct tcp_option *option = NULL;
+ for (option = tcp_options_begin(packet, &iter);
+ option != NULL; option = tcp_options_next(&iter, error)) {
+ if (index > 0)
+ fputc(',', s); /* comma-separate the options */
+
+ switch (option->kind) {
+ case TCPOPT_EOL:
+ fputs("eol", s);
+ break;
+
+ case TCPOPT_NOP:
+ fputs("nop", s);
+ break;
+
+ case TCPOPT_MAXSEG:
+ fprintf(s, "mss %u", ntohs(option->data.mss.bytes));
+ break;
+
+ case TCPOPT_WINDOW:
+ fprintf(s, "wscale %u",
+ option->data.window_scale.shift_count);
+ break;
+
+ case TCPOPT_SACK_PERMITTED:
+ fputs("sackOK", s);
+ break;
+
+ case TCPOPT_SACK:
+ fprintf(s, "sack ");
+ int num_blocks = 0;
+ if (num_sack_blocks(option->length,
+ &num_blocks, error))
+ goto out;
+ int i = 0;
+ for (i = 0; i < num_blocks; ++i) {
+ if (i > 0)
+ fputc(' ', s);
+ fprintf(s, "%u:%u",
+ ntohl(option->data.sack.block[i].left),
+ ntohl(option->data.sack.block[i].right));
+ }
+ break;
+
+ case TCPOPT_TIMESTAMP:
+ fprintf(s, "TS val %u ecr %u",
+ ntohl(option->data.time_stamp.val),
+ ntohl(option->data.time_stamp.ecr));
+ break;
+
+ case TCPOPT_MD5SIG:
+ tcp_md5_option_to_string(s, option); /* NOTE(review): a STATUS_ERR result is ignored here */
+ break;
+
+ case TCPOPT_FASTOPEN:
+ tcp_fast_open_option_to_string(s, option, false); /* cannot fail when exp is false */
+ break;
+
+ case TCPOPT_EXP:
+ if (tcp_fast_open_option_to_string(s, option, true)) {
+ asprintf(error,
+ "unknown experimental option");
+ goto out;
+ }
+ break;
+
+ default:
+ asprintf(error, "unexpected TCP option kind: %u",
+ option->kind);
+ goto out;
+ }
+ ++index;
+ }
+ if (*error != NULL) /* bogus TCP options prevented iteration; NOTE(review): needs *error == NULL on entry */
+ goto out;
+
+ result = STATUS_OK;
+
+out:
+ fclose(s); /* reached on success and failure alike; closes the memstream behind *ascii_string */
+ return result;
+
+}
diff --git a/test/packetdrill/tcp_options_to_string.h b/test/packetdrill/tcp_options_to_string.h
new file mode 100644
index 0000000..34f1105
--- /dev/null
+++ b/test/packetdrill/tcp_options_to_string.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Interface for generating human-readable representations of TCP options.
+ */
+
+#ifndef __TCP_OPTIONS_TO_STRING_H__
+#define __TCP_OPTIONS_TO_STRING_H__
+
+#include "types.h"
+
+#include "packet.h"
+#include "tcp_options.h"
+
+/* Returns in *ascii_string a human-readable representation of the TCP
+ * options for 'packet' (heap-allocated; the caller must free() it). Returns
+ * STATUS_OK on success; on failure returns STATUS_ERR and sets *error.
+ */
+extern int tcp_options_to_string(struct packet *packet,
+ char **ascii_string, char **error);
+
+#endif /* __TCP_OPTIONS_TO_STRING_H__ */
diff --git a/test/packetdrill/tcp_packet.c b/test/packetdrill/tcp_packet.c
new file mode 100644
index 0000000..aa667b6
--- /dev/null
+++ b/test/packetdrill/tcp_packet.c
@@ -0,0 +1,166 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Implementation for module for formatting TCP packets.
+ */
+
+#include "tcp_packet.h"
+
+#include "ip_packet.h"
+#include "tcp.h"
+
+/* The full list of valid TCP bit flag characters */
+static const char valid_tcp_flags[] = "FSRP.EWCU";
+
+/* Check that every character of 'flags' is one of valid_tcp_flags.
+ * On the first invalid character, fill in *error and return false.
+ */
+static bool is_tcp_flags_spec_valid(const char *flags, char **error)
+{
+	const size_t num_valid = strspn(flags, valid_tcp_flags);
+
+	if (flags[num_valid] != '\0') {
+		asprintf(error, "Invalid TCP flag: '%c'", flags[num_valid]);
+		return false;
+	}
+	return true;
+}
+
+/* Return 1 if 'flag' occurs in the tcpdump-style 'flags' string, else 0 */
+static inline int is_tcp_flag_set(char flag, const char *flags)
+{
+	return strchr(flags, flag) != NULL;
+}
+
+struct packet *new_tcp_packet(int address_family,
+			      enum direction_t direction,
+			      struct ip_info ip_info,
+			      u16 src_port,
+			      u16 dst_port,
+			      const char *flags,
+			      u32 start_sequence,
+			      u16 tcp_payload_bytes,
+			      u32 ack_sequence,
+			      s32 window,
+			      u16 urg_ptr,
+			      const struct tcp_options *tcp_options,
+			      char **error)
+{
+	struct packet *packet = NULL;	/* the newly-allocated result packet */
+	struct header *tcp_header = NULL;	/* the TCP header info */
+	/* Calculate lengths in bytes of all sections of the packet */
+	const int ip_option_bytes = 0;
+	const int tcp_option_bytes = tcp_options ? tcp_options->length : 0;
+	const int ip_header_bytes = (ip_header_min_len(address_family) +
+				     ip_option_bytes);
+	const int tcp_header_bytes = sizeof(struct tcp) + tcp_option_bytes;
+	const int ip_bytes =
+		ip_header_bytes + tcp_header_bytes + tcp_payload_bytes;
+
+	/* Sanity-check all the various lengths */
+	if (ip_option_bytes & 0x3) {
+		asprintf(error, "IP options are not padded correctly "
+			 "to ensure IP header is a multiple of 4 bytes: "
+			 "%d excess bytes", ip_option_bytes & 0x3);
+		return NULL;
+	}
+	if (tcp_option_bytes & 0x3) {
+		asprintf(error,
+			 "TCP options are not padded correctly "
+			 "to ensure TCP header is a multiple of 4 bytes: "
+			 "%d excess bytes", tcp_option_bytes & 0x3);
+		return NULL;
+	}
+	assert((tcp_header_bytes & 0x3) == 0);
+	assert((ip_header_bytes & 0x3) == 0);
+
+	if (tcp_header_bytes > MAX_TCP_HEADER_BYTES) {
+		asprintf(error, "TCP header too large");
+		return NULL;
+	}
+
+	if (ip_bytes > MAX_TCP_DATAGRAM_BYTES) {
+		asprintf(error, "TCP segment too large");
+		return NULL;
+	}
+
+	if (!is_tcp_flags_spec_valid(flags, error))
+		return NULL;
+
+	/* Inbound packets must specify a window. Check this before allocating
+	 * the packet so that the error return below cannot leak it. */
+	if (window == -1 && direction == DIRECTION_INBOUND) {
+		asprintf(error, "window must be specified"
+			 " for inbound packets");
+		return NULL;
+	}
+
+	/* Allocate and zero out a packet object of the desired size */
+	packet = packet_new(ip_bytes);
+	memset(packet->buffer, 0, ip_bytes);
+
+	packet->direction = direction;
+	packet->flags = 0;
+	packet->tos_chk = ip_info.tos.check;
+
+	/* Set IP header fields */
+	set_packet_ip_header(packet, address_family, ip_bytes,
+			     ip_info.tos.value, ip_info.flow_label,
+			     ip_info.ttl, IPPROTO_TCP);
+
+	tcp_header = packet_append_header(packet, HEADER_TCP, tcp_header_bytes);
+	tcp_header->total_bytes = tcp_header_bytes + tcp_payload_bytes;
+
+	/* Find the start of TCP sections of the packet */
+	packet->tcp = (struct tcp *) (ip_start(packet) + ip_header_bytes);
+	u8 *tcp_option_start = (u8 *) (packet->tcp + 1);
+
+	/* Set TCP header fields */
+	packet->tcp->src_port = htons(src_port);
+	packet->tcp->dst_port = htons(dst_port);
+	packet->tcp->seq = htonl(start_sequence);
+	packet->tcp->ack_seq = htonl(ack_sequence);
+	packet->tcp->doff = tcp_header_bytes / 4;
+	packet->tcp->window = (window == -1) ? 0 : htons(window);
+	if (window == -1)	/* no window was specified */
+		packet->flags |= FLAG_WIN_NOCHECK;
+	packet->tcp->check = 0;
+	packet->tcp->urg_ptr = htons(urg_ptr);
+	packet->tcp->fin = is_tcp_flag_set('F', flags);
+	packet->tcp->syn = is_tcp_flag_set('S', flags);
+	packet->tcp->rst = is_tcp_flag_set('R', flags);
+	packet->tcp->psh = is_tcp_flag_set('P', flags);
+	packet->tcp->ack = is_tcp_flag_set('.', flags);
+	packet->tcp->urg = is_tcp_flag_set('U', flags);
+	packet->tcp->ece = is_tcp_flag_set('E', flags);
+	packet->tcp->cwr = is_tcp_flag_set('W', flags);
+
+	if (tcp_options == NULL) {
+		packet->flags |= FLAG_OPTIONS_NOCHECK;
+	} else if (tcp_options->length > 0) {
+		/* Copy TCP options into packet */
+		memcpy(tcp_option_start, tcp_options->data,
+		       tcp_options->length);
+	}
+
+	packet->ip_bytes = ip_bytes;
+	return packet;
+}
diff --git a/test/packetdrill/tcp_packet.h b/test/packetdrill/tcp_packet.h
new file mode 100644
index 0000000..6e2a6b0
--- /dev/null
+++ b/test/packetdrill/tcp_packet.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Interface for module for formatting TCP packets.
+ */
+
+#ifndef __TCP_PACKET_H__
+#define __TCP_PACKET_H__
+
+#include "types.h"
+
+#include "packet.h"
+#include "tcp_options.h"
+
+/* Create and initialize a new struct packet containing a TCP segment.
+ * The 'flags' are a tcpdump-style sequence of TCP header flags.
+ * On success, returns a newly-allocated packet. On failure, returns NULL
+ * and fills in *error with an error message.
+ */
+extern struct packet *new_tcp_packet(int address_family,
+ enum direction_t direction,
+ struct ip_info ip_info,
+ u16 src_port,
+ u16 dst_port,
+ const char *flags,
+ u32 start_sequence,
+ u16 tcp_payload_bytes,
+ u32 ack_sequence,
+ s32 window,
+ u16 urg_ptr,
+ const struct tcp_options *tcp_options,
+ char **error);
+#endif /* __TCP_PACKET_H__ */
diff --git a/test/packetdrill/tests/bsd/fast_retransmit/fr-4pkt-sack-bsd.pkt b/test/packetdrill/tests/bsd/fast_retransmit/fr-4pkt-sack-bsd.pkt
new file mode 100644
index 0000000..1980739
--- /dev/null
+++ b/test/packetdrill/tests/bsd/fast_retransmit/fr-4pkt-sack-bsd.pkt
@@ -0,0 +1,38 @@
+// Test fast retransmit with 4 packets outstanding, receiver sending SACKs.
+// In this variant the receiver supports SACK.
+
+// Establish a connection.
+0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+
++0 bind(3, ..., ...) = 0
++0 listen(3, 1) = 0
+
++0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
++0 > S. 0:0(0) ack 1 <...>
+
++.1 < . 1:1(0) ack 1 win 257
++0 accept(3, ..., ...) = 4
+
+// Send 1 data segment and get an ACK, so cwnd is now 4.
++0 write(4, ..., 1000) = 1000
++0 > P. 1:1001(1000) ack 1
+
++.1 < . 1:1(0) ack 1001 win 257
+
+// Write 4 data segments.
++0 write(4, ..., 4000) = 4000
++0 > . 1001:2001(1000) ack 1
++0 > . 2001:3001(1000) ack 1
++0 > . 3001:4001(1000) ack 1
++0 > P. 4001:5001(1000) ack 1
+
+// Get 3 SACKs.
++.1 < . 1:1(0) ack 1001 win 257 <sack 2001:3001,nop,nop>
++0 < . 1:1(0) ack 1001 win 257 <sack 2001:4001,nop,nop>
++0 < . 1:1(0) ack 1001 win 257 <sack 2001:5001,nop,nop>
+// We've received 3 duplicate ACKs, so we do a fast retransmit.
++0 > . 1001:2001(1000) ack 1
+
+// Receiver ACKs all data.
++.1 < . 1:1(0) ack 6001 win 257
diff --git a/test/packetdrill/tests/linux/README b/test/packetdrill/tests/linux/README
new file mode 100644
index 0000000..0e6db19
--- /dev/null
+++ b/test/packetdrill/tests/linux/README
@@ -0,0 +1,7 @@
+Packetdrill tests for Linux.
+
+This directory contains Packetdrill tests for Linux. The tests all pass under
+kernel 3.11.0-rc4 (installed on an Ubuntu 13.04 machine). However, due to TCP
+metrics caching in recent kernels, a second run of all tests can result in
+failures. The script run_tests.sh in this directory uses the iproute tool to
+flush the TCP metrics cache before each test.
diff --git a/test/packetdrill/tests/linux/blocking/blocking-accept.pkt b/test/packetdrill/tests/linux/blocking/blocking-accept.pkt
new file mode 100644
index 0000000..02c7cd8
--- /dev/null
+++ b/test/packetdrill/tests/linux/blocking/blocking-accept.pkt
@@ -0,0 +1,15 @@
+// Test for blocking accept.
+
+// Establish a connection.
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+0.000...0.200 accept(3, ..., ...) = 4
+
+0.100 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7>
+0.100 > S. 0:0(0) ack 1 <mss 1460,nop,wscale 6>
+0.200 < . 1:1(0) ack 1 win 257
+
+0.300 write(4, ..., 2000) = 2000
+0.300 > P. 1:2001(2000) ack 1
diff --git a/test/packetdrill/tests/linux/blocking/blocking-read.pkt b/test/packetdrill/tests/linux/blocking/blocking-read.pkt
new file mode 100644
index 0000000..1c734c1
--- /dev/null
+++ b/test/packetdrill/tests/linux/blocking/blocking-read.pkt
@@ -0,0 +1,25 @@
+// Test for blocking read.
+
+// Establish a connection.
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+0.100 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7>
+0.100 > S. 0:0(0) ack 1 <mss 1460,nop,wscale 6>
+0.200 < . 1:1(0) ack 1 win 257
+0.200 accept(3, ..., ...) = 4
+
+0.200...0.300 read(4, ..., 2000) = 2000
+0.300 < P. 1:2001(2000) ack 1 win 257
+0.300 > . 1:1(0) ack 2001
+
+0.400...0.500 read(4, ..., 2000) = 2000
+0.500 < P. 2001:4001(2000) ack 1 win 257
+0.500 > . 1:1(0) ack 4001
+
+0.600 < P. 4001:6001(2000) ack 1 win 257
+0.600 > . 1:1(0) ack 6001
+0.600...0.600 read(4, ..., 1000) = 1000
+0.600...0.600 read(4, ..., 1000) = 1000
diff --git a/test/packetdrill/tests/linux/close/close-read-data-fin.pkt b/test/packetdrill/tests/linux/close/close-read-data-fin.pkt
new file mode 100644
index 0000000..bad95c2
--- /dev/null
+++ b/test/packetdrill/tests/linux/close/close-read-data-fin.pkt
@@ -0,0 +1,38 @@
+// If we close the connection after read()'ing what
+// the other side sent, a FIN will be generated. This
+// behavior conforms to RFC 793.
+
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+0.100 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
+0.100 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
+0.200 < . 1:1(0) ack 1 win 257
+0.200 accept(3, ..., ...) = 4
+
+// Receive first segment
+0.210 < P. 1:1001(1000) ack 1 win 46
+
+// Send one ACK
+0.210 > . 1:1(0) ack 1001
+
+// Application writes 1000 bytes
+0.250 write(4, ..., 1000) = 1000
+0.250 > P. 1:1001(1000) ack 1001
+
+// ACK
+0.300 < . 1001:1001(0) ack 1001 win 257
+
+0.400 read(4, ..., 1000) = 1000
+
+// Client closes the connection
+0.610 < F. 1001:1001(0) ack 1001 win 260
+
+// Respond with (delayed) ACK
+0.650 > . 1001:1001(0) ack 1002
+
+// We close the connection
+0.700 close(4) = 0
+0.701 > F. 1001:1001(0) ack 1002
diff --git a/test/packetdrill/tests/linux/close/close-so-linger-onoff-1-linger-0-rst.pkt b/test/packetdrill/tests/linux/close/close-so-linger-onoff-1-linger-0-rst.pkt
new file mode 100644
index 0000000..dcec1cf
--- /dev/null
+++ b/test/packetdrill/tests/linux/close/close-so-linger-onoff-1-linger-0-rst.pkt
@@ -0,0 +1,28 @@
+// Verify that when a process uses SO_LINGER with {onoff=1, linger=0},
+// and then closes the socket, the kernel sends a RST.
+// (TODO(ncardwell): it also frees the socket immediately without any
+// time in TIME_WAIT; we should test this too once we have some
+// infrastructure for testing this kind of thing reliably...)
+
+// Initialize a server socket.
+0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
++0 bind(3, ..., ...) = 0
++0 listen(3, 1) = 0
+
++0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
++0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
++0 < . 1:1(0) ack 1 win 257
+
++0 accept(3, ..., ...) = 4
+
++0 setsockopt(4, SOL_SOCKET, SO_LINGER, {onoff=1, linger=0}, 8) = 0
+
+// Write some data, receive an ACK.
++0 write(4, ..., 1000) = 1000
++0 > P. 1:1001(1000) ack 1
++0 < . 1:1(0) ack 1001 win 257
+
+// Clean up.
++0 close(4) = 0
++0 > R. 1001:1001(0) ack 1
diff --git a/test/packetdrill/tests/linux/close/close-unread-data-rst.pkt b/test/packetdrill/tests/linux/close/close-unread-data-rst.pkt
new file mode 100644
index 0000000..d30808b
--- /dev/null
+++ b/test/packetdrill/tests/linux/close/close-unread-data-rst.pkt
@@ -0,0 +1,38 @@
+// If we close the connection before read()'ing what
+// the other side sent, a RST will be generated instead
+// of a FIN.
+
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+0.100 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
+0.100 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
+0.200 < . 1:1(0) ack 1 win 257
+0.200 accept(3, ..., ...) = 4
+
+// Receive first segment.
+0.210 < P. 1:1001(1000) ack 1 win 46
+
+// Send one ACK.
+0.210 > . 1:1(0) ack 1001
+
+// Application writes 1000 bytes.
+0.250 write(4, ..., 1000) = 1000
+0.250 > P. 1:1001(1000) ack 1001
+
+// ACK
+0.300 < . 1001:1001(0) ack 1001 win 257
+
+// Client closes the connection.
+0.610 < F. 1001:1001(0) ack 1001 win 260
+
+// Respond with (delayed) ACK.
+0.650 > . 1001:1001(0) ack 1002
+
+// We close the connection.
+0.700 close(4) = 0
+// Since we have not read, we generate a RST instead of a FIN
+// conforming to RFC 1122 section 4.2.2.13.
+0.701 > R. 1001:1001(0) ack 1002
diff --git a/test/packetdrill/tests/linux/connect/http-get-nonblocking-ts.pkt b/test/packetdrill/tests/linux/connect/http-get-nonblocking-ts.pkt
new file mode 100644
index 0000000..f998df4
--- /dev/null
+++ b/test/packetdrill/tests/linux/connect/http-get-nonblocking-ts.pkt
@@ -0,0 +1,34 @@
+// A simple client-side HTTP-style test that does a connect, sends a
+// short request, and receives a short response.
+
+// Create a socket and set it to non-blocking.
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 fcntl(3, F_GETFL) = 0x2 (flags O_RDWR)
+0.000 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+
+// Establish connection and verify that there was no error.
+0.100 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress)
+0.100 > S 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 6>
+0.200 < S. 0:0(0) ack 1 win 5792 <mss 1460,sackOK,TS val 700 ecr 100,nop,wscale 7>
+0.200 > . 1:1(0) ack 1 <nop,nop,TS val 200 ecr 700>
+0.200 getsockopt(3, SOL_SOCKET, SO_ERROR, [0], [4]) = 0
+0.200 fcntl(3, F_SETFL, O_RDWR) = 0 // set back to blocking
+
+// Send the HTTP request.
+0.200 write(3, ..., 57) = 57
+0.200 > P. 1:58(57) ack 1 <nop,nop,TS val 200 ecr 700>
+0.300 < . 1:1(0) ack 58 win 92 <nop,nop,TS val 800 ecr 200>
+
+// Receive the HTTP response and the server's FIN.
+0.300 < P. 1:786(785) ack 58 win 92 <nop,nop,TS val 800 ecr 200>
+0.300 > . 58:58(0) ack 786 <nop,nop,TS val 300 ecr 800>
+0.300 < F. 786:786(0) ack 58 win 92 <nop,nop,TS val 800 ecr 200>
+0.300 read(3, ..., 1024) = 785
+0.300 read(3, ..., 1024) = 0
+// Delayed ACK.
+0.340 > . 58:58(0) ack 787 <nop,nop,TS val 300 ecr 800>
+
+// Close the connection.
+0.350 close(3) = 0
+0.350 > F. 58:58(0) ack 787 <nop,nop,TS val 300 ecr 800>
+0.450 < . 787:787(0) ack 59 win 92 <nop,nop,TS val 900 ecr 300>
diff --git a/test/packetdrill/tests/linux/early_retransmit/er-delayed-2pkt-sack.pkt b/test/packetdrill/tests/linux/early_retransmit/er-delayed-2pkt-sack.pkt
new file mode 100644
index 0000000..72afec0
--- /dev/null
+++ b/test/packetdrill/tests/linux/early_retransmit/er-delayed-2pkt-sack.pkt
@@ -0,0 +1,27 @@
+// Test delayed ER with 2 packets outstanding, receiver sending SACKs.
+
+// Enable delayed early retransmit.
+`sysctl -q net.ipv4.tcp_early_retrans=2`
+
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+0.100 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
+0.100 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
+0.200 < . 1:1(0) ack 1 win 257
+0.200 accept(3, ..., ...) = 4
+
+0.200 write(4, ..., 2920) = 2920
+0.200 > P. 1:2921(2920) ack 1
+0.300 < . 1:1(0) ack 1 win 257 <sack 1461:2921,nop,nop>
+0.325 > . 1:1461(1460) ack 1 // delayed Early Retransmit at RTT/4 = 25ms
+0.425 < . 1:1(0) ack 2921 win 257
+
+0.500 close(4) = 0
+0.500 > F. 2921:2921(0) ack 1
+0.600 < F. 1:1(0) ack 2922 win 257
+0.601 > . 2922:2922(0) ack 2
+
+0.700 `sysctl -q net.ipv4.tcp_early_retrans=3`
diff --git a/test/packetdrill/tests/linux/early_retransmit/er-delayed-3pkt-sack.pkt b/test/packetdrill/tests/linux/early_retransmit/er-delayed-3pkt-sack.pkt
new file mode 100644
index 0000000..5d05264
--- /dev/null
+++ b/test/packetdrill/tests/linux/early_retransmit/er-delayed-3pkt-sack.pkt
@@ -0,0 +1,28 @@
+// Test delayed ER with 3 packets outstanding, receiver sending SACKs.
+
+// Enable delayed early retransmit.
+`sysctl -q net.ipv4.tcp_early_retrans=2`
+
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+0.100 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
+0.100 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
+0.200 < . 1:1(0) ack 1 win 257
+0.200 accept(3, ..., ...) = 4
+
+0.200 write(4, ..., 4380) = 4380
+0.200 > P. 1:4381(4380) ack 1
+0.300 < . 1:1(0) ack 1 win 257 <sack 1461:2921,nop,nop>
+0.300 < . 1:1(0) ack 1 win 257 <sack 1461:4381,nop,nop>
+0.325 > . 1:1461(1460) ack 1 // delayed Early Retransmit at RTT/4 = 25ms
+0.425 < . 1:1(0) ack 4381 win 257
+
+0.500 close(4) = 0
+0.500 > F. 4381:4381(0) ack 1
+0.600 < F. 1:1(0) ack 4382 win 257
+0.601 > . 4382:4382(0) ack 2
+
+0.700 `sysctl -q net.ipv4.tcp_early_retrans=3`
diff --git a/test/packetdrill/tests/linux/early_retransmit/er-delayed-filled-3pkt-sack.pkt b/test/packetdrill/tests/linux/early_retransmit/er-delayed-filled-3pkt-sack.pkt
new file mode 100644
index 0000000..e06db1b
--- /dev/null
+++ b/test/packetdrill/tests/linux/early_retransmit/er-delayed-filled-3pkt-sack.pkt
@@ -0,0 +1,31 @@
+// Test delayed ER with 3 packets outstanding, receiver sending SACKs.
+// Added wrinkle: the ACK for the missing first packet finally arrives,
+// filling the hole and making the ER superfluous.
+// This test verifies that the ER timer gets correctly cancelled.
+
+// Enable delayed early retransmit.
+`sysctl -q net.ipv4.tcp_early_retrans=2`
+
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+0.100 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
+0.100 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
+0.200 < . 1:1(0) ack 1 win 257
+0.200 accept(3, ..., ...) = 4
+
+0.200 write(4, ..., 4380) = 4380
+0.200 > P. 1:4381(4380) ack 1
+0.300 < . 1:1(0) ack 1 win 257 <sack 1461:2921,nop,nop>
+0.300 < . 1:1(0) ack 1 win 257 <sack 1461:4381,nop,nop>
+0.310 < . 1:1(0) ack 4381 win 257
+// No ER or RTO timer should fire here, since all data is ACKed
+
+1.800 close(4) = 0
+1.800 > F. 4381:4381(0) ack 1
+1.900 < F. 1:1(0) ack 4382 win 257
+1.900 > . 4382:4382(0) ack 2
+
+2.000 `sysctl -q net.ipv4.tcp_early_retrans=3`
diff --git a/test/packetdrill/tests/linux/early_retransmit/er-delayed-get-ack-3pkt-sack.pkt b/test/packetdrill/tests/linux/early_retransmit/er-delayed-get-ack-3pkt-sack.pkt
new file mode 100644
index 0000000..794bcb0
--- /dev/null
+++ b/test/packetdrill/tests/linux/early_retransmit/er-delayed-get-ack-3pkt-sack.pkt
@@ -0,0 +1,35 @@
+// Test delayed ER with 3 packets outstanding, receiver sending SACKs.
+// Added wrinkle: the sender gets an ACK before the delayed ER timer fires,
+// so we don't do the originally scheduled ER but instead reschedule
+// the ER timer for later.
+
+// Enable delayed early retransmit.
+`sysctl -q net.ipv4.tcp_early_retrans=2`
+
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+0.100 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
+0.100 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
+0.200 < . 1:1(0) ack 1 win 257
+0.200 accept(3, ..., ...) = 4
+
+0.200 write(4, ..., 4380) = 4380
+0.200 > P. 1:4381(4380) ack 1
+0.300 < . 1:1(0) ack 1 win 257 <sack 1461:2921,nop,nop>
+0.300 < . 1:1(0) ack 1 win 257 <sack 1461:4381,nop,nop>
+// Next we get an ACK before ER fires. Any ACK should cause us
+// to cancel ER timer, whether it ACKs new data or (as in this case) does not.
+// Then we reschedule the ER timer again.
+0.310 < . 1:1(0) ack 1 win 257 <sack 1461:4381,nop,nop>
+0.335 > . 1:1461(1460) ack 1 // delayed ER at 0.310 + RTT/4=25ms
+0.435 < . 1:1(0) ack 4381 win 257
+
+0.700 close(4) = 0
+0.700 > F. 4381:4381(0) ack 1
+0.800 < F. 1:1(0) ack 4382 win 257
+0.800 > . 4382:4382(0) ack 2
+
+0.900 `sysctl -q net.ipv4.tcp_early_retrans=3`
diff --git a/test/packetdrill/tests/linux/early_retransmit/er-quick-2pkt-sack.pkt b/test/packetdrill/tests/linux/early_retransmit/er-quick-2pkt-sack.pkt
new file mode 100644
index 0000000..6d0652c
--- /dev/null
+++ b/test/packetdrill/tests/linux/early_retransmit/er-quick-2pkt-sack.pkt
@@ -0,0 +1,27 @@
+// Test quick ER (no delay) with 2 packets outstanding, receiver sending SACKs.
+
+// Enable quick early retransmit.
+`sysctl -q net.ipv4.tcp_early_retrans=1`
+
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+0.100 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
+0.100 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
+0.200 < . 1:1(0) ack 1 win 257
+0.200 accept(3, ..., ...) = 4
+
+0.200 write(4, ..., 2920) = 2920
+0.200 > P. 1:2921(2920) ack 1
+0.300 < . 1:1(0) ack 1 win 257 <sack 1461:2921,nop,nop>
+0.300 > . 1:1461(1460) ack 1 // quick Early Retransmit
+0.400 < . 1:1(0) ack 2921 win 257
+
+0.500 close(4) = 0
+0.500 > F. 2921:2921(0) ack 1
+0.600 < F. 1:1(0) ack 2922 win 257
+0.601 > . 2922:2922(0) ack 2
+
+0.700 `sysctl -q net.ipv4.tcp_early_retrans=3`
diff --git a/test/packetdrill/tests/linux/early_retransmit/er-quick-3pkt-sack.pkt b/test/packetdrill/tests/linux/early_retransmit/er-quick-3pkt-sack.pkt
new file mode 100644
index 0000000..49719bb
--- /dev/null
+++ b/test/packetdrill/tests/linux/early_retransmit/er-quick-3pkt-sack.pkt
@@ -0,0 +1,28 @@
+// Test quick ER (no delay) with 3 packets outstanding, receiver sending SACKs.
+
+// Enable quick early retransmit.
+`sysctl -q net.ipv4.tcp_early_retrans=1`
+
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+0.100 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
+0.100 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
+0.200 < . 1:1(0) ack 1 win 257
+0.200 accept(3, ..., ...) = 4
+
+0.200 write(4, ..., 4380) = 4380
+0.200 > P. 1:4381(4380) ack 1
+0.300 < . 1:1(0) ack 1 win 257 <sack 1461:2921,nop,nop>
+0.300 < . 1:1(0) ack 1 win 257 <sack 1461:4381,nop,nop>
+0.300 > . 1:1461(1460) ack 1 // quick ER (no delay)
+0.400 < . 1:1(0) ack 4381 win 257
+
+0.500 close(4) = 0
+0.500 > F. 4381:4381(0) ack 1
+0.600 < F. 1:1(0) ack 4382 win 257
+0.601 > . 4382:4382(0) ack 2
+
+0.700 `sysctl -q net.ipv4.tcp_early_retrans=3`
diff --git a/test/packetdrill/tests/linux/fast_recovery/prr-ss-ack-below-snd_una-reno.pkt b/test/packetdrill/tests/linux/fast_recovery/prr-ss-ack-below-snd_una-reno.pkt
new file mode 100644
index 0000000..4cc7b3b
--- /dev/null
+++ b/test/packetdrill/tests/linux/fast_recovery/prr-ss-ack-below-snd_una-reno.pkt
@@ -0,0 +1,51 @@
+// Test PRR-slowstart implementation.
+// In this variant we verify that the sender uses SACK info on an ACK
+// below snd_una.
+// This variant tests behavior with Reno congestion control.
+
+`sysctl -q net.ipv4.tcp_congestion_control=reno`
+
+// Establish a connection.
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+0.100 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+0.100 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
+// RTT 100ms
+0.200 < . 1:1(0) ack 1 win 320
+0.200 accept(3, ..., ...) = 4
+
+// Send 10 data segments.
+0.200 write(4, ..., 10000) = 10000
+0.200 > P. 1:10001(10000) ack 1
+
+// Lost packets 1:1001, 4001:5001, 7001:8001.
+// Lots of reordering in both directions.
+0.310 < . 1:1(0) ack 1 win 320 <sack 1001:2001,nop,nop>
+0.320 < . 1:1(0) ack 1 win 320 <sack 1001:3001,nop,nop>
+0.330 < . 1:1(0) ack 1 win 320 <sack 1001:3001 8001:9001,nop,nop>
+// Enter fast recovery.
+0.330 > . 1:1001(1000) ack 1
+0.330 > . 3001:4001(1000) ack 1
+
+// An ACK advances snd_una.
+0.440 < . 1:1(0) ack 4001 win 320 <sack 8001:9001,nop,nop>
+0.440 > . 4001:5001(1000) ack 1
+0.440 > . 5001:6001(1000) ack 1
+
+// The following ACK was reordered - delayed so that it arrives with
+// an ACK field below snd_una. Here we check that the newly-SACKed
+// 2MSS at 5001:7001 cause us to send out 1 more MSS.
+0.450 < . 1:1(0) ack 3001 win 320 <sack 5001:7001,nop,nop>
+0.450 > . 7001:8001(1000) ack 1
+
+// Receiver ACKs all data.
+0.560 < . 1:1(0) ack 10001 win 320
+
+// Write another 10 MSS, of which 5MSS (cwnd=ssthresh) should go out:
+0.600 write(4, ..., 10000) = 10000
+0.600 > . 10001:15001(5000) ack 1
+
+0.700 `sysctl -q net.ipv4.tcp_congestion_control=cubic`
diff --git a/test/packetdrill/tests/linux/fast_retransmit/fr-4pkt-sack-linux.pkt b/test/packetdrill/tests/linux/fast_retransmit/fr-4pkt-sack-linux.pkt
new file mode 100644
index 0000000..a1416d9
--- /dev/null
+++ b/test/packetdrill/tests/linux/fast_retransmit/fr-4pkt-sack-linux.pkt
@@ -0,0 +1,35 @@
+// Test fast retransmit with 4 packets outstanding, receiver sending SACKs.
+// In this variant the receiver supports SACK.
+
+// Establish a connection.
+0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+
++0 bind(3, ..., ...) = 0
++0 listen(3, 1) = 0
+
++0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
++0 > S. 0:0(0) ack 1 <...>
+
++.1 < . 1:1(0) ack 1 win 257
++0 accept(3, ..., ...) = 4
+
+// Send 1 data segment and get an ACK, so cwnd is now 4.
++0 write(4, ..., 1000) = 1000
++0 > P. 1:1001(1000) ack 1
+
++.1 < . 1:1(0) ack 1001 win 257
+
+// Write 4 data segments.
++0 write(4, ..., 4000) = 4000
++0 > P. 1001:5001(4000) ack 1
+
+// Get 3 SACKs.
++.1 < . 1:1(0) ack 1001 win 257 <sack 2001:3001,nop,nop>
++0 < . 1:1(0) ack 1001 win 257 <sack 2001:4001,nop,nop>
++0 < . 1:1(0) ack 1001 win 257 <sack 2001:5001,nop,nop>
+// We've received 3 duplicate ACKs, so we do a fast retransmit.
++0 > . 1001:2001(1000) ack 1
+
+// Receiver ACKs all data.
++.1 < . 1:1(0) ack 6001 win 257
diff --git a/test/packetdrill/tests/linux/icmp/icmp-all-types.pkt b/test/packetdrill/tests/linux/icmp/icmp-all-types.pkt
new file mode 100644
index 0000000..169cdb4
--- /dev/null
+++ b/test/packetdrill/tests/linux/icmp/icmp-all-types.pkt
@@ -0,0 +1,71 @@
+// Test handling of incoming ICMP packets.
+// This test exercises all known ICMP packet types, and a few unknown
+// types.
+
+// Establish a connection.
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+0.100 < S 0:0(0) win 32792 <mss 1460,nop,wscale 7>
+0.100 > S. 0:0(0) ack 1 <mss 1460,nop,wscale 6>
+0.200 < . 1:1(0) ack 1 win 257
+0.200 accept(3, ..., ...) = 4
+
+// Send 1 data segment.
+0.200 write(4, ..., 1460) = 1460
+0.200 > P. 1:1461(1460) ack 1
+
+// We get lots of incoming ICMP messages.
+
+// First the unreachable type and all its codes
+0.300 < icmp unreachable net_unreachable
+0.301 < icmp unreachable host_unreachable
+0.302 < icmp unreachable protocol_unreachable
+0.303 < icmp unreachable port_unreachable
+0.304 < icmp unreachable frag_needed mtu 1234
+0.305 < icmp unreachable source_route_failed
+0.306 < icmp unreachable net_unknown
+0.307 < icmp unreachable host_unknown
+0.308 < icmp unreachable source_host_isolated
+0.309 < icmp unreachable net_prohibited
+0.310 < icmp unreachable host_prohibited
+0.311 < icmp unreachable net_unreachable_for_tos
+0.312 < icmp unreachable host_unreachable_for_tos
+0.313 < icmp unreachable packet_filtered
+0.314 < icmp unreachable precedence_violation
+0.315 < icmp unreachable precedence_cutoff
+
+// Then all the other types. These are legal because the code is optional.
+0.400 < icmp echo_reply
+0.401 < icmp source_quench
+0.402 < icmp redirect
+0.403 < icmp echo_request
+0.404 < icmp time_exceeded
+0.405 < icmp parameter_problem
+0.406 < icmp timestamp_request
+0.407 < icmp timestamp_reply
+0.408 < icmp information_request
+0.409 < icmp information_reply
+0.410 < icmp address_mask_request
+0.411 < icmp address_mask_reply
+
+// Now try symbolic types with numeric codes.
+0.450 < icmp unreachable code_0
+0.451 < icmp unreachable code_1
+0.452 < icmp unreachable code_255
+
+// Now try numeric types with numeric codes
+0.460 < icmp type_0 code_0
+0.461 < icmp type_1 code_0
+0.462 < icmp type_255 code_0
+
+// Receiver ACKs all data.
+0.470 < . 1:1(0) ack 1461 win 257
+
+// Clean up.
+0.600 close(4) = 0
+0.600 > F. 1461:1461(0) ack 1
+0.700 < F. 1:1(0) ack 1462 win 257
+0.700 > . 1462:1462(0) ack 2
diff --git a/test/packetdrill/tests/linux/inet_diag/inet-diag-ipv4-mapped-ipv6.pkt b/test/packetdrill/tests/linux/inet_diag/inet-diag-ipv4-mapped-ipv6.pkt
new file mode 100644
index 0000000..2b7d8ea
--- /dev/null
+++ b/test/packetdrill/tests/linux/inet_diag/inet-diag-ipv4-mapped-ipv6.pkt
@@ -0,0 +1,29 @@
+// Test inet_diag for AF_INET6 sockets with IPv4 traffic.
+// We use the "ss" socket statistics tool, which uses inet_diag sockets.
+// We use the default tcptest local IP address for IPv4-mapped-IPv6.
+
+// Options (command line arguments in script file) to force ipv4-mapped-ipv6.
+--ip_version="ipv4-mapped-ipv6"
+
+// Establish a connection.
+0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+
++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
++0 bind(3, ..., ...) = 0
++0 listen(3, 1) = 0
+
++0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 2>
++0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
+
++0 `ss -6 -n state SYN-RECV | grep ::ffff:192.168.0.1:8080 > /dev/null`
+
++0 < . 1:1(0) ack 1 win 32890
+
++0 accept(3, ..., ...) = 4
+
+// first, use inet_diag with no filter:
++0 `ss -6 -n | grep ::ffff:192.168.0.1:8080 > /dev/null`
+
+// then try filters, which use a different code path:
++0 `ss -6 -n --options --extended --info '( sport = :8080 )' | grep ::ffff:192.168.0.1:8080 > /dev/null`
++0 `ss -6 -n --options --extended --info '( sport = :8080 )' src ::ffff:192.168.0.1/128 | grep ::ffff:192.168.0.1:8080 > /dev/null`
diff --git a/test/packetdrill/tests/linux/inet_diag/inet-diag-ipv4.pkt b/test/packetdrill/tests/linux/inet_diag/inet-diag-ipv4.pkt
new file mode 100644
index 0000000..c4e632a
--- /dev/null
+++ b/test/packetdrill/tests/linux/inet_diag/inet-diag-ipv4.pkt
@@ -0,0 +1,28 @@
+// Test inet_diag for AF_INET sockets with IPv4 traffic.
+// We use the "ss" socket statistics tool, which uses inet_diag sockets.
+// We use the default tcptest local IP address for IPv4.
+
+// Options (command line arguments in script file) to force IPv4.
+--ip_version=ipv4
+
+// Establish a connection.
+0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
++0 bind(3, ..., ...) = 0
++0 listen(3, 1) = 0
+
++0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 2>
++0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
+
++0 `ss -4 -n state SYN-RECV | grep 192.168.0.1:8080 > /dev/null`
+
++0 < . 1:1(0) ack 1 win 32890
+
++0 accept(3, ..., ...) = 4
+
+// first, use inet_diag with no filter:
++0 `ss -4 -n | grep :8080 | grep 192.168.0.1:8080 > /dev/null`
+
+// then try filters, which use a different code path:
++0 `ss -4 -n --options --extended --info '( sport = :8080 )' | grep 192.168.0.1:8080 > /dev/null`
++0 `ss -4 -n --options --extended --info '( sport = :8080 )' src 192.168.0.1/32 | grep 192.168.0.1:8080 > /dev/null`
diff --git a/test/packetdrill/tests/linux/inet_diag/inet-diag-ipv6.pkt b/test/packetdrill/tests/linux/inet_diag/inet-diag-ipv6.pkt
new file mode 100644
index 0000000..183d3ce
--- /dev/null
+++ b/test/packetdrill/tests/linux/inet_diag/inet-diag-ipv6.pkt
@@ -0,0 +1,29 @@
+// Test inet_diag for AF_INET6 sockets with IPv6 traffic.
+// We use the "ss" socket statistics tool, which uses inet_diag sockets.
+// We use the default tcptest local IP address for IPv6.
+
+// Options (command line arguments in script file) to force IPv6.
+--ip_version=ipv6
+--mtu=1520
+
+// Establish a connection.
+0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
++0 bind(3, ..., ...) = 0
++0 listen(3, 1) = 0
+
++0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 2>
++0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
+
++0 `ss -6 -n state SYN-RECV | grep fd3d:fa7b:d17d::1:8080 > /dev/null`
+
++0 < . 1:1(0) ack 1 win 32890
+
++0 accept(3, ..., ...) = 4
+
+// first, use inet_diag with no filter:
++0 `ss -6 -n | grep :8080 | grep fd3d:fa7b:d17d::1:8080 > /dev/null`
+
+// then try filters, which use a different code path:
++0 `ss -6 -n --options --extended --info '( sport = :8080 )' | grep fd3d:fa7b:d17d::1:8080 > /dev/null`
++0 `ss -6 -n --options --extended --info '( sport = :8080 )' src fd3d:fa7b:d17d::1/128 | grep fd3d:fa7b:d17d::1:8080 > /dev/null`
diff --git a/test/packetdrill/tests/linux/init_rto/init_rto_passive_open.pkt b/test/packetdrill/tests/linux/init_rto/init_rto_passive_open.pkt
new file mode 100644
index 0000000..8775c3c
--- /dev/null
+++ b/test/packetdrill/tests/linux/init_rto/init_rto_passive_open.pkt
@@ -0,0 +1,17 @@
+// A simple test of initRTO (sysctl_tcp_synack_rto, which defaults to 1 sec)
+// for the passive open side.
+
+// We want the SYN-ACK to be retransmitted 1 sec after the SYN, but
+// usually it happens at 1.2 or 1.4 sec due to the fact that the
+// kernel only schedules SYN-ACK retransmissions periodically.
+// Specifically, the TCP_SYNQ_INTERVAL (period of the SYN-ACK timer) is 200ms.
+--tolerance_usecs=405000
+
+// Create a listener socket.
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+0.100 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
+0.100 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
+1.100 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
diff --git a/test/packetdrill/tests/linux/initial_window/iw10-base-case.pkt b/test/packetdrill/tests/linux/initial_window/iw10-base-case.pkt
new file mode 100755
index 0000000..f790f56
--- /dev/null
+++ b/test/packetdrill/tests/linux/initial_window/iw10-base-case.pkt
@@ -0,0 +1,21 @@
+// A simple server-side test that sends exactly an initial window (IW10)
+// worth of packets.
+
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+0.100 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
+0.100 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
+0.200 < . 1:1(0) ack 1 win 257
+0.200 accept(3, ..., ...) = 4
+
+0.200 write(4, ..., 14600) = 14600
+0.200 > P. 1:14601(14600) ack 1
+0.300 < . 1:1(0) ack 14601 win 257
+
+0.400 close(4) = 0
+0.401 > F. 14601:14601(0) ack 1
+0.501 < F. 1:1(0) ack 14602 win 257
+0.502 > . 14602:14602(0) ack 2
diff --git a/test/packetdrill/tests/linux/initial_window/iw10-short-response.pkt b/test/packetdrill/tests/linux/initial_window/iw10-short-response.pkt
new file mode 100755
index 0000000..6db3c4b
--- /dev/null
+++ b/test/packetdrill/tests/linux/initial_window/iw10-short-response.pkt
@@ -0,0 +1,21 @@
+// A simple server-side test that sends a response smaller
+// than the initial window of 10 MSS.
+
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+0.100 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
+0.100 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
+0.200 < . 1:1(0) ack 1 win 257
+0.200 accept(3, ..., ...) = 4
+
+0.200 write(4, ..., 7300) = 7300
+0.200 > P. 1:7301(7300) ack 1
+0.300 < . 1:1(0) ack 7301 win 257
+
+0.400 close(4) = 0
+0.401 > F. 7301:7301(0) ack 1
+0.501 < F. 1:1(0) ack 7302 win 257
+0.502 > . 7302:7302(0) ack 2
diff --git a/test/packetdrill/tests/linux/ioctl/ioctl-siocinq-fin.pkt b/test/packetdrill/tests/linux/ioctl/ioctl-siocinq-fin.pkt
new file mode 100644
index 0000000..8499e02
--- /dev/null
+++ b/test/packetdrill/tests/linux/ioctl/ioctl-siocinq-fin.pkt
@@ -0,0 +1,30 @@
+// A simple test for the TCP SIOCINQ ioctl.
+
+// Create a socket.
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+// Establish a connection.
+0.100 < S 0:0(0) win 20000 <mss 1000,sackOK,nop,nop>
+0.100 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK>
+0.200 < . 1:1(0) ack 1 win 20000
+0.200 accept(3, ..., ...) = 4
+
+// Receive a segment.
+0.200 < P. 1:1001(1000) ack 1 win 257
+0.200 > . 1:1(0) ack 1001
+
+0.210 ioctl(4, SIOCINQ, [1000]) = 0
+0.220 read(4, ..., 1000) = 1000
+0.230 ioctl(4, SIOCINQ, [0]) = 0
+
+// Receive a segment with a FIN.
+0.300 < FP. 1001:2001(1000) ack 1 win 257
+0.300 > . 1:1(0) ack 2002
+
+0.310 ioctl(4, SIOCINQ, [1000]) = 0
+0.320 read(4, ..., 1000) = 1000
+0.330 ioctl(4, SIOCINQ, [0]) = 0
diff --git a/test/packetdrill/tests/linux/listen/listen-incoming-ack.pkt b/test/packetdrill/tests/linux/listen/listen-incoming-ack.pkt
new file mode 100644
index 0000000..65f3733
--- /dev/null
+++ b/test/packetdrill/tests/linux/listen/listen-incoming-ack.pkt
@@ -0,0 +1,20 @@
+// Test behavior when a listener gets an incoming packet that has
+// the ACK bit set but not the SYN bit set.
+
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+0.100 < . 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+0.100 > R 0:0(0) win 0
+
+// Now make sure that when a valid SYN arrives shortly thereafter
+// (with the same address 4-tuple) we can still successfully establish
+// a connection.
+
+0.200 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+0.200 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
+
+0.300 < . 1:1(0) ack 1 win 320
+0.300 accept(3, ..., ...) = 4
diff --git a/test/packetdrill/tests/linux/listen/listen-incoming-no-tcp-flags.pkt b/test/packetdrill/tests/linux/listen/listen-incoming-no-tcp-flags.pkt
new file mode 100644
index 0000000..3ae1ff3
--- /dev/null
+++ b/test/packetdrill/tests/linux/listen/listen-incoming-no-tcp-flags.pkt
@@ -0,0 +1,21 @@
+// Test behavior when a listener gets an incoming packet that has
+// no TCP flags set.
+
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+// An incoming TCP segment with no TCP flags set.
+0.100 < - 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+// Linux ignores the packet and sends nothing.
+
+// Now make sure that when a valid SYN arrives shortly thereafter
+// (with the same address 4-tuple) we can still successfully establish
+// a connection.
+
+0.200 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+0.200 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
+
+0.300 < . 1:1(0) ack 1 win 320
+0.300 accept(3, ..., ...) = 4
diff --git a/test/packetdrill/tests/linux/listen/listen-incoming-rst.pkt b/test/packetdrill/tests/linux/listen/listen-incoming-rst.pkt
new file mode 100644
index 0000000..83d25f2
--- /dev/null
+++ b/test/packetdrill/tests/linux/listen/listen-incoming-rst.pkt
@@ -0,0 +1,22 @@
+// Test behavior when a listener gets an incoming packet that has
+// the RST bit set but not the SYN bit set.
+
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+0.100 < R 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+
+// The TCP stack should not respond to incoming RSTs, or else
+// we could get infinite RST ping-pong storms.
+
+// Now make sure that when a valid SYN arrives shortly thereafter
+// (with the same address 4-tuple) we can still successfully establish
+// a connection.
+
+0.200 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+0.200 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
+
+0.300 < . 1:1(0) ack 1 win 320
+0.300 accept(3, ..., ...) = 4
diff --git a/test/packetdrill/tests/linux/listen/listen-incoming-syn-ack.pkt b/test/packetdrill/tests/linux/listen/listen-incoming-syn-ack.pkt
new file mode 100644
index 0000000..bc2569c
--- /dev/null
+++ b/test/packetdrill/tests/linux/listen/listen-incoming-syn-ack.pkt
@@ -0,0 +1,20 @@
+// Test behavior when a listener gets an incoming packet that has
+// the SYN and ACK bits set.
+
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+0.100 < S. 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+0.100 > R 0:0(0) win 0
+
+// Now make sure that when a valid SYN arrives shortly thereafter
+// (with the same address 4-tuple) we can still successfully establish
+// a connection.
+
+0.200 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+0.200 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
+
+0.300 < . 1:1(0) ack 1 win 320
+0.300 accept(3, ..., ...) = 4
diff --git a/test/packetdrill/tests/linux/listen/listen-incoming-syn-rst.pkt b/test/packetdrill/tests/linux/listen/listen-incoming-syn-rst.pkt
new file mode 100644
index 0000000..f3c0607
--- /dev/null
+++ b/test/packetdrill/tests/linux/listen/listen-incoming-syn-rst.pkt
@@ -0,0 +1,22 @@
+// Test behavior when a listener gets an incoming packet that has
+// the SYN and RST bits set.
+
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+0.100 < SR 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+
+// The TCP stack should not respond to incoming RSTs, or else
+// we could get infinite RST ping-pong storms.
+
+// Now make sure that when a valid SYN arrives shortly thereafter
+// (with the same address 4-tuple) we can still successfully establish
+// a connection.
+
+0.200 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+0.200 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
+
+0.300 < . 1:1(0) ack 1 win 320
+0.300 accept(3, ..., ...) = 4
diff --git a/test/packetdrill/tests/linux/listen/listen-unbound.pkt b/test/packetdrill/tests/linux/listen/listen-unbound.pkt
new file mode 100644
index 0000000..fcf74fc
--- /dev/null
+++ b/test/packetdrill/tests/linux/listen/listen-unbound.pkt
@@ -0,0 +1,5 @@
+// Test behavior when listen() is called on a socket that has not
+// been explicitly bound to a local address with bind().
+
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 listen(3, 1) = 0
\ No newline at end of file
diff --git a/test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-client-ts.pkt b/test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-client-ts.pkt
new file mode 100644
index 0000000..a8544f5
--- /dev/null
+++ b/test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-client-ts.pkt
@@ -0,0 +1,17 @@
+// Test that getsockopt of TCP_MAXSEG works on active/client TCP connections.
+// In this variant we test that a simple query of segment size works,
+// in the case where TCP timestamps reduce the usable payload space.
+
+// Create a socket.
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+
+0.100...0.200 connect(3, ..., ...) = 0
+
+// Establish a connection.
+0.100 > S 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 6>
+0.200 < S. 0:0(0) ack 1 win 32792 <mss 1100,sackOK,TS val 200 ecr 100,nop,wscale 7>
+0.200 > . 1:1(0) ack 1 <nop,nop,TS val 200 ecr 200>
+
+// Verify that the kernel reduced the returned segment size
+// to account for TCP timestamps.
+0.300 getsockopt(3, SOL_TCP, TCP_MAXSEG, [1088], [4]) = 0
diff --git a/test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-client.pkt b/test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-client.pkt
new file mode 100644
index 0000000..a75b8b3
--- /dev/null
+++ b/test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-client.pkt
@@ -0,0 +1,14 @@
+// Test that getsockopt of TCP_MAXSEG works on active/client TCP connections.
+// In this variant we test that a simple query of segment size works.
+
+// Create a socket.
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+
+0.100...0.200 connect(3, ..., ...) = 0
+
+// Establish a connection.
+0.100 > S 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 6>
+0.200 < S. 0:0(0) ack 1 win 32792 <mss 1100,nop,wscale 7>
+0.200 > . 1:1(0) ack 1
+
+0.300 getsockopt(3, SOL_TCP, TCP_MAXSEG, [1100], [4]) = 0
diff --git a/test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-server-advmss-ipv4.pkt b/test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-server-advmss-ipv4.pkt
new file mode 100644
index 0000000..c07c5c0
--- /dev/null
+++ b/test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-server-advmss-ipv4.pkt
@@ -0,0 +1,29 @@
+// Test that getsockopt of TCP_MAXSEG works on passive/server TCP connections.
+// In this variant we test that we get the expected result when
+// the routing config specifies an "advmss 1430 mtu lock 1470" for the
+// route to the remote IP under test.
+
+// To ensure that we do not cache something that interferes with other tests:
+--remote_ip="192.0.2.2"
+
+`ip route change 192.0.2.2 via 192.168.0.2 dev tun0 advmss 1430 mtu lock 1470`
+
+// Set up a listening socket.
+0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
++0 bind(3, ..., ...) = 0
++0 listen(3, 1) = 0
+
+// Establish a connection without timestamps.
++0 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
++0 > S. 0:0(0) ack 1 <mss 1430,nop,nop,sackOK,nop,wscale 6>
++0 < . 1:1(0) ack 1 win 257
+
++0 accept(3, ..., ...) = 4
+
+// Verify that the kernel returns the expected TCP max payload size.
++0 getsockopt(4, SOL_TCP, TCP_MAXSEG, [1430], [4]) = 0
+
++0 write(4, ..., 1500) = 1500
++0 > . 1:1431(1430) ack 1
++0 > P. 1431:1501(70) ack 1
diff --git a/test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-server-advmss-ts-ipv4.pkt b/test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-server-advmss-ts-ipv4.pkt
new file mode 100644
index 0000000..2222d51
--- /dev/null
+++ b/test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-server-advmss-ts-ipv4.pkt
@@ -0,0 +1,30 @@
+// Test that getsockopt of TCP_MAXSEG works on passive/server TCP connections.
+// In this variant we test that we get the expected result when TCP
+// timestamps are in use and the routing config specifies an
+// "advmss 1430 mtu lock 1470" for the route to the remote IP under test.
+
+// To ensure that we do not cache something that interferes with other tests:
+--remote_ip="192.0.2.2"
+
+`ip route change 192.0.2.2 via 192.168.0.2 dev tun0 advmss 1430 mtu lock 1470`
+
+// Set up a listening socket.
+0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
++0 bind(3, ..., ...) = 0
++0 listen(3, 1) = 0
+
+// Establish a connection with TCP timestamps.
++0 < S 0:0(0) win 32792 <mss 1460,sackOK,TS val 0 ecr 0,nop,wscale 7>
++0 > S. 0:0(0) ack 1 <mss 1430,sackOK,TS val 0 ecr 0,nop,wscale 6>
++0 < . 1:1(0) ack 1 win 257 <nop,nop,TS val 0 ecr 0>
+
++0 accept(3, ..., ...) = 4
+
+// Verify that the kernel reduced the returned segment size
+// to account for TCP timestamps.
++0 getsockopt(4, SOL_TCP, TCP_MAXSEG, [1418], [4]) = 0
+
++0 write(4, ..., 1500) = 1500
++0 > . 1:1419(1418) ack 1 <nop,nop,TS val 0 ecr 0>
++0 > P. 1419:1501(82) ack 1 <nop,nop,TS val 0 ecr 0>
diff --git a/test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-server-ts.pkt b/test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-server-ts.pkt
new file mode 100644
index 0000000..5d28c93
--- /dev/null
+++ b/test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-server-ts.pkt
@@ -0,0 +1,20 @@
+// Test that getsockopt of TCP_MAXSEG works on passive/server TCP connections.
+// In this variant we test that a simple query of segment size works,
+// in the case where TCP timestamps reduce the usable payload space.
+
+// Set up a listening socket.
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+// Establish a connection
+0.100 < S 0:0(0) win 32792 <mss 1100,sackOK,TS val 100 ecr 0,nop,wscale 7>
+0.100 > S. 0:0(0) ack 1 <mss 1460,sackOK,TS val 100 ecr 100,nop,wscale 6>
+0.200 < . 1:1(0) ack 1 win 257 <nop,nop,TS val 200 ecr 100>
+
+0.300 accept(3, ..., ...) = 4
+
+// Verify that the kernel reduced the returned segment size
+// to account for TCP timestamps.
+0.400 getsockopt(4, SOL_TCP, TCP_MAXSEG, [1088], [4]) = 0
diff --git a/test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-server.pkt b/test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-server.pkt
new file mode 100644
index 0000000..03516c1
--- /dev/null
+++ b/test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-server.pkt
@@ -0,0 +1,17 @@
+// Test that getsockopt of TCP_MAXSEG works on passive/server TCP connections.
+// In this variant we test that a simple query of segment size works.
+
+// Set up a listening socket.
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+// Establish a connection
+0.100 < S 0:0(0) win 32792 <mss 1100,nop,wscale 7>
+0.100 > S. 0:0(0) ack 1 <mss 1460,nop,wscale 6>
+0.200 < . 1:1(0) ack 1 win 257
+
+0.300 accept(3, ..., ...) = 4
+
+0.400 getsockopt(4, SOL_TCP, TCP_MAXSEG, [1100], [4]) = 0
diff --git a/test/packetdrill/tests/linux/mss/mss-setsockopt-tcp_maxseg-client.pkt b/test/packetdrill/tests/linux/mss/mss-setsockopt-tcp_maxseg-client.pkt
new file mode 100644
index 0000000..906ad6e
--- /dev/null
+++ b/test/packetdrill/tests/linux/mss/mss-setsockopt-tcp_maxseg-client.pkt
@@ -0,0 +1,24 @@
+// Test TCP_MAXSEG works on active/client TCP connections.
+
+// Create a socket.
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+
+// Set MSS to 1100.
+0.010 setsockopt(3, SOL_TCP, TCP_MAXSEG, [1100], 4) = 0
+// TODO(ncardwell): the following is silly; should we fix it?
+0.020 getsockopt(3, SOL_TCP, TCP_MAXSEG, [536], [4]) = 0
+
+0.100...0.200 connect(3, ..., ...) = 0
+
+// Establish a connection with an outgoing advertised MSS of 1100.
+0.100 > S 0:0(0) <mss 1100,sackOK,TS val 100 ecr 0,nop,wscale 6>
+0.200 < S. 0:0(0) ack 1 win 32792 <mss 1460,nop,wscale 7>
+0.200 > . 1:1(0) ack 1
+
+0.300 getsockopt(3, SOL_TCP, TCP_MAXSEG, [1100], [4]) = 0
+
+0.400 %{ assert tcpi_advmss == 1100; assert tcpi_snd_mss == 1100 }%
+
+// IW10 MSS should yield outgoing TSO packet with 10*1100 == 11000 bytes:
+0.500 write(3, ..., 12000) = 12000
+0.500 > . 1:11001(11000) ack 1
diff --git a/test/packetdrill/tests/linux/mss/mss-setsockopt-tcp_maxseg-server.pkt b/test/packetdrill/tests/linux/mss/mss-setsockopt-tcp_maxseg-server.pkt
new file mode 100644
index 0000000..f2ed31d
--- /dev/null
+++ b/test/packetdrill/tests/linux/mss/mss-setsockopt-tcp_maxseg-server.pkt
@@ -0,0 +1,27 @@
+// Test TCP_MAXSEG works on passive/server TCP connections.
+
+// Set up a listening socket.
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+// Set MSS to 1100.
+0.010 setsockopt(3, SOL_TCP, TCP_MAXSEG, [1100], 4) = 0
+// TODO(ncardwell): the following is silly; should we fix it?
+0.020 getsockopt(3, SOL_TCP, TCP_MAXSEG, [536], [4]) = 0
+
+// Establish a connection with an outgoing advertised MSS of 1100.
+0.100 < S 0:0(0) win 32792 <mss 1300,nop,wscale 7>
+0.100 > S. 0:0(0) ack 1 <mss 1100,nop,wscale 6>
+0.200 < . 1:1(0) ack 1 win 257
+
+0.300 accept(3, ..., ...) = 4
+
+0.400 getsockopt(4, SOL_TCP, TCP_MAXSEG, [1100], [4]) = 0
+
+0.500 %{ assert tcpi_advmss == 1100; assert tcpi_snd_mss == 1100 }%
+
+// IW10 MSS should yield outgoing TSO packet with 10*1100 == 11000 bytes:
+0.600 write(4, ..., 12000) = 12000
+0.600 > . 1:11001(11000) ack 1
diff --git a/test/packetdrill/tests/linux/pmtu_discovery/pmtud-10pkt-1460-to-1160.pkt b/test/packetdrill/tests/linux/pmtu_discovery/pmtud-10pkt-1460-to-1160.pkt
new file mode 100644
index 0000000..df30dee
--- /dev/null
+++ b/test/packetdrill/tests/linux/pmtu_discovery/pmtud-10pkt-1460-to-1160.pkt
@@ -0,0 +1,54 @@
+// Test Path MTU discovery, RFC 1191.
+// This is a more substantive case, with 10*original_mss to send.
+// In this variant, we get an ICMP "unreachable frag_needed mtu 1200"
+// message and because the TCP sequence number is valid, TCP
+// immediately retransmits 'cwnd' packets using a smaller MSS
+// based on the MTU from the ICMP message.
+
+// Establish a connection.
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+0.100 < S 0:0(0) win 32792 <mss 1460,nop,wscale 7>
+0.100 > S. 0:0(0) ack 1 <mss 1460,nop,wscale 6>
+0.200 < . 1:1(0) ack 1 win 257
+0.200 accept(3, ..., ...) = 4
+
+// Send 10 data segments.
+0.200 write(4, ..., 14600) = 14600
+0.200 > P. 1:14601(14600) ack 1
+
+// ICMP says that the first segment was too big.
+0.250 < [1:1461(1460)] icmp unreachable frag_needed mtu 1200
+// TCP picks a packet size using the MTU from the message, and
+// retransmits 'cwnd' packets:
+0.250 > . 1:1161(1160) ack 1
+0.250 > . 1161:2321(1160) ack 1
+0.250 > . 2321:3481(1160) ack 1
+0.250 > . 3481:4641(1160) ack 1
+0.250 > . 4641:5801(1160) ack 1
+0.250 > . 5801:6961(1160) ack 1
+0.250 > . 6961:8121(1160) ack 1
+0.250 > . 8121:9281(1160) ack 1
+0.250 > . 9281:10441(1160) ack 1
+0.250 > . 10441:11601(1160) ack 1
+
+// ACKs for packets retransmitted at a smaller MSS release yet more packets...
+
+0.350 < . 1:1(0) ack 1161 win 257
+0.350 > . 11601:12761(1160) ack 1
+0.350 > . 12761:13921(1160) ack 1
+
+0.355 < . 1:1(0) ack 2321 win 257
+0.355 > P. 13921:14601(680) ack 1
+
+// Receiver ACKs all data.
+0.455 < . 1:1(0) ack 14601 win 257
+
+// Clean up.
+0.500 close(4) = 0
+0.500 > F. 14601:14601(0) ack 1
+0.600 < F. 1:1(0) ack 14602 win 257
+0.600 > . 14602:14602(0) ack 2
diff --git a/test/packetdrill/tests/linux/pmtu_discovery/pmtud-1pkt-1460-to-1160.pkt b/test/packetdrill/tests/linux/pmtu_discovery/pmtud-1pkt-1460-to-1160.pkt
new file mode 100644
index 0000000..9776f10
--- /dev/null
+++ b/test/packetdrill/tests/linux/pmtu_discovery/pmtud-1pkt-1460-to-1160.pkt
@@ -0,0 +1,36 @@
+// Test Path MTU discovery, RFC 1191.
+// This is a simple case, with one packet to send.
+// In this variant, we get an ICMP "unreachable frag_needed mtu 1200"
+// message and because the TCP sequence number is valid, TCP
+// immediately retransmits our first outstanding packet
+// with a smaller MSS based on the MTU from the ICMP message.
+
+// Establish a connection.
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+0.100 < S 0:0(0) win 32792 <mss 1460,nop,wscale 7>
+0.100 > S. 0:0(0) ack 1 <mss 1460,nop,wscale 6>
+0.200 < . 1:1(0) ack 1 win 257
+0.200 accept(3, ..., ...) = 4
+
+// Send 1 data segment.
+0.200 write(4, ..., 1460) = 1460
+0.200 > P. 1:1461(1460) ack 1
+
+// ICMP says that segment was too big.
+0.250 < [1:1461(1460)] icmp unreachable frag_needed mtu 1200
+// TCP picks a packet size using the MTU from the message, and retransmits.
+0.250 > . 1:1161(1160) ack 1
+0.250 > P. 1161:1461(300) ack 1
+
+// Receiver ACKs all data.
+0.350 < . 1:1(0) ack 1461 win 257
+
+// Clean up.
+1.300 close(4) = 0
+1.300 > F. 1461:1461(0) ack 1
+1.400 < F. 1:1(0) ack 1462 win 257
+1.400 > . 1462:1462(0) ack 2
diff --git a/test/packetdrill/tests/linux/receiver_rtt/rcv-rtt-with-timestamps-new.pkt b/test/packetdrill/tests/linux/receiver_rtt/rcv-rtt-with-timestamps-new.pkt
new file mode 100644
index 0000000..2155f21
--- /dev/null
+++ b/test/packetdrill/tests/linux/receiver_rtt/rcv-rtt-with-timestamps-new.pkt
@@ -0,0 +1,57 @@
+// Test that receiver-side RTT estimation is sane when
+// using TCP timestamps. We assert that the receive-side
+// RTT estimate is between 95 and 105ms.
+
+// Use a small receive buffer so that we advertise small windows, to keep the
+// test short.
+`sysctl -q net.ipv4.tcp_rmem="4096 10000 2097152"`
+
+// Create a socket.
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+
+// Verify that the receive buffer is the tcp_rmem default.
+0.000 getsockopt(3, SOL_SOCKET, SO_RCVBUF, [10000], [4]) = 0
+
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+// Establish a connection.
+0.100 < S 0:0(0) win 20000 <mss 1000,sackOK,TS val 100 ecr 0>
+0.100 > S. 0:0(0) ack 1 <mss 1460,sackOK,TS val 100 ecr 100>
+0.200 < . 1:1(0) ack 1 win 20000 <nop,nop,TS val 200 ecr 100>
+0.200 accept(3, ..., ...) = 4
+0.200 %{ assert tcpi_rcv_rtt == 0 }%
+
+// First flight.
+0.200 < . 1:1001(1000) ack 1 win 20000 <nop,nop,TS val 200 ecr 100>
+0.200 > . 1:1(0) ack 1001 <nop,nop,TS val 200 ecr 200>
+0.200 < . 1001:2001(1000) ack 1 win 20000 <nop,nop,TS val 200 ecr 100>
+0.200 > . 1:1(0) ack 2001 <nop,nop,TS val 200 ecr 200>
+0.200 read(4, ..., 2000) = 2000
+0.200 %{ assert tcpi_rcv_rtt >= 95*1000 and tcpi_rcv_rtt <= 105*1000 }%
+
+// Second flight.
+0.300 < . 2001:3001(1000) ack 1 win 20000 <nop,nop,TS val 300 ecr 200>
+0.300 > . 1:1(0) ack 3001 <nop,nop,TS val 300 ecr 300>
+0.300 < . 3001:4001(1000) ack 1 win 20000 <nop,nop,TS val 300 ecr 200>
+0.300 > . 1:1(0) ack 4001 <nop,nop,TS val 300 ecr 300>
+0.300 < . 4001:5001(1000) ack 1 win 20000 <nop,nop,TS val 300 ecr 200>
+0.300 > . 1:1(0) ack 5001 <nop,nop,TS val 300 ecr 300>
+0.300 < . 5001:6001(1000) ack 1 win 20000 <nop,nop,TS val 300 ecr 200>
+0.300 read(4, ..., 4000) = 4000
+0.300 > . 1:1(0) ack 6001 <nop,nop,TS val 300 ecr 300>
+0.300 %{ assert tcpi_rcv_rtt >= 95*1000 and tcpi_rcv_rtt <= 105*1000 }%
+
+// Third flight.
+// We omit outgoing ACKs because we don't care about this behavior,
+// and don't want to introduce dependencies on the receive window behavior.
+0.400 < . 6001:7001(1000) ack 1 win 20000 <nop,nop,TS val 400 ecr 300>
+0.400 < . 7001:8001(1000) ack 1 win 20000 <nop,nop,TS val 400 ecr 300>
+0.400 < . 8001:9001(1000) ack 1 win 20000 <nop,nop,TS val 400 ecr 300>
+0.400 < . 9001:10001(1000) ack 1 win 20000 <nop,nop,TS val 400 ecr 300>
+0.400 < . 10001:11001(1000) ack 1 win 20000 <nop,nop,TS val 400 ecr 300>
+0.400 read(4, ..., 5000) = 5000
+0.400 %{ assert tcpi_rcv_rtt >= 95*1000 and tcpi_rcv_rtt <= 105*1000 }%
+
+0.500 `sysctl -q net.ipv4.tcp_rmem="4096 87380 3732736"`
diff --git a/test/packetdrill/tests/linux/receiver_rtt/rcv-rtt-without-timestamps-new.pkt b/test/packetdrill/tests/linux/receiver_rtt/rcv-rtt-without-timestamps-new.pkt
new file mode 100644
index 0000000..e963993
--- /dev/null
+++ b/test/packetdrill/tests/linux/receiver_rtt/rcv-rtt-without-timestamps-new.pkt
@@ -0,0 +1,62 @@
+// Test that receiver-side RTT estimation is sane when
+// *not* using TCP timestamps. When we are not using timestamps
+// then the receive-side RTT estimation logic uses as an RTT
+// sample the time elapsed between (a) when the receiver advertises
+// that a sender may send sequence number N, and (b) when the
+// sequence number N arrives. In this (not unusual) case below,
+// this takes 1 RTT; so we assert that the receive-side
+// RTT estimate is between 95 and 105ms.
+
+// Use a small receive buffer so that we advertise small windows, to keep the
+// test short.
+`sysctl -q net.ipv4.tcp_rmem="4096 10000 2097152"`
+
+// Create a socket.
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+
+// Verify that the receive buffer is the tcp_rmem default.
+0.000 getsockopt(3, SOL_SOCKET, SO_RCVBUF, [10000], [4]) = 0
+
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+// Establish a connection.
+0.100 < S 0:0(0) win 20000 <mss 1000,nop,nop,sackOK>
+0.100 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK>
+0.200 < . 1:1(0) ack 1 win 20000
+0.200 accept(3, ..., ...) = 4
+0.200 %{ assert tcpi_rcv_rtt == 0 }%
+
+// First flight.
+0.200 < . 1:1001(1000) ack 1 win 20000
+0.200 > . 1:1(0) ack 1001
+0.200 < . 1001:2001(1000) ack 1 win 20000
+0.200 > . 1:1(0) ack 2001
+0.200 read(4, ..., 2000) = 2000
+0.200 %{ assert tcpi_rcv_rtt == 0 }%
+
+// Second flight.
+0.300 < . 2001:3001(1000) ack 1 win 20000
+0.300 > . 1:1(0) ack 3001
+0.300 < . 3001:4001(1000) ack 1 win 20000
+0.300 > . 1:1(0) ack 4001
+0.300 < . 4001:5001(1000) ack 1 win 20000
+0.300 > . 1:1(0) ack 5001
+0.300 < . 5001:6001(1000) ack 1 win 20000
+0.300 read(4, ..., 4000) = 4000
+0.300 > . 1:1(0) ack 6001
+0.300 %{ assert tcpi_rcv_rtt >= 95*1000 and tcpi_rcv_rtt <= 105*1000 }%
+
+// Third flight.
+// We omit outgoing ACKs because we don't care about this behavior,
+// and don't want to introduce dependencies on the receive window behavior.
+0.400 < . 6001:7001(1000) ack 1 win 20000
+0.400 < . 7001:8001(1000) ack 1 win 20000
+0.400 < . 8001:9001(1000) ack 1 win 20000
+0.400 < . 9001:10001(1000) ack 1 win 20000
+0.400 < . 10001:11001(1000) ack 1 win 20000
+0.400 read(4, ..., 5000) = 5000
+0.400 %{ assert tcpi_rcv_rtt >= 95*1000 and tcpi_rcv_rtt <= 105*1000 }%
+
+0.500 `sysctl -q net.ipv4.tcp_rmem="4096 87380 3732736"`
diff --git a/test/packetdrill/tests/linux/run_tests.sh b/test/packetdrill/tests/linux/run_tests.sh
new file mode 100755
index 0000000..309cd20
--- /dev/null
+++ b/test/packetdrill/tests/linux/run_tests.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+for f in `find . -name "*.pkt" | sort`; do
+ echo "Running $f ..."
+ ip tcp_metrics flush all > /dev/null 2>&1
+ ../../packetdrill $f
+done
diff --git a/test/packetdrill/tests/linux/sack/sack-shift-sacked-1-2-3-fack.pkt b/test/packetdrill/tests/linux/sack/sack-shift-sacked-1-2-3-fack.pkt
new file mode 100644
index 0000000..52b8cda
--- /dev/null
+++ b/test/packetdrill/tests/linux/sack/sack-shift-sacked-1-2-3-fack.pkt
@@ -0,0 +1,47 @@
+// Test shifting of newly-SACKed ranges onto the previous already-SACKed skb.
+// This variant SACKs segments 1, 2, and 3.
+
+// Establish a connection and send 10 MSS.
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+0.100 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+0.100 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
+0.200 < . 1:1(0) ack 1 win 1024
+0.200 accept(3, ..., ...) = 4
+
+0.200 write(4, ..., 10000) = 10000
+0.200 > P. 1:10001(10000) ack 1
+0.200 %{
+assert tcpi_reordering == 3
+assert tcpi_unacked == 10
+assert tcpi_sacked == 0
+}%
+
+0.300 < . 1:1(0) ack 1 win 257 <sack 1001:2001,nop,nop>
+0.300 %{
+assert tcpi_reordering == 3
+assert tcpi_unacked == 10
+assert tcpi_sacked == 1
+}%
+
+// This SACK for an adjacent range causes the sender to
+// shift the newly-SACKed range onto the previous skb.
+0.310 < . 1:1(0) ack 1 win 257 <sack 1001:3001,nop,nop>
+0.310 %{
+assert tcpi_reordering == 3
+assert tcpi_unacked == 10
+assert tcpi_sacked == 2
+}%
+
+// This SACK for an adjacent range causes the sender to
+// shift the newly-SACKed range onto the previous skb.
+0.320 < . 1:1(0) ack 1 win 257 <sack 1001:4001,nop,nop>
+0.320 %{
+assert tcpi_reordering == 3
+assert tcpi_unacked == 10
+assert tcpi_sacked == 3
+assert tcpi_ca_state == TCP_CA_Recovery
+}%
diff --git a/test/packetdrill/tests/linux/sack/sack-shift-sacked-1-2:6-fack.pkt b/test/packetdrill/tests/linux/sack/sack-shift-sacked-1-2:6-fack.pkt
new file mode 100644
index 0000000..464ba69
--- /dev/null
+++ b/test/packetdrill/tests/linux/sack/sack-shift-sacked-1-2:6-fack.pkt
@@ -0,0 +1,39 @@
+// Test shifting of newly-SACKed ranges onto the previous already-SACKed skb.
+// This variant receives a SACK for segment 1 and then a SACK for
+// segments 1-6, to check handling of large newly-SACKed ranges.
+
+// Establish a connection and send 10 MSS.
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+0.100 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+0.100 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
+0.200 < . 1:1(0) ack 1 win 1024
+0.200 accept(3, ..., ...) = 4
+
+0.200 write(4, ..., 10000) = 10000
+0.200 > P. 1:10001(10000) ack 1
+0.200 %{
+assert tcpi_reordering == 3
+assert tcpi_unacked == 10
+assert tcpi_sacked == 0
+}%
+
+0.300 < . 1:1(0) ack 1 win 257 <sack 1001:2001,nop,nop>
+0.300 %{
+assert tcpi_reordering == 3
+assert tcpi_unacked == 10
+assert tcpi_sacked == 1
+}%
+
+// This SACK for an adjacent range causes the sender to
+// shift the newly-SACKed range onto the previous skb.
+0.310 < . 1:1(0) ack 1 win 257 <sack 1001:7001,nop,nop>
+0.310 %{
+assert tcpi_reordering == 3
+assert tcpi_unacked == 10
+assert tcpi_sacked == 6
+assert tcpi_ca_state == TCP_CA_Recovery
+}%
diff --git a/test/packetdrill/tests/linux/shutdown/shutdown-rd-close.pkt b/test/packetdrill/tests/linux/shutdown/shutdown-rd-close.pkt
new file mode 100644
index 0000000..f5fff60
--- /dev/null
+++ b/test/packetdrill/tests/linux/shutdown/shutdown-rd-close.pkt
@@ -0,0 +1,29 @@
+// Verify behavior for the sequence:
+// shutdown(SHUT_RD), close().
+
+// Initialize a server socket.
+0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
++0 bind(3, ..., ...) = 0
++0 listen(3, 1) = 0
+
++0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
++0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
++0 < . 1:1(0) ack 1 win 257
+
++0 accept(3, ..., ...) = 4
+
++.010 shutdown(4, SHUT_RD) = 0
++0 read(4, ..., 1000) = 0
+
+// Verify that writing and sending still works.
++.010 write(4, ..., 1000) = 1000
++0 > P. 1:1001(1000) ack 1
++0 < . 1:1(0) ack 1001 win 257
+
++.010 close(4) = 0
++0 > F. 1001:1001(0) ack 1
++0 < . 1:1(0) ack 1002 win 257
+
++.010 < F. 1:1(0) ack 1002 win 257
++0 > . 1002:1002(0) ack 2
diff --git a/test/packetdrill/tests/linux/shutdown/shutdown-rd-wr-close.pkt b/test/packetdrill/tests/linux/shutdown/shutdown-rd-wr-close.pkt
new file mode 100644
index 0000000..5b97fad
--- /dev/null
+++ b/test/packetdrill/tests/linux/shutdown/shutdown-rd-wr-close.pkt
@@ -0,0 +1,45 @@
+// Verify behavior for the sequence:
+// shutdown(SHUT_RD), receive, send, shutdown(SHUT_WR), close().
+
+// Initialize a server socket.
+0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
++0 bind(3, ..., ...) = 0
++0 listen(3, 1) = 0
+
++0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
++0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
++0 < . 1:1(0) ack 1 win 257
+
++0 accept(3, ..., ...) = 4
+
++.010 shutdown(4, SHUT_RD) = 0
+
++0 read(4, ..., 1000) = 0
+
+// You would think that after SHUT_RD we would respond to incoming
+// data with a RST and not queue the data for reading, but we actually
+// ACK the data, enqueue it for reading, and can read() the data.
+// AFAICT in 2003 Andi Kleen seems to have decided that this case is too
+// obscure to slow down the fast path for receiving and reading data:
+// http://marc.info/?l=linux-netdev&m=105774722214242&w=2
+// So....
+// Verify that receiving and reading still works.
++0 < . 1:1001(1000) ack 1 win 257
++0 > . 1:1(0) ack 1001
++0 read(4, ..., 1000) = 1000
+
+// Verify that writing and sending still works.
++.010 write(4, ..., 1000) = 1000
++0 > P. 1:1001(1000) ack 1001
++0 < . 1001:1001(0) ack 1001 win 257
+
++.010 shutdown(4, SHUT_WR) = 0
++0 > F. 1001:1001(0) ack 1001
++0 < . 1001:1001(0) ack 1002 win 257
++0 write(4, ..., 1000) = -1 EPIPE (Broken pipe)
+
++.010 close(4) = 0
+
++.010 < F. 1001:1001(0) ack 1002 win 257
++0 > . 1002:1002(0) ack 1002
diff --git a/test/packetdrill/tests/linux/shutdown/shutdown-rdwr-close.pkt b/test/packetdrill/tests/linux/shutdown/shutdown-rdwr-close.pkt
new file mode 100644
index 0000000..cb55c3b
--- /dev/null
+++ b/test/packetdrill/tests/linux/shutdown/shutdown-rdwr-close.pkt
@@ -0,0 +1,26 @@
+// Verify behavior for the sequence:
+// shutdown(SHUT_RDWR), close().
+
+// Initialize a server socket.
+0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
++0 bind(3, ..., ...) = 0
++0 listen(3, 1) = 0
+
++0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
++0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
++0 < . 1:1(0) ack 1 win 257
+
++0 accept(3, ..., ...) = 4
+
++.010 shutdown(4, SHUT_RDWR) = 0
++0 > F. 1:1(0) ack 1
++0 < . 1:1(0) ack 2 win 257
+
++0 read(4, ..., 1000) = 0
++0 write(4, ..., 1000) = -1 EPIPE (Broken pipe)
+
++.010 close(4) = 0
+
++.010 < F. 1:1(0) ack 2 win 257
++0 > . 2:2(0) ack 2
diff --git a/test/packetdrill/tests/linux/shutdown/shutdown-wr-close.pkt b/test/packetdrill/tests/linux/shutdown/shutdown-wr-close.pkt
new file mode 100644
index 0000000..c840f84
--- /dev/null
+++ b/test/packetdrill/tests/linux/shutdown/shutdown-wr-close.pkt
@@ -0,0 +1,29 @@
+// Verify behavior for the sequence:
+// shutdown(SHUT_WR), close().
+
+// Initialize a server socket.
+0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
++0 bind(3, ..., ...) = 0
++0 listen(3, 1) = 0
+
++0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
++0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
++0 < . 1:1(0) ack 1 win 257
+
++0 accept(3, ..., ...) = 4
+
++.010 shutdown(4, SHUT_WR) = 0
++0 > F. 1:1(0) ack 1
++0 < . 1:1(0) ack 2 win 257
++0 write(4, ..., 1000) = -1 EPIPE (Broken pipe)
+
+// Verify that receiving and reading still works.
++.010 < . 1:1001(1000) ack 2 win 257
++0 > . 2:2(0) ack 1001
++0 read(4, ..., 1000) = 1000
+
++.010 close(4) = 0
+
++.010 < F. 1001:1001(0) ack 2 win 257
++0 > . 2:2(0) ack 1002
diff --git a/test/packetdrill/tests/linux/undo/undo-fr-ack-then-dsack-on-ack-below-snd_una.pkt b/test/packetdrill/tests/linux/undo/undo-fr-ack-then-dsack-on-ack-below-snd_una.pkt
new file mode 100644
index 0000000..b3347f9
--- /dev/null
+++ b/test/packetdrill/tests/linux/undo/undo-fr-ack-then-dsack-on-ack-below-snd_una.pkt
@@ -0,0 +1,55 @@
+// Test fast recovery and undo: send 10 MSS, get 3 dupacks, do a
+// fast retransmit, get a DSACK for the retransmitted segment, and
+// undo the cwnd reduction.
+// Assumes initial cwnd is 10. Receiver supports SACK.
+//
+// In this variant there is reordering in the return path,
+// so that we end up getting an ACK below snd_una that
+// has the critical DSACK that tells us we need to undo.
+
+// Establish a connection.
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+0.100 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+0.100 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
+0.200 < . 1:1(0) ack 1 win 257
+0.200 accept(3, ..., ...) = 4
+
+// Send 10 MSS.
+0.200 write(4, ..., 10000) = 10000
+0.200 > P. 1:10001(10000) ack 1
+
+// Get 3 dupacks.
+0.300 < . 1:1(0) ack 1 win 257 <sack 1001:2001,nop,nop>
+0.300 < . 1:1(0) ack 1 win 257 <sack 1001:3001,nop,nop>
+0.300 < . 1:1(0) ack 1 win 257 <sack 1001:4001,nop,nop>
+// We've received 3 duplicate ACKs, so we do a fast retransmit.
+0.300 > . 1:1001(1000) ack 1
+// Apparently just reordering; receiver ACKs all data. Retransmit was spurious.
+0.300 < . 1:1(0) ack 4001 win 257
+0.300 < . 1:1(0) ack 6001 win 257
+0.300 < . 1:1(0) ack 8001 win 257
+0.300 < . 1:1(0) ack 10001 win 257
+
+// We send some more new data so we can have an ACK that races our DSACK.
+0.303 write(4, ..., 1000) = 1000
+0.303 > P. 10001:11001(1000) ack 1
+
+// Receiver ACKs all outstanding data.
+0.400 < . 1:1(0) ack 11001 win 257
+
+// Oops; there was reordering in the ACK path!
+// Now we get the DSACK for the retransmitted packet.
+// It's a DSACK on an ack below snd_una.
+0.401 < . 1:1(0) ack 10001 win 257 <sack 1:1001,nop,nop>
+
+// Verify that the DSACK caused an undo, restoring cwnd to 10.
+0.450 write(4, ..., 11000) = 11000
+0.450 > . 11001:21001(10000) ack 1
+0.450 %{
+assert tcpi_snd_cwnd == 10
+assert tcpi_unacked == 10
+}%
diff --git a/test/packetdrill/tests/linux/undo/undo-fr-acks-dropped-then-dsack.pkt b/test/packetdrill/tests/linux/undo/undo-fr-acks-dropped-then-dsack.pkt
new file mode 100644
index 0000000..f39ddc8
--- /dev/null
+++ b/test/packetdrill/tests/linux/undo/undo-fr-acks-dropped-then-dsack.pkt
@@ -0,0 +1,44 @@
+// Test fast recovery and undo: send 10 MSS, get 3 dupacks, do a
+// fast retransmit, get a DSACK for the retransmitted segment, and
+// undo the cwnd reduction.
+// Assumes initial cwnd is 10. Receiver supports SACK.
+//
+// In this variant the original ACKs are lost, and all the sender gets is
+// a DSACK.
+
+// Establish a connection.
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+0.100 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+0.100 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
+0.200 < . 1:1(0) ack 1 win 257
+0.200 accept(3, ..., ...) = 4
+
+// Send 10 MSS.
+0.200 write(4, ..., 10000) = 10000
+0.200 > P. 1:10001(10000) ack 1
+
+// Get 3 dupacks.
+0.300 < . 1:1(0) ack 1 win 257 <sack 1001:2001,nop,nop>
+0.300 < . 1:1(0) ack 1 win 257 <sack 1001:3001,nop,nop>
+0.300 < . 1:1(0) ack 1 win 257 <sack 1001:4001,nop,nop>
+// We've received 3 duplicate ACKs, so we do a fast retransmit.
+0.300 > . 1:1001(1000) ack 1
+0.300 %{ assert tcpi_snd_cwnd == 7 }%
+
+// Apparently just reordering. Retransmit was spurious.
+// Original ACKs for sequence ranges up to 10001 are all lost.
+
+// Receiver sends DSACK for retransmitted packet.
+0.400 < . 1:1(0) ack 10001 win 257 <sack 1:1001,nop,nop>
+
+// Verify that the DSACK caused an undo, restoring cwnd to 10.
+0.400 write(4, ..., 11000) = 11000
+0.400 > . 10001:20001(10000) ack 1
+0.400 %{
+assert tcpi_snd_cwnd == 10
+assert tcpi_unacked == 10
+}%
diff --git a/test/packetdrill/tests/tldk/delay_ack/delay-ack-tldk.pkt b/test/packetdrill/tests/tldk/delay_ack/delay-ack-tldk.pkt
new file mode 100644
index 0000000..17889b7
--- /dev/null
+++ b/test/packetdrill/tests/tldk/delay_ack/delay-ack-tldk.pkt
@@ -0,0 +1,26 @@
+// Test delay ack and quick ack
+
+// Establish a connection.
+0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+
++0 bind(3, ..., ...) = 0
++0 listen(3, 1) = 0
+
++0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
++0 > S. 0:0(0) ack 1 <...>
+
++.1 < . 1:1(0) ack 1 win 32792
++0 accept(3, ..., ...) = 4
+
+//receive 1 pkt and ack immediately (quickack)
++0 < . 1:1001(1000) ack 1 win 32792
++0 > . 1:1(0) ack 1001
+
+//receive 1 pkt and delayack
++0 < . 1001:2001(1000) ack 1 win 32792
++0.1 ~ +0.2 > . 1:1(0) ack 2001
+
+//receive 1 pkt and ack immediately (quickack)
++0 < . 2001:3001(1000) ack 1 win 32792
++0 > . 1:1(0) ack 3001
diff --git a/test/packetdrill/tests/tldk/fast_retransmit/fr-4pkt-tldk.pkt b/test/packetdrill/tests/tldk/fast_retransmit/fr-4pkt-tldk.pkt
new file mode 100644
index 0000000..f665547
--- /dev/null
+++ b/test/packetdrill/tests/tldk/fast_retransmit/fr-4pkt-tldk.pkt
@@ -0,0 +1,35 @@
+// Test fast retransmit with 4 packets outstanding, receiver sending SACKs.
+// In this variant the receiver supports SACK.
+
+// Establish a connection.
+0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+
++0 bind(3, ..., ...) = 0
++0 listen(3, 1) = 0
+
++0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
++0 > S. 0:0(0) ack 1 <...>
+
++.1 < . 1:1(0) ack 1 win 32792
++0 accept(3, ..., ...) = 4
+
+// Send 1 data segment and get an ACK, so cwnd is now 4.
++0 write(4, ..., 1000) = 1000
++0 > . 1:1001(1000) ack 1
+
++.1 < . 1:1(0) ack 1001 win 32792
+
+// Write 4 data segments.
++0 write(4, ..., 4000) = 4000
++0 > . 1001:5001(4000) ack 1
+
+// Get 3 SACKs.
++.1 < . 1:1(0) ack 1001 win 32792 <sack 2001:3001,nop,nop>
++0 < . 1:1(0) ack 1001 win 32792 <sack 2001:4001,nop,nop>
++0 < . 1:1(0) ack 1001 win 32792 <sack 2001:5001,nop,nop>
+// We've received 3 duplicate ACKs, so we do a fast retransmit.
++0 > . 1001:3049(2048) ack 1
+
+// Receiver ACKs all data.
++.1 < . 1:1(0) ack 6001 win 32792
diff --git a/test/packetdrill/tests/tldk/keep_alive/keep-alive-after-accept-tldk.pkt b/test/packetdrill/tests/tldk/keep_alive/keep-alive-after-accept-tldk.pkt
new file mode 100644
index 0000000..09ff2cb
--- /dev/null
+++ b/test/packetdrill/tests/tldk/keep_alive/keep-alive-after-accept-tldk.pkt
@@ -0,0 +1,50 @@
+// Test set keepalive after accept
+
+// Establish a connection.
+0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+
++0 bind(3, ..., ...) = 0
++0 listen(3, 1) = 0
+
++0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
++0 > S. 0:0(0) ack 1 <...>
+
++.1 < . 1:1(0) ack 1 win 32792
++0 accept(3, ..., ...) = 4
+
+// enable keepalive and set args
++0 setsockopt(4, SOL_SOCKET, SO_KEEPALIVE, [1], 4) = 0
+// extra ack, it's not needed, fix it later
++0 ~ +100 > . 1:1(0) ack 1
+
++0 setsockopt(4, IPPROTO_TCP, TCP_KEEPIDLE, [5], 4) = 0
+// extra ack, it's not needed, fix it later
++0 ~ +100 > . 1:1(0) ack 1
+
++0 setsockopt(4, IPPROTO_TCP, TCP_KEEPINTVL, [2], 4) = 0
+// extra ack, it's not needed, fix it later
++0 ~ +100 > . 1:1(0) ack 1
+
++0 setsockopt(4, IPPROTO_TCP, TCP_KEEPCNT, [2], 4) = 0
+
+// first keep-alive pkt, idle = 5
++4 ~ +6 > . 0:0(0) ack 1
+
+// second keep-alive pkt, interval = 2
++1 ~ +3 > . 0:0(0) ack 1
+
+// get response, idle = 5
++0 < . 1:1(0) ack 1 win 32792
+
+// first keep-alive pkt again, idle = 5
++4 ~ +6 > . 0:0(0) ack 1
+
+// second keep-alive pkt, interval = 2
++1 ~ +3 > . 0:0(0) ack 1
+
+// get no response, send rst
++1 ~ +3 > R. 1:1(0) ack 1
+
++1 read(4, ..., 10) = -1 ETIMEDOUT (Connection timed out)
+0 close(4) = 0
\ No newline at end of file
diff --git a/test/packetdrill/tests/tldk/keep_alive/keep-alive-before-connect-tldk.pkt b/test/packetdrill/tests/tldk/keep_alive/keep-alive-before-connect-tldk.pkt
new file mode 100644
index 0000000..1e68f97
--- /dev/null
+++ b/test/packetdrill/tests/tldk/keep_alive/keep-alive-before-connect-tldk.pkt
@@ -0,0 +1,37 @@
+// Test set keepalive before connect
+
+// enable keepalive and set args
+0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_SOCKET, SO_KEEPALIVE, [1], 4) = 0
++0 setsockopt(3, IPPROTO_TCP, TCP_KEEPIDLE, [5], 4) = 0
++0 setsockopt(3, IPPROTO_TCP, TCP_KEEPINTVL, [2], 4) = 0
++0 setsockopt(3, IPPROTO_TCP, TCP_KEEPCNT, [2], 4) = 0
+
+// Establish a connection.
++0 fcntl(3, F_GETFL) = 0x2 (flags O_RDWR)
++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
++0 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress)
++0 > S 0:0(0) <...>
++0 < S. 0:0(0) ack 1 win 32792 <mss 1460>
++0 > . 1:1(0) ack 1 <...>
+
+// first keep-alive pkt, idle = 5
++4 ~ +6 > . 0:0(0) ack 1
+
+// second keep-alive pkt, interval = 2
++1 ~ +3 > . 0:0(0) ack 1
+
+// get response, idle = 5
++0 < . 1:1(0) ack 1 win 32792
+
+// first keep-alive pkt again, idle = 5
++4 ~ +6 > . 0:0(0) ack 1
+
+// second keep-alive pkt, interval = 2
++1 ~ +3 > . 0:0(0) ack 1
+
+// get no response, send rst
++1 ~ +3 > R. 1:1(0) ack 1
+
++1 read(3, ..., 10) = -1 ETIMEDOUT (Connection timed out)
+0 close(3) = 0
\ No newline at end of file
diff --git a/test/packetdrill/tests/tldk/keep_alive/keep-alive-enable-disable-tldk.pkt b/test/packetdrill/tests/tldk/keep_alive/keep-alive-enable-disable-tldk.pkt
new file mode 100644
index 0000000..5c0db20
--- /dev/null
+++ b/test/packetdrill/tests/tldk/keep_alive/keep-alive-enable-disable-tldk.pkt
@@ -0,0 +1,26 @@
+// Test enabling keepalive before connect and then disabling it
+
+// enable keepalive and set args
+0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_SOCKET, SO_KEEPALIVE, [1], 4) = 0
++0 setsockopt(3, IPPROTO_TCP, TCP_KEEPIDLE, [5], 4) = 0
++0 setsockopt(3, IPPROTO_TCP, TCP_KEEPINTVL, [2], 4) = 0
++0 setsockopt(3, IPPROTO_TCP, TCP_KEEPCNT, [2], 4) = 0
+
+// Establish a connection.
++0 fcntl(3, F_GETFL) = 0x2 (flags O_RDWR)
++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
++0 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress)
++0 > S 0:0(0) <...>
++0 < S. 0:0(0) ack 1 win 32792 <mss 1460>
++0 > . 1:1(0) ack 1 <...>
+
+// first keep-alive pkt, idle = 5
++4 ~ +6 > . 0:0(0) ack 1
+
+// disable keepalive
++0 setsockopt(3, SOL_SOCKET, SO_KEEPALIVE, [0], 4) = 0
+
+// no more keep-alive pkt
++10 read(3, ..., 10) = -1 EAGAIN (Resource temporarily unavailable)
++0 close(3) = 0
diff --git a/test/packetdrill/tests/tldk/out_of_order/ofo-simple-3pkt-tldk.pkt b/test/packetdrill/tests/tldk/out_of_order/ofo-simple-3pkt-tldk.pkt
new file mode 100644
index 0000000..8629d79
--- /dev/null
+++ b/test/packetdrill/tests/tldk/out_of_order/ofo-simple-3pkt-tldk.pkt
@@ -0,0 +1,27 @@
+// Test reordering of pkts received out of order, with the receiver sending correct ACKs.
+
+// Establish a connection.
+0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+
++0 bind(3, ..., ...) = 0
++0 listen(3, 1) = 0
+
++0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
++0 > S. 0:0(0) ack 1 <...>
+
++.1 < . 1:1(0) ack 1 win 32792
++0 accept(3, ..., ...) = 4
+
+// receive 3 pkts out of order
++0 < . 1:1001(1000) ack 1 win 32792
++0 > . 1:1(0) ack 1001
++0 < . 2001:3001(1000) ack 1 win 32792
++0 > . 1:1(0) ack 1001
+
++0.1 < . 1001:2001(1000) ack 1 win 32792
+// reorder pkts and ack them
++0.1 ~ +0.2 > . 1:1(0) ack 3001
+
+// read received data
++0 read(4, ..., 3000) = 3000
diff --git a/test/packetdrill/tests/tldk/tso/tso-segment-split.pkt b/test/packetdrill/tests/tldk/tso/tso-segment-split.pkt
new file mode 100644
index 0000000..6b98323
--- /dev/null
+++ b/test/packetdrill/tests/tldk/tso/tso-segment-split.pkt
@@ -0,0 +1,63 @@
+// Test TSO segment split
+
+// Establish a connection.
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+0.100 < S 0:0(0) win 5360 <mss 536,sackOK,nop,nop,nop,wscale 7>
++0 > S. 0:0(0) ack 1 <mss 1460,wscale 10,nop>
+
++.1 < . 1:1(0) ack 1 win 5360
+0.200 accept(3, ..., ...) = 4
+
+// Send 10 MSS.
+0.200 write(4, ..., 5360) = 5360
+0.200 > . 1:5361(5360) ack 1
+
+// Ack 1st mss (partial ack), and update rcv windows to 1 mss
+0.200 < . 1:1(0) ack 537 win 536
+
+// Get 3 dupacks, and reduce rcv window to only 1 mss
+0.300 < . 1:1(0) ack 537 win 536
+0.300 < . 1:1(0) ack 537 win 536
+0.300 < . 1:1(0) ack 537 win 536
+
+// We've received 3 duplicate ACKs, so we do a fast retransmit;
+// Segment split happens here.
+0.300 > . 537:1073(536) ack 1
+
+// Ack 2nd mss (partial ack)
+0.300 < . 1:1(0) ack 1073 win 536
+
+// 3rd mss
+0.300 > . 1073:1609(536) ack 1
+
+// Now let's try an ack somewhere between (3*mss, 4*mss)
+0.303 < . 1:1(0) ack 2049 win 536
+
+// We shall send from 2049, and send another mss.
+0.303 > . 2049:2585(536) ack 1
+
+// We send some more new data, but it cannot be sent for limited window.
+0.303 write(4, ..., 5360) = 5360
+
+0.305 < . 1:1(0) ack 2585 win 5360
+
+0.306 > . 2585:5361(2776) ack 1
+0.306 < . 1:1(0) ack 5361 win 5360
+
+// FIXME: the cwnd has been reduced to 2*mss
+0.307 > . 5361:6433(1072) ack 1
+0.307 < . 1:1(0) ack 6433 win 5360
+
+// FIXME: the behavior below makes no sense
+0.307 > . 6433:7409(976) ack 1
+0.307 < . 1:1(0) ack 7409 win 5360
+
+0.307 > . 7409:9457(2048) ack 1
+0.307 < . 1:1(0) ack 9457 win 5360
+
+0.308 > . 9457:10721(1264) ack 1
+0.308 < . 1:1(0) ack 10721 win 5360
diff --git a/test/packetdrill/tun.h b/test/packetdrill/tun.h
new file mode 100644
index 0000000..6a28201
--- /dev/null
+++ b/test/packetdrill/tun.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * We cannot include the kernel's linux/if_tun.h because this tool
+ * tries to compile and work for basically any Linux/BSD kernel
+ * version. So we have our version of the TUN-related
+ * declarations we require here.
+ */
+
+/*
+ * Universal TUN/TAP device driver.
+ * Copyright (C) 1999-2000 Maxim Krasnyansky <max_mk@yahoo.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __TUN_H__
+#define __TUN_H__
+
+#include "ethernet.h"
+
+/* Read queue size */
+#define TUN_READQ_SIZE 500
+
+/* TUN device flags */
+#define TUN_TUN_DEV 0x0001
+#define TUN_TAP_DEV 0x0002
+#define TUN_TYPE_MASK 0x000f
+
+#define TUN_FASYNC 0x0010
+#define TUN_NOCHECKSUM 0x0020
+#define TUN_NO_PI 0x0040
+#define TUN_ONE_QUEUE 0x0080
+#define TUN_PERSIST 0x0100
+#define TUN_VNET_HDR 0x0200
+
+/* Ioctl defines */
+#define TUNSETNOCSUM _IOW('T', 200, int)
+#define TUNSETDEBUG _IOW('T', 201, int)
+#define TUNSETIFF _IOW('T', 202, int)
+#define TUNSETPERSIST _IOW('T', 203, int)
+#define TUNSETOWNER _IOW('T', 204, int)
+#define TUNSETLINK _IOW('T', 205, int)
+#define TUNSETGROUP _IOW('T', 206, int)
+#define TUNGETFEATURES _IOR('T', 207, unsigned int)
+#define TUNSETOFFLOAD _IOW('T', 208, unsigned int)
+#define TUNSETTXFILTER _IOW('T', 209, unsigned int)
+#define TUNGETIFF _IOR('T', 210, unsigned int)
+#define TUNGETSNDBUF _IOR('T', 211, int)
+#define TUNSETSNDBUF _IOW('T', 212, int)
+#define TUNATTACHFILTER _IOW('T', 213, struct sock_fprog)
+#define TUNDETACHFILTER _IOW('T', 214, struct sock_fprog)
+#define TUNGETVNETHDRSZ _IOR('T', 215, int)
+#define TUNSETVNETHDRSZ _IOW('T', 216, int)
+
+/* TUNSETIFF ifr flags */
+#define IFF_TUN 0x0001
+#define IFF_TAP 0x0002
+#define IFF_NO_PI 0x1000
+#define IFF_ONE_QUEUE 0x2000
+#define IFF_VNET_HDR 0x4000
+#define IFF_TUN_EXCL 0x8000
+
+/* Features for GSO (TUNSETOFFLOAD). */
+#define TUN_F_CSUM 0x01 /* You can hand me unchecksummed packets. */
+#define TUN_F_TSO4 0x02 /* I can handle TSO for IPv4 packets */
+#define TUN_F_TSO6 0x04 /* I can handle TSO for IPv6 packets */
+#define TUN_F_TSO_ECN 0x08 /* I can handle TSO with ECN bits. */
+#define TUN_F_UFO 0x10 /* I can handle UFO packets */
+
+/* Protocol info prepended to the packets (when IFF_NO_PI is not set) */
+#define TUN_PKT_STRIP 0x0001
+struct tun_pi {
+ __u16 flags;
+ __be16 proto;
+};
+
+/*
+ * Filter spec (used for SETXXFILTER ioctls)
+ * This stuff is applicable only to the TAP (Ethernet) devices.
+ * If the count is zero the filter is disabled and the driver accepts
+ * all packets (promisc mode).
+ * If the filter is enabled in order to accept broadcast packets
+ * broadcast addr must be explicitly included in the addr list.
+ */
+#define TUN_FLT_ALLMULTI 0x0001 /* Accept all multicast packets */
+struct tun_filter {
+ __u16 flags; /* TUN_FLT_ flags see above */
+ __u16 count; /* Number of addresses */
+ __u8 addr[0][ETH_ALEN];
+};
+
+#endif /* __TUN_H__ */
diff --git a/test/packetdrill/types.c b/test/packetdrill/types.c
new file mode 100644
index 0000000..2b9ba2d
--- /dev/null
+++ b/test/packetdrill/types.c
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Definitions for types and utilities used widely throughout the tool.
+ */
+
+#include "types.h"
+
+struct in_addr in4addr_any = { .s_addr = INADDR_ANY };
+
+/* Dump buffer[0..bytes) as hex into a new string at *hex; caller frees it. */
+void hex_dump(const u8 *buffer, int bytes, char **hex)
+{
+	size_t size = 0;
+	FILE *s = open_memstream(hex, &size);	/* output string */
+	int i;
+
+	assert(s != NULL);	/* NULL on failure; never fprintf() to it */
+	for (i = 0; i < bytes; ++i) {
+		if (i % 16 == 0)	/* 16 bytes per row, row starts with offset */
+			fprintf(s, "%s0x%04x: ", (i > 0) ? "\n" : "", i);
+		fprintf(s, "%02x ", buffer[i]);
+	}
+	fprintf(s, "\n");
+	fclose(s);	/* finalizes *hex */
+}
diff --git a/test/packetdrill/types.h b/test/packetdrill/types.h
new file mode 100644
index 0000000..e47c75d
--- /dev/null
+++ b/test/packetdrill/types.h
@@ -0,0 +1,207 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Declarations for types used widely throughout this tool.
+ */
+
+#ifndef __TYPES_H__
+#define __TYPES_H__
+
+/* The files that include this file need to include it before
+ * including stdio.h in order to ensure that the declaration of
+ * asprintf is visible. So our .h files attempt to follow a
+ * convention of including types.h first, before everything else.
+ */
+#define _GNU_SOURCE /* for asprintf */
+#include <limits.h>
+#include <stdio.h>
+#include <string.h>
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+
+#include "assert.h"
+#include "platforms.h"
+
+/* We use some unconventional formatting here to avoid checkpatch.pl
+ * warnings about having to use the __packed macro, which is typically
+ * only available in the kernel.
+ */
+#ifndef __packed
+#define __packed __attribute__ ((packed))
+#endif
+
+/* We use kernel-style names for standard integer types. */
+typedef unsigned char u8;
+typedef unsigned short u16;
+typedef unsigned int u32;
+typedef unsigned long long u64;
+
+typedef signed char s8;
+typedef signed short s16;
+typedef signed int s32;
+typedef signed long long s64;
+
+typedef u8 bool;
+enum bool_t {
+ false = 0,
+ true = 1,
+};
+
+#define ARRAY_SIZE(array_name) (sizeof(array_name) / sizeof(array_name[0]))
+
+/* Most functions in this codebase return one of these values to let the
+ * caller know whether there was a problem.
+ */
+enum status_t {
+ STATUS_OK = 0,
+ STATUS_ERR = -1,
+ STATUS_WARN = -2, /* a non-fatal error or warning */
+};
+
+/* The directions in which a packet may flow. */
+enum direction_t {
+ DIRECTION_INVALID,
+ DIRECTION_INBOUND, /* packet coming into the kernel under test */
+ DIRECTION_OUTBOUND, /* packet leaving the kernel under test */
+};
+
+/* Return the opposite direction (DIRECTION_INVALID has no opposite). */
+static inline enum direction_t reverse_direction(enum direction_t direction)
+{
+	if (direction == DIRECTION_INBOUND)
+		return DIRECTION_OUTBOUND;
+	if (direction == DIRECTION_OUTBOUND)
+		return DIRECTION_INBOUND;
+	assert(!"bad direction");
+	return DIRECTION_INVALID;	/* only reached if assert() does not abort */
+}
+
+/* How to treat the TOS byte of a packet. */
+enum tos_chk_t {
+ TOS_CHECK_NONE,
+ TOS_CHECK_ECN,
+ TOS_CHECK_TOS,
+};
+
+struct tos_spec {
+ enum tos_chk_t check;
+ u8 value;
+};
+
+/* IPv4 ECN treatment for a packet. */
+enum ip_ecn_t {
+ ECN_NONE,
+ ECN_ECT0,
+ ECN_ECT1,
+ ECN_CE,
+ ECN_ECT01,
+};
+
+#define TTL_CHECK_NONE 255
+
+struct ip_info {
+ struct tos_spec tos;
+ u32 flow_label;
+ u8 ttl;
+};
+
+/* Length of output buffer for inet_ntop, plus prefix length (e.g. "/128"). */
+#define ADDR_STR_LEN ((INET_ADDRSTRLEN + INET6_ADDRSTRLEN)+5)
+
+/* Flavors of IP versions we support. */
+enum ip_version_t {
+ /* Native IPv4, with AF_INET sockets and IPv4 addresses. */
+ IP_VERSION_4 = 0,
+
+ /* IPv4-Mapped IPv6 addresses: (see RFC 4291 sec. 2.5.5.2) we
+ * use AF_INET6 sockets but all connect(), bind(), and
+ * accept() calls are for IPv4 addresses mapped into IPv6
+ * address space. So all interface addresses and packets on
+ * the wire are IPv4.
+ */
+ IP_VERSION_4_MAPPED_6 = 1,
+
+ /* Native IPv6, with AF_INET6 sockets and IPv6 addresses. */
+ IP_VERSION_6 = 2,
+};
+
+extern struct in_addr in4addr_any;
+
+/* Comparing IPv4 addresses for equality in C, which has no == on structs. */
+static inline bool is_equal_ipv4(struct in_addr a, struct in_addr b)
+{
+ return a.s_addr == b.s_addr;
+}
+
+/* Port equality helper, provided so port checks read like is_equal_ipv4. */
+static inline bool is_equal_port(u16 a, u16 b)
+{
+	return b == a;
+}
+
+/* Express a microsecond count as a (possibly fractional) number of seconds. */
+static inline double usecs_to_secs(s64 usecs)
+{
+	return (double)usecs / 1000000.0;
+}
+
+/* Flatten a timeval (tv_sec + tv_usec) into a single microsecond count. */
+static inline s64 timeval_to_usecs(const struct timeval *tv)
+{
+	return (s64)tv->tv_usec + 1000000LL * (s64)tv->tv_sec;
+}
+
+/* Return a malloc-allocated hex dump of the given buffer of the given length */
+extern void hex_dump(const u8 *buffer, int bytes, char **hex);
+
+static inline bool is_valid_u8(s64 x)	/* true iff 0 <= x <= UCHAR_MAX */
+{
+	return !(x < 0 || x > UCHAR_MAX);
+}
+
+static inline bool is_valid_u16(s64 x)	/* true iff 0 <= x <= USHRT_MAX */
+{
+	return !(x < 0 || x > USHRT_MAX);
+}
+
+static inline bool is_valid_u32(s64 x)	/* true iff 0 <= x <= UINT_MAX */
+{
+	return !(x < 0 || x > UINT_MAX);
+}
+
+static inline bool is_valid_u20(s64 x)	/* true iff 0 <= x <= 0xfffff */
+{
+	return !(x < 0 || x > 0xfffff);
+}
+
+static inline s64 max(s64 a, s64 b)	/* larger of a and b */
+{
+	return (a <= b) ? b : a;
+}
+
+static inline s64 min(s64 a, s64 b)	/* smaller of a and b */
+{
+	return (b <= a) ? b : a;
+}
+
+#endif /* __TYPES_H__ */
diff --git a/test/packetdrill/uapi_linux.h b/test/packetdrill/uapi_linux.h
new file mode 100644
index 0000000..657b762
--- /dev/null
+++ b/test/packetdrill/uapi_linux.h
@@ -0,0 +1,296 @@
+/*
+ * Copyright 2018 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Our own header declarations, so we have something that's
+ * portable and somewhat more readable than a typical system header
+ * file.
+ *
+ * We cannot just include the kernel's headers because this tool tries
+ * to compile and work for basically any Linux/BSD kernel version. So
+ * we declare our own version of various network-related definitions here.
+ */
+
+#ifndef __UAPI_LINUX_H__
+#define __UAPI_LINUX_H__
+
+#include "types.h"
+
+#include <netinet/tcp.h>
+
+#ifdef linux
+
+/* From Linux include/uapi/asm-generic/socket.h: */
+
+/* Socket option and control-message numbers declared locally, so that
+ * this header does not depend on the installed kernel headers
+ * providing them.
+ * NOTE(review): unlike the CAP_* fallbacks later in this header, these
+ * are not wrapped in #ifndef, so a system header that defines any of
+ * them with a different value would cause a redefinition diagnostic --
+ * confirm this is acceptable on every supported architecture.
+ */
+#define SO_MAX_PACING_RATE 47
+
+#define SO_BPF_EXTENSIONS 48
+
+#define SO_INCOMING_CPU 49
+
+#define SO_ATTACH_BPF 50
+#define SO_DETACH_BPF SO_DETACH_FILTER
+
+#define SO_ATTACH_REUSEPORT_CBPF 51
+#define SO_ATTACH_REUSEPORT_EBPF 52
+
+#define SO_CNX_ADVICE 53
+
+#define SCM_TIMESTAMPING_OPT_STATS 54
+
+#define SO_MEMINFO 55
+
+#define SO_INCOMING_NAPI_ID 56
+
+#define SO_COOKIE 57
+
+#define SCM_TIMESTAMPING_PKTINFO 58
+
+#define SO_PEERGROUPS 59
+
+#define SO_ZEROCOPY 60
+
+/* From Linux include/uapi/linux/errqueue.h: */
+
+/* Local copy of the kernel's extended socket error record (see the
+ * header cited above). Field order and widths must not change.
+ * NOTE(review): per-field meanings below are inferred from the field
+ * names and the SO_EE_* constants; confirm against the kernel header.
+ */
+struct sock_extended_err {
+ __u32 ee_errno; /* error number */
+ __u8 ee_origin; /* one of the SO_EE_ORIGIN_* values */
+ __u8 ee_type; /* error type */
+ __u8 ee_code; /* error code, e.g. SO_EE_CODE_ZEROCOPY_COPIED */
+ __u8 ee_pad; /* padding */
+ __u32 ee_info; /* additional information */
+ __u32 ee_data; /* other data */
+};
+
+/* Values for sock_extended_err.ee_origin. TIMESTAMPING is an alias of
+ * TXSTATUS (both are 4).
+ */
+#define SO_EE_ORIGIN_NONE 0
+#define SO_EE_ORIGIN_LOCAL 1
+#define SO_EE_ORIGIN_ICMP 2
+#define SO_EE_ORIGIN_ICMP6 3
+#define SO_EE_ORIGIN_TXSTATUS 4
+#define SO_EE_ORIGIN_ZEROCOPY 5
+#define SO_EE_ORIGIN_TIMESTAMPING SO_EE_ORIGIN_TXSTATUS
+
+/* NOTE(review): presumably an ee_code value reported with
+ * SO_EE_ORIGIN_ZEROCOPY -- confirm against the kernel header.
+ */
+#define SO_EE_CODE_ZEROCOPY_COPIED 1
+
+/* Holds three timespec values.
+ * NOTE(review): presumably the payload of a timestamping control
+ * message; the meaning of each of the three slots is defined by the
+ * kernel and is not visible here.
+ */
+struct scm_timestamping {
+ struct timespec ts[3];
+};
+
+/* Points in the transmit path at which a timestamp can be taken. */
+enum {
+	/* driver passed skb to NIC, or HW */
+	SCM_TSTAMP_SND		= 0,
+	/* data entered the packet scheduler */
+	SCM_TSTAMP_SCHED	= 1,
+	/* data acknowledged by peer */
+	SCM_TSTAMP_ACK		= 2,
+};
+
+/* From Linux include/uapi/linux/net_tstamp.h: */
+
+/* Flag bits for SO_TIMESTAMPING, whose option value is an integer bit
+ * field built by OR-ing these together. Each flag owns exactly one bit.
+ */
+enum {
+	SOF_TIMESTAMPING_TX_HARDWARE	= 1 << 0,
+	SOF_TIMESTAMPING_TX_SOFTWARE	= 1 << 1,
+	SOF_TIMESTAMPING_RX_HARDWARE	= 1 << 2,
+	SOF_TIMESTAMPING_RX_SOFTWARE	= 1 << 3,
+	SOF_TIMESTAMPING_SOFTWARE	= 1 << 4,
+	SOF_TIMESTAMPING_SYS_HARDWARE	= 1 << 5,
+	SOF_TIMESTAMPING_RAW_HARDWARE	= 1 << 6,
+	SOF_TIMESTAMPING_OPT_ID		= 1 << 7,
+	SOF_TIMESTAMPING_TX_SCHED	= 1 << 8,
+	SOF_TIMESTAMPING_TX_ACK		= 1 << 9,
+	SOF_TIMESTAMPING_OPT_CMSG	= 1 << 10,
+	SOF_TIMESTAMPING_OPT_TSONLY	= 1 << 11,
+	SOF_TIMESTAMPING_OPT_STATS	= 1 << 12,
+	SOF_TIMESTAMPING_OPT_PKTINFO	= 1 << 13,
+	SOF_TIMESTAMPING_OPT_TX_SWHW	= 1 << 14,
+
+	/* Highest flag bit defined above. */
+	SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_TX_SWHW,
+
+	/* Every bit from bit 0 up to and including LAST. */
+	SOF_TIMESTAMPING_MASK = SOF_TIMESTAMPING_LAST |
+				(SOF_TIMESTAMPING_LAST - 1)
+};
+
+/* From Linux include/uapi/linux/eventpoll.h: */
+
+#include <sys/epoll.h>
+
+/* Set exclusive wakeup mode for the target file descriptor.
+ * Fallback only: used when <sys/epoll.h> did not provide the macro.
+ */
+#ifndef EPOLLEXCLUSIVE
+#define EPOLLEXCLUSIVE (1U << 28)
+#endif
+
+/* From Linux include/uapi/linux/capability.h: */
+
+#include <linux/capability.h>
+
+/* Fallback capability numbers: each CAP_* below is defined only when
+ * <linux/capability.h> (included above) did not already provide it.
+ * The numbers run consecutively from 0 (CAP_CHOWN) to 37
+ * (CAP_AUDIT_READ).
+ */
+#ifndef CAP_CHOWN
+#define CAP_CHOWN 0
+#endif
+
+#ifndef CAP_DAC_OVERRIDE
+#define CAP_DAC_OVERRIDE 1
+#endif
+
+#ifndef CAP_DAC_READ_SEARCH
+#define CAP_DAC_READ_SEARCH 2
+#endif
+
+#ifndef CAP_FOWNER
+#define CAP_FOWNER 3
+#endif
+
+#ifndef CAP_FSETID
+#define CAP_FSETID 4
+#endif
+
+#ifndef CAP_KILL
+#define CAP_KILL 5
+#endif
+
+#ifndef CAP_SETGID
+#define CAP_SETGID 6
+#endif
+
+#ifndef CAP_SETUID
+#define CAP_SETUID 7
+#endif
+
+#ifndef CAP_SETPCAP
+#define CAP_SETPCAP 8
+#endif
+
+#ifndef CAP_LINUX_IMMUTABLE
+#define CAP_LINUX_IMMUTABLE 9
+#endif
+
+#ifndef CAP_NET_BIND_SERVICE
+#define CAP_NET_BIND_SERVICE 10
+#endif
+
+#ifndef CAP_NET_BROADCAST
+#define CAP_NET_BROADCAST 11
+#endif
+
+#ifndef CAP_NET_ADMIN
+#define CAP_NET_ADMIN 12
+#endif
+
+#ifndef CAP_NET_RAW
+#define CAP_NET_RAW 13
+#endif
+
+#ifndef CAP_IPC_LOCK
+#define CAP_IPC_LOCK 14
+#endif
+
+#ifndef CAP_IPC_OWNER
+#define CAP_IPC_OWNER 15
+#endif
+
+#ifndef CAP_SYS_MODULE
+#define CAP_SYS_MODULE 16
+#endif
+
+#ifndef CAP_SYS_RAWIO
+#define CAP_SYS_RAWIO 17
+#endif
+
+#ifndef CAP_SYS_CHROOT
+#define CAP_SYS_CHROOT 18
+#endif
+
+#ifndef CAP_SYS_PTRACE
+#define CAP_SYS_PTRACE 19
+#endif
+
+#ifndef CAP_SYS_PACCT
+#define CAP_SYS_PACCT 20
+#endif
+
+#ifndef CAP_SYS_ADMIN
+#define CAP_SYS_ADMIN 21
+#endif
+
+#ifndef CAP_SYS_BOOT
+#define CAP_SYS_BOOT 22
+#endif
+
+#ifndef CAP_SYS_NICE
+#define CAP_SYS_NICE 23
+#endif
+
+#ifndef CAP_SYS_RESOURCE
+#define CAP_SYS_RESOURCE 24
+#endif
+
+#ifndef CAP_SYS_TIME
+#define CAP_SYS_TIME 25
+#endif
+
+#ifndef CAP_SYS_TTY_CONFIG
+#define CAP_SYS_TTY_CONFIG 26
+#endif
+
+#ifndef CAP_MKNOD
+#define CAP_MKNOD 27
+#endif
+
+#ifndef CAP_LEASE
+#define CAP_LEASE 28
+#endif
+
+#ifndef CAP_AUDIT_WRITE
+#define CAP_AUDIT_WRITE 29
+#endif
+
+#ifndef CAP_AUDIT_CONTROL
+#define CAP_AUDIT_CONTROL 30
+#endif
+
+#ifndef CAP_SETFCAP
+#define CAP_SETFCAP 31
+#endif
+
+#ifndef CAP_MAC_OVERRIDE
+#define CAP_MAC_OVERRIDE 32
+#endif
+
+#ifndef CAP_MAC_ADMIN
+#define CAP_MAC_ADMIN 33
+#endif
+
+#ifndef CAP_SYSLOG
+#define CAP_SYSLOG 34
+#endif
+
+#ifndef CAP_WAKE_ALARM
+#define CAP_WAKE_ALARM 35
+#endif
+
+#ifndef CAP_BLOCK_SUSPEND
+#define CAP_BLOCK_SUSPEND 36
+#endif
+
+#ifndef CAP_AUDIT_READ
+#define CAP_AUDIT_READ 37
+#endif
+
+#endif /* linux */
+
+
+#endif /* __UAPI_LINUX_H__ */
diff --git a/test/packetdrill/udp.h b/test/packetdrill/udp.h
new file mode 100644
index 0000000..e938930
--- /dev/null
+++ b/test/packetdrill/udp.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Our own UDP header declarations, so we have something that's
+ * portable and somewhat more readable than a typical system header
+ * file.
+ *
+ * We cannot include the kernel's linux/udp.h because this tool tries
+ * to compile and work for basically any Linux/BSD kernel version. So
+ * we declare our own version of various UDP-related definitions here.
+ */
+
+#ifndef __UDP_HEADERS_H__
+#define __UDP_HEADERS_H__
+
+#include "types.h"
+
+/* UDP header. See RFC 768.
+ * The port and length fields are declared __be16, i.e. they hold
+ * big-endian (network byte order) values.
+ */
+struct udp {
+ __be16 src_port; /* source port */
+ __be16 dst_port; /* destination port */
+ __be16 len; /* UDP length in bytes, includes UDP header */
+ __sum16 check; /* UDP checksum */
+};
+
+#endif /* __UDP_HEADERS_H__ */
diff --git a/test/packetdrill/udp_packet.c b/test/packetdrill/udp_packet.c
new file mode 100644
index 0000000..81c9187
--- /dev/null
+++ b/test/packetdrill/udp_packet.c
@@ -0,0 +1,91 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Implementation for module for formatting UDP packets.
+ */
+
+#include "udp_packet.h"
+
+#include "ip_packet.h"
+#include "udp.h"
+
+/* Build a packet holding an IP header (no options) followed by a UDP
+ * header and room for udp_payload_bytes of zeroed payload.
+ *
+ * Returns the new packet, or NULL with *error set to an
+ * asprintf()-allocated message when the datagram would exceed
+ * MAX_UDP_DATAGRAM_BYTES. The UDP checksum field is left as 0 here.
+ */
+struct packet *new_udp_packet(int address_family,
+			       enum direction_t direction,
+			       struct ip_info ip_info,
+			       u16 udp_payload_bytes,
+			       u16 src_port,
+			       u16 dst_port,
+			       char **error)
+{
+	/* Byte counts for every section of the packet. */
+	const int ip_option_bytes = 0;
+	const int ip_header_bytes =
+		ip_header_min_len(address_family) + ip_option_bytes;
+	const int udp_header_bytes = sizeof(struct udp);
+	const int udp_bytes = udp_header_bytes + udp_payload_bytes;
+	const int ip_bytes = ip_header_bytes + udp_bytes;
+	struct packet *packet = NULL;	/* the result */
+	struct header *udp_header = NULL;	/* UDP header bookkeeping */
+	struct udp *udp = NULL;		/* UDP header inside the buffer */
+
+	/* Validate the section lengths before allocating anything. */
+	if ((ip_option_bytes & 0x3) != 0) {
+		asprintf(error, "IP options are not padded correctly "
+			 "to ensure IP header is a multiple of 4 bytes: "
+			 "%d excess bytes", ip_option_bytes & 0x3);
+		return NULL;
+	}
+	assert((udp_header_bytes & 0x3) == 0);
+	assert((ip_header_bytes & 0x3) == 0);
+
+	if (ip_bytes > MAX_UDP_DATAGRAM_BYTES) {
+		asprintf(error, "UDP datagram too large");
+		return NULL;
+	}
+
+	/* Allocate the packet and clear its whole buffer. */
+	packet = packet_new(ip_bytes);
+	memset(packet->buffer, 0, ip_bytes);
+
+	packet->direction = direction;
+	packet->flags = 0;
+	packet->tos_chk = ip_info.tos.check;
+
+	/* Fill in the IP header first... */
+	set_packet_ip_header(packet, address_family, ip_bytes,
+			     ip_info.tos.value, ip_info.flow_label,
+			     ip_info.ttl, IPPROTO_UDP);
+
+	/* ...then register the UDP header that follows it. */
+	udp_header = packet_append_header(packet, HEADER_UDP,
+					  sizeof(struct udp));
+	udp_header->total_bytes = udp_bytes;
+
+	/* The UDP header sits right after the IP header in the buffer. */
+	udp = (struct udp *) (ip_start(packet) + ip_header_bytes);
+	packet->udp = udp;
+
+	udp->src_port = htons(src_port);
+	udp->dst_port = htons(dst_port);
+	udp->len = htons(udp_bytes);
+	udp->check = 0;
+
+	packet->ip_bytes = ip_bytes;
+	return packet;
+}
diff --git a/test/packetdrill/udp_packet.h b/test/packetdrill/udp_packet.h
new file mode 100644
index 0000000..9ace440
--- /dev/null
+++ b/test/packetdrill/udp_packet.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Interface for module for formatting UDP packets.
+ */
+
+#ifndef __UDP_PACKET_H__
+#define __UDP_PACKET_H__
+
+#include "types.h"
+
+#include "packet.h"
+
+/* Create and initialize a new struct packet containing a UDP datagram
+ * with the given ports and room for 'udp_payload_bytes' of payload.
+ * On success, returns a newly-allocated packet. On failure, returns NULL
+ * and fills in *error with an error message.
+ */
+extern struct packet *new_udp_packet(int address_family,
+ enum direction_t direction,
+ struct ip_info ip_info,
+ u16 udp_payload_bytes,
+ u16 src_port,
+ u16 dst_port,
+ char **error);
+#endif /* __UDP_PACKET_H__ */
diff --git a/test/packetdrill/unaligned.h b/test/packetdrill/unaligned.h
new file mode 100644
index 0000000..f44ee9e
--- /dev/null
+++ b/test/packetdrill/unaligned.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Helpers for portably safe access to unaligned multi-byte values.
+ */
+
+#ifndef __UNALIGNED_H__
+#define __UNALIGNED_H__
+
+#include "types.h"
+
+/* Assemble a big-endian 32-bit value from the four bytes at p, which
+ * need not be aligned. Each byte is widened to u32 before shifting:
+ * a bare u8 promotes to signed int, and shifting a byte >= 0x80 left
+ * by 24 would overflow that int (undefined behavior).
+ */
+static inline u32 __get_unaligned_be32(const u8 *p)
+{
+	return ((u32)p[0] << 24) |
+	       ((u32)p[1] << 16) |
+	       ((u32)p[2] << 8) |
+	       (u32)p[3];
+}
+
+/* Store val at p as four bytes, most significant first; p need not be
+ * aligned. Each assignment keeps only the low 8 bits of its operand.
+ */
+static inline void __put_unaligned_be32(u32 val, u8 *p)
+{
+	p[0] = val >> 24;
+	p[1] = val >> 16;
+	p[2] = val >> 8;
+	p[3] = val;
+}
+
+/* Read a big-endian 32-bit value from an arbitrary, possibly
+ * unaligned, address.
+ */
+static inline u32 get_unaligned_be32(const void *p)
+{
+	const u8 *bytes = (const u8 *)p;
+
+	return __get_unaligned_be32(bytes);
+}
+
+/* Write val in big-endian order to an arbitrary, possibly unaligned,
+ * address.
+ */
+static inline void put_unaligned_be32(u32 val, void *p)
+{
+	u8 *bytes = p;
+
+	__put_unaligned_be32(val, bytes);
+}
+
+#endif /* __UNALIGNED_H__ */
diff --git a/test/packetdrill/wire_client.c b/test/packetdrill/wire_client.c
new file mode 100644
index 0000000..33ebbd6
--- /dev/null
+++ b/test/packetdrill/wire_client.c
@@ -0,0 +1,302 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Client-side code for remote on-the-wire testing using a real NIC.
+ */
+
+#include "wire_client.h"
+
+#include "config.h"
+#include "link_layer.h"
+#include "script.h"
+#include "run.h"
+
+/* Allocate a zero-filled wire_client; returns whatever calloc()
+ * returns, so the result is NULL if the allocation fails.
+ */
+struct wire_client *wire_client_new(void)
+{
+	struct wire_client *wire_client = calloc(1, sizeof(*wire_client));
+
+	return wire_client;
+}
+
+/* Release a wire_client together with its connection, if it has one.
+ * wire_client itself must not be NULL.
+ */
+void wire_client_free(struct wire_client *wire_client)
+{
+	struct wire_conn *conn = wire_client->wire_conn;
+
+	if (conn != NULL)
+		wire_conn_free(conn);
+
+	/* Scrub the object before freeing it, to help catch bugs. */
+	memset(wire_client, 0, sizeof(struct wire_client));
+	free(wire_client);
+}
+
+/* Report a failure on the connection to the wire server through
+ * die(). The wire_client argument is currently not used.
+ */
+static void wire_client_die(struct wire_client *wire_client,
+			    const char *message)
+{
+	(void)wire_client;
+
+	die("error in TCP connection to wire server: %s\n", message);
+}
+
+/* Pack the NULL-terminated argv vector into one calloc()-allocated
+ * buffer in which every kept argument is followed by its own '\0'.
+ * Any argument containing the substring "-wire_client" is left out,
+ * since we don't want to give the server an identity crisis.
+ *
+ * On return *args_ptr is the buffer (the caller frees it) and
+ * *args_len_ptr is its length in bytes, terminators included.
+ */
+static void wire_client_serialize_argv(const char **argv, char **args_ptr,
+				       int *args_len_ptr)
+{
+	const char **arg;
+	char *args = NULL;
+	char *cursor = NULL;
+	int total = 0;
+
+	/* Pass 1: add up the space needed by the arguments we keep. */
+	for (arg = argv; *arg != NULL; arg++) {
+		if (strstr(*arg, "-wire_client") == NULL)
+			total += strlen(*arg) + 1;	/* + 1 for '\0' */
+	}
+
+	args = calloc(total, 1);
+	cursor = args;
+
+	/* Pass 2: copy each kept argument, terminator included. */
+	for (arg = argv; *arg != NULL; arg++) {
+		if (strstr(*arg, "-wire_client") == NULL) {
+			const int len = strlen(*arg) + 1;
+
+			memcpy(cursor, *arg, len);
+			cursor += len;
+		}
+	}
+
+	/* Both passes must agree on the total size. */
+	assert(cursor == args + total);
+
+	*args_ptr = args;
+	*args_len_ptr = total;
+}
+
+/* Serialize config->argv and ship it to the server as one
+ * WIRE_COMMAND_LINE_ARGS message; a failed write is reported through
+ * wire_client_die(). The temporary buffer is freed afterwards.
+ */
+static void wire_client_send_args(struct wire_client *wire_client,
+				  const struct config *config)
+{
+	char *serialized = NULL;
+	int serialized_len = 0;
+
+	wire_client_serialize_argv(config->argv, &serialized, &serialized_len);
+
+	if (wire_conn_write(wire_client->wire_conn, WIRE_COMMAND_LINE_ARGS,
+			    serialized, serialized_len) != 0) {
+		wire_client_die(wire_client,
+				"error sending WIRE_COMMAND_LINE_ARGS");
+	}
+
+	free(serialized);
+}
+
+/* Send config->script_path in a WIRE_SCRIPT_PATH message. The payload
+ * is strlen() bytes long, i.e. it excludes the terminating '\0'.
+ */
+static void wire_client_send_script_path(struct wire_client *wire_client,
+					 const struct config *config)
+{
+	if (!wire_conn_write(wire_client->wire_conn, WIRE_SCRIPT_PATH,
+			     config->script_path,
+			     strlen(config->script_path)))
+		return;
+
+	wire_client_die(wire_client, "error sending WIRE_SCRIPT_PATH");
+}
+
+/* Send the script text (script->buffer, script->length bytes) in a
+ * WIRE_SCRIPT message.
+ */
+static void wire_client_send_script(struct wire_client *wire_client,
+				    const struct script *script)
+{
+	if (!wire_conn_write(wire_client->wire_conn, WIRE_SCRIPT,
+			     script->buffer, script->length))
+		return;
+
+	wire_client_die(wire_client, "error sending WIRE_SCRIPT");
+}
+
+/* Send our ethernet address, to which the server should send packets,
+ * in a WIRE_HARDWARE_ADDR message. The config argument is not used.
+ */
+static void wire_client_send_hw_address(struct wire_client *wire_client,
+					const struct config *config)
+{
+	if (!wire_conn_write(wire_client->wire_conn, WIRE_HARDWARE_ADDR,
+			     &wire_client->client_ether_addr,
+			     sizeof(wire_client->client_ether_addr)))
+		return;
+
+	wire_client_die(wire_client, "error sending WIRE_HARDWARE_ADDR");
+}
+
+/* Wait for the server's WIRE_SERVER_READY message, which must carry an
+ * empty payload. A read error, a different opcode, or a non-empty
+ * payload is reported through wire_client_die().
+ */
+static void wire_client_receive_server_ready(struct wire_client *wire_client)
+{
+	int payload_len = -1;
+	void *payload = NULL;
+	enum wire_op_t op = WIRE_INVALID;
+
+	if (wire_conn_read(wire_client->wire_conn,
+			   &op, &payload, &payload_len) != 0)
+		wire_client_die(wire_client, "error reading WIRE_SERVER_READY");
+
+	if (op != WIRE_SERVER_READY)
+		wire_client_die(wire_client,
+				"bad wire server: expected WIRE_SERVER_READY");
+
+	if (payload_len != 0)
+		wire_client_die(wire_client,
+				"bad wire server: bad WIRE_SERVER_READY len");
+}
+
+/* Announce to the server that the client is about to start executing
+ * the script, using an empty WIRE_CLIENT_STARTING message.
+ */
+void wire_client_send_client_starting(struct wire_client *wire_client)
+{
+	if (!wire_conn_write(wire_client->wire_conn, WIRE_CLIENT_STARTING,
+			     NULL, 0))
+		return;
+
+	wire_client_die(wire_client, "error sending WIRE_CLIENT_STARTING");
+}
+
+/* Ask the server to start executing packet events. The message
+ * carries our current event count, converted with htonl().
+ */
+static void wire_client_send_packets_start(struct wire_client *wire_client)
+{
+	struct wire_packets_start start;
+
+	start.num_events = htonl(wire_client->num_events);
+
+	if (!wire_conn_write(wire_client->wire_conn, WIRE_PACKETS_START,
+			     &start, sizeof(start)))
+		return;
+
+	wire_client_die(wire_client, "error sending WIRE_PACKETS_START");
+}
+
+/* Receive a message from the server that the server is done executing
+ * some packet events. Print any warnings we receive along the way.
+ *
+ * Reads messages until a WIRE_PACKETS_DONE arrives, echoing the
+ * payload of every WIRE_PACKETS_WARN to stderr. The DONE payload must
+ * be a struct wire_packets_done followed by a '\0'-terminated string.
+ * A read error, an unexpected opcode, or a malformed DONE message is
+ * reported through wire_client_die(); a STATUS_ERR result is reported
+ * through die() with the server's own message; an event count that
+ * differs from wire_client->num_events is reported as a bad server.
+ */
+static void wire_client_receive_packets_done(struct wire_client *wire_client)
+{
+	enum wire_op_t op = WIRE_INVALID;
+	struct wire_packets_done done;
+	void *buf = NULL;
+	int buf_len = -1;
+
+	DEBUGP("wire_client_receive_packets_done\n");
+
+	while (1) {
+		if (wire_conn_read(wire_client->wire_conn,
+				   &op, &buf, &buf_len))
+			wire_client_die(wire_client, "error reading");
+		if (op == WIRE_PACKETS_DONE)
+			break;
+		else if (op == WIRE_PACKETS_WARN) {
+			/* The warning is not necessarily NUL-terminated:
+			 * print at most buf_len bytes of it. This needs
+			 * no temporary copy, so it cannot fail on
+			 * allocation.
+			 */
+			if (buf != NULL && buf_len > 0)
+				fprintf(stderr, "%.*s",
+					buf_len, (const char *)buf);
+		} else {
+			wire_client_die(
+				wire_client,
+				"bad wire server: expected "
+				"WIRE_PACKETS_DONE or WIRE_PACKETS_WARN");
+		}
+	}
+
+	/* Compare as signed ints: promoting a bogus negative buf_len to
+	 * size_t would turn it into a huge value that passes this check.
+	 */
+	if (buf == NULL || buf_len < (int)sizeof(done) + 1) {
+		wire_client_die(wire_client,
+				"bad wire server: bad WIRE_PACKETS_DONE len");
+	}
+	if (((const char *)buf)[buf_len - 1] != '\0') {
+		wire_client_die(wire_client,
+				"bad wire server: missing string terminator");
+	}
+
+	/* Copy out the fixed part; buf may not be suitably aligned. */
+	memcpy(&done, buf, sizeof(done));
+
+	if (ntohl(done.result) == STATUS_ERR) {
+		/* Die with the error message from the server, which
+		 * is a C string following the fixed "done" message.
+		 */
+		die("%s", (const char *)buf + sizeof(done));
+	} else if (ntohl(done.num_events) != wire_client->num_events) {
+		char *msg = NULL;
+
+		/* ntohl() yields an unsigned value, hence %u. */
+		if (asprintf(&msg, "bad wire server: bad message count: "
+			     "got: %u vs expected: %d",
+			     (unsigned int)ntohl(done.num_events),
+			     wire_client->num_events) < 0)
+			msg = NULL;
+		wire_client_die(wire_client,
+				msg != NULL ? msg :
+				"bad wire server: bad message count");
+	}
+}
+
+/* Set up the session with the wire server: look up our MAC address,
+ * connect, then send our command line arguments, the script path, the
+ * script itself, and the MAC address, and finally wait for the
+ * server's ready message. Always returns STATUS_OK; failures in the
+ * message helpers are reported through wire_client_die().
+ */
+int wire_client_init(struct wire_client *wire_client,
+		     const struct config *config,
+		     const struct script *script)
+{
+	struct wire_conn *conn = NULL;
+
+	DEBUGP("wire_client_init\n");
+	assert(config->is_wire_client);
+
+	get_hw_address(config->wire_client_device,
+		       &wire_client->client_ether_addr,
+		       config->ip_version);
+
+	conn = wire_conn_new();
+	wire_client->wire_conn = conn;
+	wire_conn_connect(conn,
+			  &config->wire_server_ip,
+			  config->wire_server_port,
+			  config->ip_version);
+
+	/* Handshake: everything the server needs, in protocol order. */
+	wire_client_send_args(wire_client, config);
+	wire_client_send_script_path(wire_client, config);
+	wire_client_send_script(wire_client, script);
+	wire_client_send_hw_address(wire_client, config);
+
+	wire_client_receive_server_ready(wire_client);
+
+	return STATUS_OK;
+}
+
+
+/* Tell the wire client that the interpreter has moved on to the next
+ * event ('event' may be NULL, which is handled like a non-packet
+ * event). The server is only involved at the edges of a run of packet
+ * events:
+ *  - entering a run (this event is a packet event, the previous one
+ *    was not): ask the server to start executing packet events;
+ *  - leaving a run (the previous event was a packet event, this one
+ *    is not): collect the server's result for the run.
+ * In all other cases the server either does not care about the timing
+ * of this event, or already knows it because the previous event was
+ * an on-the-wire event too.
+ */
+void wire_client_next_event(struct wire_client *wire_client,
+			    struct event *event)
+{
+	const bool is_packet = event && (event->type == PACKET_EVENT);
+	const bool was_packet =
+		(wire_client->last_event_type == PACKET_EVENT);
+
+	if (is_packet && !was_packet) {
+		/* Tell the server to start executing packet events. */
+		wire_client_send_packets_start(wire_client);
+	} else if (!is_packet && was_packet) {
+		/* Get the result of one or more packet events. */
+		wire_client_receive_packets_done(wire_client);
+	}
+
+	if (!event)
+		return;
+
+	wire_client->last_event_type = event->type;
+	++wire_client->num_events;
+}
diff --git a/test/packetdrill/wire_client.h b/test/packetdrill/wire_client.h
new file mode 100644
index 0000000..5b3b653
--- /dev/null
+++ b/test/packetdrill/wire_client.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Client-side code for remote on-the-wire testing using a real NIC.
+ */
+
+#ifndef __WIRE_CLIENT_H__
+#define __WIRE_CLIENT_H__
+
+#include "types.h"
+
+#include "ethernet.h"
+#include "script.h"
+#include "wire_protocol.h"
+#include "wire_conn.h"
+
+struct config;
+struct state;
+
+/* Internal private state for the wire client. Allocate it with
+ * wire_client_new() and release it with wire_client_free().
+ */
+struct wire_client {
+ struct wire_conn *wire_conn; /* connection to wire server */
+
+ struct ether_addr client_ether_addr; /* wire client hardware addr */
+
+ enum event_t last_event_type; /* type of previous event */
+ int num_events; /* events executed so far */
+};
+
+/* Allocate a new, zero-filled wire_client. */
+struct wire_client *wire_client_new(void);
+
+/* Initiate remote on-the-wire testing using a real NIC: connect to the
+ * wire server and send it our command line arguments, the script, and
+ * our hardware address.
+ */
+extern int wire_client_init(struct wire_client *wire_client,
+ const struct config *config,
+ const struct script *script);
+
+/* Delete a wire_client and its associated objects. */
+extern void wire_client_free(struct wire_client *wire_client);
+
+/* Send a message that the client is starting now. */
+extern void wire_client_send_client_starting(struct wire_client *wire_client);
+
+/* Tell the client state machine that the script interpreter has moved
+ * on to the next event, and is about to wait for and execute the
+ * given event. 'event' may be NULL.
+ */
+extern void wire_client_next_event(struct wire_client *wire_client,
+ struct event *event);
+
+#endif /* __WIRE_CLIENT_H__ */
diff --git a/test/packetdrill/wire_client_netdev.c b/test/packetdrill/wire_client_netdev.c
new file mode 100644
index 0000000..ce4ebef
--- /dev/null
+++ b/test/packetdrill/wire_client_netdev.c
@@ -0,0 +1,167 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Client-side network device code for remote on-the-wire testing
+ * using a real NIC.
+ */
+
+#include "wire_client_netdev.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "logging.h"
+#include "net_utils.h"
+
+/* Wire-client flavor of a netdev. The embedded netdev must remain the
+ * first member: this file casts between struct netdev * and
+ * struct wire_client_netdev *.
+ */
+struct wire_client_netdev {
+ struct netdev netdev; /* "inherit" from netdev */
+
+ char *name; /* malloc-allocated copy of interface name */
+};
+
+/* Forward declaration; the table is initialized at the end of this file. */
+struct netdev_ops wire_client_netdev_ops;
+
+/* View a generic netdev as the wire-client netdev that contains it. */
+static inline struct wire_client_netdev *to_client_netdev(
+	struct netdev *netdev)
+{
+	struct wire_client_netdev *client =
+		(struct wire_client_netdev *)netdev;
+
+	return client;
+}
+
+/* Refuse to run when the configured remote IP is a local address: it
+ * must really be remote so that test packets pass through our device.
+ * The netdev argument is not used.
+ */
+static void check_remote_address(struct config *config,
+				 struct wire_client_netdev *netdev)
+{
+	if (!is_ip_local(&config->live_remote_ip))
+		return;
+
+	die("error: live_remote_ip %s is not remote\n",
+	    config->live_remote_ip_string);
+}
+
+
+/* Route traffic destined for our remote IP through this device.
+ *
+ * Builds a shell command that deletes any existing route for the
+ * remote prefix and adds a new one via the configured gateway, then
+ * runs it with system(). Linux uses "ip route"; the BSDs use "route".
+ * NOTE(review): the command is assembled from config strings without
+ * any shell quoting -- presumably these come from the trusted command
+ * line of the tool; confirm before feeding it anything else.
+ * NOTE(review): on a platform matching neither #if branch, or if
+ * asprintf() fails, route_command may not hold a valid command.
+ */
+static void route_traffic_to_wire_server(struct config *config,
+ struct wire_client_netdev *netdev)
+{
+ char *route_command = NULL;
+#ifdef linux
+ /* One command for both families; "-6" selects IPv6 routing. */
+ asprintf(&route_command,
+ "ip %s route del %s > /dev/null 2>&1 ; "
+ "ip %s route add %s dev %s via %s > /dev/null 2>&1",
+ (config->wire_protocol == AF_INET6) ? "-6" : "",
+ config->live_remote_prefix_string,
+ (config->wire_protocol == AF_INET6) ? "-6" : "",
+ config->live_remote_prefix_string,
+ netdev->name,
+ config->live_gateway_ip_string);
+#endif
+#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
+ /* The BSD route commands do not name the device, only the gateway. */
+ if (config->wire_protocol == AF_INET) {
+ asprintf(&route_command,
+ "route delete %s > /dev/null 2>&1 ; "
+ "route add %s %s > /dev/null 2>&1",
+ config->live_remote_prefix_string,
+ config->live_remote_prefix_string,
+ config->live_gateway_ip_string);
+ } else if (config->wire_protocol == AF_INET6) {
+ asprintf(&route_command,
+ "route delete -inet6 %s > /dev/null 2>&1 ; "
+ "route add -inet6 %s %s > /dev/null 2>&1",
+ config->live_remote_prefix_string,
+ config->live_remote_prefix_string,
+ config->live_gateway_ip_string);
+ } else {
+ assert(!"bad wire protocol");
+ }
+#endif /* defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) */
+
+ /* We intentionally ignore failures and output to stderr,
+ * since they can happen if there is no previously existing
+ * route.
+ */
+ system(route_command);
+
+ free(route_command);
+}
+
+/* Create the wire-client netdev for config->wire_client_device: check
+ * that the remote IP is really remote, add the client's live local IP
+ * to the NIC, and route traffic for the remote prefix through it.
+ * The result is released through the .free entry of its ops table.
+ */
+struct netdev *wire_client_netdev_new(struct config *config)
+{
+	struct wire_client_netdev *client = NULL;
+
+	DEBUGP("wire_client_netdev_new\n");
+
+	client = calloc(1, sizeof(*client));
+	client->netdev.ops = &wire_client_netdev_ops;
+	client->name = strdup(config->wire_client_device);
+
+	check_remote_address(config, client);
+
+	/* Add the client live local IP to our NIC, so we can send/receive */
+	net_setup_dev_address(client->name,
+			      &config->live_local_ip,
+			      config->live_prefix_len);
+
+	route_traffic_to_wire_server(config, client);
+
+	/* The embedded netdev is the first member of the struct. */
+	return &client->netdev;
+}
+
+/* Destroy a wire-client netdev: free the interface name copy, scrub
+ * the object, and free the object itself.
+ */
+static void wire_client_netdev_free(struct netdev *a_netdev)
+{
+	struct wire_client_netdev *client = NULL;
+
+	DEBUGP("wire_client_netdev_free\n");
+
+	client = to_client_netdev(a_netdev);
+	free(client->name);
+
+	memset(client, 0, sizeof(struct wire_client_netdev)); /* paranoia */
+	free(client);
+}
+
+/* Stub: a wire client never sends packets itself, because the server
+ * side does the sending. Trips an assertion, and returns STATUS_ERR
+ * when assertions are compiled out.
+ */
+static int wire_client_netdev_send(struct netdev *a_netdev,
+				   struct packet *packet)
+{
+	(void)a_netdev;
+	(void)packet;
+
+	DEBUGP("wire_client_netdev_send\n");
+	assert(!"wire clients should not be sending packets themselves!");
+
+	return STATUS_ERR;
+}
+
+/* Stub: a wire client never receives packets itself, because the
+ * server side receives and checks them. Trips an assertion, and
+ * returns STATUS_ERR when assertions are compiled out.
+ */
+static int wire_client_netdev_receive(struct netdev *a_netdev,
+				      struct packet **packet, char **error)
+{
+	(void)a_netdev;
+	(void)packet;
+	(void)error;
+
+	DEBUGP("wire_client_netdev_receive\n");
+	assert(!"wire clients should not be receiving packets themselves!");
+
+	return STATUS_ERR;
+}
+
+/* Method table binding the generic netdev interface to the
+ * wire-client implementations in this file.
+ */
+struct netdev_ops wire_client_netdev_ops = {
+	.receive	= wire_client_netdev_receive,
+	.send		= wire_client_netdev_send,
+	.free		= wire_client_netdev_free,
+};
diff --git a/test/packetdrill/wire_client_netdev.h b/test/packetdrill/wire_client_netdev.h
new file mode 100644
index 0000000..2ec64cd
--- /dev/null
+++ b/test/packetdrill/wire_client_netdev.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Client-side network device code for remote on-the-wire testing
+ * using a real NIC.
+ */
+
+#ifndef __WIRE_CLIENT_NETDEV_H__
+#define __WIRE_CLIENT_NETDEV_H__
+
+#include "types.h"
+
+#include "config.h"
+#include "netdev.h"
+
+/* Allocate and return a new wire client netdev, configured from
+ * config->wire_client_device, config->live_local_ip and
+ * config->live_prefix_len.
+ */
+extern struct netdev *wire_client_netdev_new(struct config *config);
+
+#endif /* __WIRE_CLIENT_NETDEV_H__ */
diff --git a/test/packetdrill/wire_conn.c b/test/packetdrill/wire_conn.c
new file mode 100644
index 0000000..945f4b0
--- /dev/null
+++ b/test/packetdrill/wire_conn.c
@@ -0,0 +1,254 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * TCP connection handling for remote on-the-wire testing using a real NIC.
+ */
+
+#include "wire_conn.h"
+
+#include <errno.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <netdb.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "logging.h"
+#include "tcp.h"
+#include "wrap.h"
+
+/* Cap the max message payload we're willing to read, so the remote
+ * side can't OOM us. wire_conn_read() rejects any message whose
+ * payload (length minus header) exceeds this many bytes.
+ */
+#define MAX_MESSAGE_BYTES (10*1000*1000)
+
+/* Allocate a zeroed wire_conn whose fd is -1 (no socket yet). */
+struct wire_conn *wire_conn_new(void)
+{
+	struct wire_conn *conn;
+
+	DEBUGP("wire_conn_new\n");
+
+	conn = calloc(1, sizeof(*conn));
+	conn->fd = -1;
+	return conn;
+}
+
+/* Close the socket (if one is open), release the input buffer, and
+ * free the wire_conn itself.
+ */
+void wire_conn_free(struct wire_conn *conn)
+{
+	const int has_socket = (conn->fd != -1);
+
+	if (has_socket)
+		close(conn->fd);
+
+	free(conn->in.buf);
+
+	/* paranoia: zero the struct so use-after-free bugs are caught */
+	memset(conn, 0, sizeof(*conn));
+	free(conn);
+}
+
+/* Open a SOCK_STREAM socket for the given IP version and store it in
+ * conn->fd. The connection must not already own a socket.
+ */
+static void create_tcp_socket(struct wire_conn *conn,
+			      enum ip_version_t ip_version)
+{
+	int fd;
+
+	assert(conn->fd == -1);
+
+	fd = wrap_socket(ip_version, SOCK_STREAM);
+	conn->fd = fd;
+}
+
+/* Apply our default TCP socket options to conn->fd: Nagle disabled
+ * and 128 KiB send/receive buffers. Any setsockopt() failure is
+ * reported through die_perror().
+ */
+static void set_default_tcp_options(struct wire_conn *conn)
+{
+	int nodelay = 1;
+	int rcvbuf_bytes = 128*1024;
+	int sndbuf_bytes = 128*1024;
+
+	DEBUGP("set_default_tcp_options fd %d\n", conn->fd);
+
+	/* No Nagle: packets go out ASAP regardless of size. */
+	if (setsockopt(conn->fd, SOL_TCP, TCP_NODELAY,
+		       &nodelay, sizeof(nodelay)) < 0)
+		die_perror("setsockopt TCP_NODELAY");
+
+	/* Large receive buffer to allow high throughput. */
+	if (setsockopt(conn->fd, SOL_SOCKET, SO_RCVBUF,
+		       &rcvbuf_bytes, sizeof(rcvbuf_bytes)) < 0)
+		die_perror("setsockopt SO_RCVBUF");
+
+	/* Large send buffer for throughput and to avoid blocking. */
+	if (setsockopt(conn->fd, SOL_SOCKET, SO_SNDBUF,
+		       &sndbuf_bytes, sizeof(sndbuf_bytes)) < 0)
+		die_perror("setsockopt SO_SNDBUF");
+}
+
+/* Create a TCP socket on conn and do a blocking connect to ip:port.
+ * A connect failure is reported through die().
+ */
+void wire_conn_connect(struct wire_conn *conn,
+		       const struct ip_address *ip,
+		       u16 port,
+		       enum ip_version_t ip_version)
+{
+	struct sockaddr_storage server_addr;
+	socklen_t server_addr_len = 0;
+	char ip_string[ADDR_STR_LEN];
+
+	DEBUGP("wire_conn_connect\n");
+
+	create_tcp_socket(conn, ip_version);
+	set_default_tcp_options(conn);
+
+	/* Do a blocking connect to the server. */
+	ip_to_sockaddr(ip, port, (struct sockaddr *)&server_addr,
+		       &server_addr_len);
+	if (connect(conn->fd, (struct sockaddr *)&server_addr,
+		    server_addr_len) >= 0)
+		return;
+
+	die("error connecting to wire server at %s:%d: %s\n",
+	    ip_to_string(ip, ip_string), port, strerror(errno));
+}
+
+/* Create a TCP socket on listen_conn, enable SO_REUSEADDR on it, and
+ * hand it to wrap_bind_listen() for the given port and IP version.
+ */
+void wire_conn_bind_listen(struct wire_conn *listen_conn,
+			   u16 port,
+			   enum ip_version_t ip_version)
+{
+	int reuse_addr = 1;
+
+	DEBUGP("wire_conn_bind_listen\n");
+
+	create_tcp_socket(listen_conn, ip_version);
+
+	if (setsockopt(listen_conn->fd, SOL_SOCKET, SO_REUSEADDR,
+		       &reuse_addr, sizeof(reuse_addr)) < 0)
+		die_perror("setsockopt SO_REUSEADDR");
+
+	wrap_bind_listen(listen_conn->fd, ip_version, port);
+}
+
+/* Block in accept() on listen_conn, then wrap the new socket in a
+ * freshly allocated wire_conn stored in *accepted_conn and apply the
+ * default TCP options to it. An accept() failure is reported through
+ * die_perror().
+ */
+void wire_conn_accept(struct wire_conn *listen_conn,
+		      struct wire_conn **accepted_conn)
+{
+	struct wire_conn *conn;
+	int accepted_fd;
+
+	DEBUGP("wire_conn_accept\n");
+
+	accepted_fd = accept(listen_conn->fd, NULL, NULL);
+	if (accepted_fd < 0)
+		die_perror("accept");
+
+	DEBUGP("accepted fd %d\n", accepted_fd);
+
+	conn = wire_conn_new();
+	*accepted_conn = conn;
+	conn->fd = accepted_fd;
+
+	set_default_tcp_options(conn);
+}
+
+/* Do blocking writes until all buf_len bytes of buf are written to
+ * conn->fd. Given our large socket buffer size and typically small
+ * write sizes, in practice all the writes should complete in one call.
+ * EINTR and EAGAIN are retried.
+ *
+ * Returns STATUS_OK on success, or STATUS_ERR (after perror()) on any
+ * other write error.
+ */
+static int write_bytes(struct wire_conn *conn,
+		       const void *buf, int buf_len)
+{
+	/* Walk the buffer with a char pointer: arithmetic on void * is a
+	 * GNU extension, not ISO C.
+	 */
+	const char *cursor = buf;
+
+	while (buf_len > 0) {
+		/* write() returns ssize_t; don't narrow it to int. */
+		ssize_t bytes_written = write(conn->fd, cursor, buf_len);
+
+		if (bytes_written < 0) {
+			if (errno == EINTR || errno == EAGAIN)
+				continue;
+			perror("TCP socket write");
+			return STATUS_ERR;
+		}
+		assert(bytes_written <= buf_len);
+		buf_len -= bytes_written;
+		cursor += bytes_written;
+	}
+	return STATUS_OK;
+}
+
+/* Send one message: a wire_header (total length and op, both in
+ * network order) followed by buf_len bytes of payload from buf.
+ * Returns STATUS_OK, or STATUS_ERR if either write fails.
+ */
+int wire_conn_write(struct wire_conn *conn,
+		    enum wire_op_t op,
+		    const void *buf, int buf_len)
+{
+	struct wire_header header;
+
+	DEBUGP("wire_conn_write -> op: %s\n",
+	       wire_op_to_string(op));
+
+	/* The length field counts the header as well as the payload. */
+	header.length = htonl(sizeof(header) + buf_len);
+	header.op = htonl(op);
+
+	/* Header first; the payload is only attempted if that worked. */
+	if (write_bytes(conn, &header, sizeof(header)) ||
+	    write_bytes(conn, buf, buf_len))
+		return STATUS_ERR;
+
+	return STATUS_OK;
+}
+
+/* Do blocking reads from conn->fd until exactly buf_len bytes have
+ * been stored in buf. EINTR and EAGAIN are retried.
+ *
+ * Returns STATUS_OK on success. Returns STATUS_ERR after a message on
+ * stderr if read() fails or the remote side closes the connection
+ * before all bytes arrive.
+ */
+static int read_bytes(struct wire_conn *conn,
+		      void *buf, int buf_len)
+{
+	/* Walk the buffer with a char pointer: arithmetic on void * is a
+	 * GNU extension, not ISO C.
+	 */
+	char *cursor = buf;
+
+	while (buf_len > 0) {
+		/* read() returns ssize_t; don't narrow it to int. */
+		ssize_t bytes_read = read(conn->fd, cursor, buf_len);
+
+		if (bytes_read < 0) {
+			if (errno == EINTR || errno == EAGAIN)
+				continue;
+			perror("TCP socket read");
+			return STATUS_ERR;
+		}
+		if (bytes_read == 0) {
+			fprintf(stderr, "remote side closed connection\n");
+			return STATUS_ERR;
+		}
+		assert(bytes_read <= buf_len);
+		buf_len -= bytes_read;
+		cursor += bytes_read;
+	}
+	return STATUS_OK;
+}
+
+/* Blocking read of one message. On success *op holds the message op,
+ * *buf_len the payload length (header excluded), and *buf points at
+ * the connection's own input buffer, which holds the payload and is
+ * reused (and possibly reallocated) by the next call.
+ *
+ * Returns STATUS_OK on success. Returns STATUS_ERR if a read fails,
+ * if the advertised payload length is negative or larger than
+ * MAX_MESSAGE_BYTES, or if the input buffer cannot be allocated.
+ */
+int wire_conn_read(struct wire_conn *conn,
+		   enum wire_op_t *op,
+		   void **buf, int *buf_len)
+{
+	struct wire_header header;
+
+	DEBUGP("wire_conn_read\n");
+
+	if (read_bytes(conn, &header, sizeof(header)))
+		return STATUS_ERR;
+
+	*op = ntohl(header.op);
+
+	DEBUGP("wire_conn_read -> op: %s\n", wire_op_to_string(*op));
+
+	*buf_len = ntohl(header.length) - sizeof(header);
+	if ((*buf_len < 0) || (*buf_len > MAX_MESSAGE_BYTES)) {
+		fprintf(stderr, "invalid length %d from remote wire conn\n",
+			*buf_len);
+		return STATUS_ERR;
+	}
+
+	if (conn->in.buf_space < *buf_len) {
+		/* Grow to twice the need to amortize later reallocations. */
+		int new_space = 2 * *buf_len;
+
+		/* Drop the old buffer and record that we own nothing, so
+		 * a failed allocation can't leave a stale capacity behind.
+		 */
+		free(conn->in.buf);
+		conn->in.buf = NULL;
+		conn->in.buf_space = 0;
+
+		conn->in.buf = malloc(new_space);
+		if (conn->in.buf == NULL) {
+			fprintf(stderr,
+				"out of memory reading %d bytes from wire conn\n",
+				*buf_len);
+			return STATUS_ERR;
+		}
+		conn->in.buf_space = new_space;
+	}
+
+	*buf = conn->in.buf;
+
+	if (read_bytes(conn, *buf, *buf_len))
+		return STATUS_ERR;
+
+	return STATUS_OK;
+}
diff --git a/test/packetdrill/wire_conn.h b/test/packetdrill/wire_conn.h
new file mode 100644
index 0000000..d8b85fe
--- /dev/null
+++ b/test/packetdrill/wire_conn.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * TCP connection handling for remote on-the-wire testing using a real NIC.
+ */
+
+#ifndef __WIRE_CONN_H__
+#define __WIRE_CONN_H__
+
+#include "types.h"
+
+#include "ip_address.h"
+#include "wire_protocol.h"
+
+struct config;
+
+/* Buffer holding input or output data for a TCP socket.
+ * NOTE(review): wire_conn.c grows "buf"/"buf_space" in
+ * wire_conn_read() but never writes "used"; confirm whether any
+ * other file relies on it.
+ */
+struct wire_conn_buffer {
+ char *buf; /* malloc-allocated buffer */
+ int buf_space; /* bytes allocated in malloc-allocated "buf" buffer */
+ int used; /* bytes of actual data at the start of "buf" */
+};
+
+/* A TCP socket used for client<->server communication for doing
+ * remote on-the-wire testing using a real NIC. Created by
+ * wire_conn_new() with fd == -1; wire_conn_free() closes the fd and
+ * frees the input buffer.
+ */
+struct wire_conn {
+ int fd; /* socket for TCP connection (or -1) */
+ struct wire_conn_buffer in; /* data read in last wire_conn_read() */
+};
+
+/* Create a wire_conn. Note that a struct wire_conn shouldn't be
+ * stack-allocated and should always use wire_conn_new() and
+ * wire_conn_free(). The new connection has no socket (fd == -1).
+ */
+struct wire_conn *wire_conn_new(void);
+
+/* Free a wire_conn: closes its socket if it has one and frees its
+ * input buffer.
+ */
+void wire_conn_free(struct wire_conn *conn);
+
+/* Blocking connect. Creates the socket on conn; a connect failure is
+ * reported through die().
+ */
+void wire_conn_connect(struct wire_conn *conn,
+ const struct ip_address *ip, u16 port,
+ enum ip_version_t ip_version);
+
+/* Blocking bind and listen. Creates the socket on listen_conn and
+ * sets SO_REUSEADDR on it first.
+ */
+void wire_conn_bind_listen(struct wire_conn *listen_conn, u16 port,
+ enum ip_version_t ip_version);
+
+/* Blocking accept. Stores a newly allocated wire_conn for the
+ * accepted socket in *accepted_conn.
+ */
+void wire_conn_accept(struct wire_conn *listen_conn,
+ struct wire_conn **accepted_conn);
+
+/* Blocking write of a single message: a wire_header followed by
+ * buf_len payload bytes from buf. Returns STATUS_OK or STATUS_ERR.
+ */
+int wire_conn_write(struct wire_conn *conn,
+ enum wire_op_t op,
+ const void *buf, int buf_len);
+
+/* Blocking read of a single message. Changes *buf to point to the
+ * wire_conn_buffer of this connection, which is guaranteed to be big
+ * enough to hold the whole *buf_len bytes returned. The wire_conn
+ * owns this memory, not the caller. The returned buffer can be
+ * invalidated (freed or re-written) by the next call to
+ * wire_conn_read(). Returns STATUS_OK or STATUS_ERR.
+ */
+int wire_conn_read(struct wire_conn *conn,
+ enum wire_op_t *op,
+ void **buf, int *buf_len);
+
+#endif /* __WIRE_CONN_H__ */
diff --git a/test/packetdrill/wire_protocol.c b/test/packetdrill/wire_protocol.c
new file mode 100644
index 0000000..0ccfb44
--- /dev/null
+++ b/test/packetdrill/wire_protocol.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Protocol for remote on-the-wire testing using a real NIC.
+ */
+
+#include "wire_protocol.h"
+
+/* Map a wire op to its name as a static string. Values below
+ * WIRE_INVALID or above WIRE_NUM_OPS get a diagnostic string instead.
+ */
+const char *wire_op_to_string(enum wire_op_t op)
+{
+	const char *name = NULL;
+
+	if (op < WIRE_INVALID)
+		return "NEGATIVE_WIRE_OP!";
+	if (op > WIRE_NUM_OPS)
+		return "WIRE_OP_TOO_BIG!";
+
+	/* No default case, so the compiler catches missing values. */
+	switch (op) {
+	case WIRE_INVALID:
+		name = "WIRE_INVALID";
+		break;
+	case WIRE_COMMAND_LINE_ARGS:
+		name = "WIRE_COMMAND_LINE_ARGS";
+		break;
+	case WIRE_SCRIPT_PATH:
+		name = "WIRE_SCRIPT_PATH";
+		break;
+	case WIRE_SCRIPT:
+		name = "WIRE_SCRIPT";
+		break;
+	case WIRE_HARDWARE_ADDR:
+		name = "WIRE_HARDWARE_ADDR";
+		break;
+	case WIRE_SERVER_READY:
+		name = "WIRE_SERVER_READY";
+		break;
+	case WIRE_CLIENT_STARTING:
+		name = "WIRE_CLIENT_STARTING";
+		break;
+	case WIRE_PACKETS_START:
+		name = "WIRE_PACKETS_START";
+		break;
+	case WIRE_PACKETS_WARN:
+		name = "WIRE_PACKETS_WARN";
+		break;
+	case WIRE_PACKETS_DONE:
+		name = "WIRE_PACKETS_DONE";
+		break;
+	case WIRE_NUM_OPS:
+		name = "WIRE_NUM_OPS";
+		break;
+	}
+
+	if (name != NULL)
+		return name;
+
+	assert(!"not reached");
+	return "";
+}
diff --git a/test/packetdrill/wire_protocol.h b/test/packetdrill/wire_protocol.h
new file mode 100644
index 0000000..a07a509
--- /dev/null
+++ b/test/packetdrill/wire_protocol.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Protocol for remote on-the-wire testing using a real NIC.
+ */
+
+#ifndef __WIRE_PROTOCOL_H__
+#define __WIRE_PROTOCOL_H__
+
+#include "types.h"
+
+/* Types of messages wire_client and wire_server send to each other.
+ * The value travels in wire_header.op in network byte order.
+ */
+enum wire_op_t {
+ WIRE_INVALID = 0, /* invalid OP */
+ WIRE_COMMAND_LINE_ARGS, /* "here are my command line arguments" */
+ WIRE_SCRIPT_PATH, /* "here's the path of the script" */
+ WIRE_SCRIPT, /* "here's the script we're going to start" */
+ WIRE_HARDWARE_ADDR, /* "here's my ethernet MAC address" */
+ WIRE_SERVER_READY, /* "server ready to start script execution" */
+ WIRE_CLIENT_STARTING, /* "i'm starting script execution... now!" */
+ WIRE_PACKETS_START, /* "please start handling packet events" */
+ WIRE_PACKETS_WARN, /* "here's a warning about fishy packets" */
+ WIRE_PACKETS_DONE, /* "i'm done handling packet events" */
+ WIRE_NUM_OPS, /* number of ops above; not a real message type */
+};
+
+/* Return the human-readable name for a given op (static string).
+ * Out-of-range values yield a diagnostic string rather than NULL.
+ */
+extern const char *wire_op_to_string(enum wire_op_t op);
+
+/* Header prefix before all messages in both directions. The payload,
+ * if any, follows immediately and is (length - sizeof header) bytes.
+ */
+struct wire_header {
+ __be32 length; /* bytes in message (network order), including header */
+ __be32 op; /* enum wire_op_t (network order) */
+};
+
+/* A client request for the server to execute some packet events.
+ * The server rejects the request if num_events differs from its own
+ * count of events executed so far.
+ */
+struct wire_packets_start {
+ __be32 num_events; /* total events executed (network order) */
+};
+
+/* The server is done executing some packet events. The fixed fields
+ * are followed on the wire by the error message bytes.
+ */
+struct wire_packets_done {
+ __be32 result; /* STATUS_OK or STATUS_ERR (network order) */
+ __be32 num_events; /* total events executed (network order) */
+ char error_message[0]; /* '\0'-terminated error message, or empty */
+};
+
+#endif /* __WIRE_PROTOCOL_H__ */
diff --git a/test/packetdrill/wire_server.c b/test/packetdrill/wire_server.c
new file mode 100644
index 0000000..e2dc636
--- /dev/null
+++ b/test/packetdrill/wire_server.c
@@ -0,0 +1,537 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Server-side code for remote on-the-wire testing using a real NIC.
+ */
+
+#include "wire_server.h"
+
+#include <pthread.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "link_layer.h"
+#include "logging.h"
+#include "run.h"
+#include "wire_conn.h"
+#include "wire_server.h"
+#include "wire_server_netdev.h"
+
+/* Internal private state for the wire server to run one script.
+ * One instance is created per accepted client connection and is
+ * released by the thread that serves that connection.
+ */
+struct wire_server {
+ struct wire_conn *wire_conn; /* connection to wire client */
+ u16 port; /* port we listen on */
+
+ /* argv is heap-allocated by wire_server_unserialize_argv(), with
+  * an extra "--wire_server" entry and a NULL at argv[argc].
+  */
+ int argc; /* args in client cmd line */
+ char **argv; /* client command line */
+
+ struct config config; /* run-time configuration */
+ struct script script; /* raw and parsed script */
+ struct state *state; /* interpreter engine state */
+
+ char *script_path; /* path of script (on cli!) */
+ char *script_buffer; /* contents of script */
+
+ char *wire_server_device; /* name of our eth interface */
+ struct ether_addr client_ether_addr; /* wire client hardware addr */
+ struct ether_addr server_ether_addr; /* wire server hardware addr */
+
+ enum event_t last_event_type; /* type of previous event */
+ int num_events; /* events executed so far */
+};
+
+/* Allocate the per-connection server state. Takes over accepted_conn,
+ * copies the device name, looks up the device's hardware address, and
+ * records the listening port.
+ */
+static struct wire_server *wire_server_new(struct wire_conn *accepted_conn,
+					   const char *wire_server_device,
+					   u16 wire_server_port,
+					   enum ip_version_t ip_version)
+{
+	struct wire_server *server = calloc(1, sizeof(*server));
+
+	server->wire_conn = accepted_conn;
+	server->wire_server_device = strdup(wire_server_device);
+	get_hw_address(wire_server_device, &server->server_ether_addr,
+		       ip_version);
+	server->port = wire_server_port;
+
+	return server;
+}
+
+/* Release everything owned by a wire_server: its connection, the
+ * unserialized client argv, the script path and contents, the device
+ * name, and finally the struct itself.
+ */
+static void wire_server_free(struct wire_server *wire_server)
+{
+	int i;
+
+	wire_conn_free(wire_server->wire_conn);
+
+	/* argv and each of its strings are heap-allocated by
+	 * wire_server_unserialize_argv(); without this every client
+	 * connection leaks its whole command line. argv stays NULL if
+	 * the thread failed before receiving the args.
+	 */
+	if (wire_server->argv != NULL) {
+		for (i = 0; i < wire_server->argc; ++i)
+			free(wire_server->argv[i]);
+		free(wire_server->argv);
+	}
+
+	free(wire_server->script_path);
+	free(wire_server->script_buffer);
+	free(wire_server->wire_server_device);
+	memset(wire_server, 0, sizeof(*wire_server)); /* catch bugs */
+	free(wire_server);
+}
+
+/* Unserialize argv from a single buffer with a '\0' character after
+ * each arg. Add a --wire_server so that we don't have an identity
+ * crisis. The result is stored in wire_server->argc/argv; argv and
+ * each string in it are heap-allocated, and argv[argc] is NULL.
+ *
+ * Only args terminated by a '\0' inside the first args_len bytes are
+ * used; any unterminated trailing bytes are ignored.
+ */
+static void wire_server_unserialize_argv(struct wire_server *wire_server,
+					 const char *args, int args_len)
+{
+	int client_argc = 0;	/* complete args sent by the client */
+	int argc, i;
+	char **argv = NULL;
+	const char *cursor = NULL;
+
+	for (i = 0; i < args_len; ++i) {
+		if (args[i] == '\0')
+			++client_argc;
+	}
+	argc = client_argc + 1;	/* for --wire_server argument */
+	DEBUGP("argc = %d\n", argc);
+
+	/* We use argc+1 here because, following main() calling
+	 * conventions, we make the array element at argv[argc] a NULL
+	 * pointer.
+	 */
+	argv = calloc(argc + 1, sizeof(char *));
+
+	/* Copy only the client's own args. Running this loop up to
+	 * argc would read one string past the end of the received
+	 * buffer and leak that copy when the last slot is overwritten.
+	 */
+	cursor = args;
+	for (i = 0; i < client_argc; ++i) {
+		argv[i] = strdup(cursor);
+		cursor += strlen(cursor) + 1;	/* + 1 for '\0' */
+	}
+	argv[argc - 1] = strdup("--wire_server");
+
+	for (i = 0; i < argc; ++i)
+		DEBUGP("argv[%d] = '%s'\n", i, argv[i]);
+
+	wire_server->argc = argc;
+	wire_server->argv = argv;
+}
+
+/* Read the next message, which must be WIRE_COMMAND_LINE_ARGS, and
+ * unserialize its payload into wire_server->argc/argv.
+ * Returns STATUS_OK or STATUS_ERR.
+ */
+static int wire_server_receive_args(struct wire_server *wire_server)
+{
+	enum wire_op_t received_op = WIRE_INVALID;
+	void *payload = NULL;
+	int payload_len = -1;
+
+	if (wire_conn_read(wire_server->wire_conn, &received_op,
+			   &payload, &payload_len))
+		return STATUS_ERR;
+
+	if (received_op == WIRE_COMMAND_LINE_ARGS) {
+		wire_server_unserialize_argv(wire_server,
+					     payload, payload_len);
+		return STATUS_OK;
+	}
+
+	fprintf(stderr,
+		"bad wire client: expected WIRE_COMMAND_LINE_ARGS\n");
+	return STATUS_ERR;
+}
+
+/* Read the next message, which must be WIRE_SCRIPT_PATH, and keep a
+ * heap copy of its payload in wire_server->script_path.
+ * Returns STATUS_OK or STATUS_ERR.
+ */
+static int wire_server_receive_script_path(struct wire_server *wire_server)
+{
+	enum wire_op_t received_op = WIRE_INVALID;
+	void *payload = NULL;
+	int payload_len = -1;
+
+	if (wire_conn_read(wire_server->wire_conn, &received_op,
+			   &payload, &payload_len))
+		return STATUS_ERR;
+
+	if (received_op == WIRE_SCRIPT_PATH) {
+		wire_server->script_path = strndup(payload, payload_len);
+		return STATUS_OK;
+	}
+
+	fprintf(stderr,
+		"bad wire client: expected WIRE_SCRIPT_PATH\n");
+	return STATUS_ERR;
+}
+
+/* Read the next message, which must be WIRE_SCRIPT, and keep a heap
+ * copy of its payload in wire_server->script_buffer.
+ * Returns STATUS_OK or STATUS_ERR.
+ */
+static int wire_server_receive_script(struct wire_server *wire_server)
+{
+	enum wire_op_t received_op = WIRE_INVALID;
+	void *payload = NULL;
+	int payload_len = -1;
+
+	if (wire_conn_read(wire_server->wire_conn, &received_op,
+			   &payload, &payload_len))
+		return STATUS_ERR;
+
+	if (received_op == WIRE_SCRIPT) {
+		wire_server->script_buffer = strndup(payload, payload_len);
+		return STATUS_OK;
+	}
+
+	fprintf(stderr,
+		"bad wire client: expected WIRE_SCRIPT\n");
+	return STATUS_ERR;
+}
+
+
+/* Read the next message, which must be WIRE_HARDWARE_ADDR with a
+ * payload exactly the size of an ether_addr, and store it as the
+ * address to which the server should send packets.
+ * Returns STATUS_OK or STATUS_ERR.
+ */
+static int wire_server_receive_hw_address(struct wire_server *wire_server)
+{
+	enum wire_op_t received_op = WIRE_INVALID;
+	void *payload = NULL;
+	int payload_len = -1;
+	const char *complaint = NULL;
+
+	if (wire_conn_read(wire_server->wire_conn, &received_op,
+			   &payload, &payload_len))
+		return STATUS_ERR;
+
+	if (received_op != WIRE_HARDWARE_ADDR)
+		complaint = "bad wire client: expected WIRE_HARDWARE_ADDR\n";
+	else if (payload_len != sizeof(wire_server->client_ether_addr))
+		complaint = "bad wire client: bad hw address length\n";
+
+	if (complaint != NULL) {
+		fprintf(stderr, "%s", complaint);
+		return STATUS_ERR;
+	}
+
+	ether_copy(&wire_server->client_ether_addr, payload);
+
+	return STATUS_OK;
+}
+
+/* Send an empty WIRE_SERVER_READY message to tell the client we're
+ * ready to execute the script. Returns STATUS_OK or STATUS_ERR.
+ */
+static int wire_server_send_server_ready(struct wire_server *wire_server)
+{
+	int write_failed = wire_conn_write(wire_server->wire_conn,
+					   WIRE_SERVER_READY,
+					   NULL, 0);
+
+	if (!write_failed)
+		return STATUS_OK;
+
+	fprintf(stderr, "error sending WIRE_SERVER_READY\n");
+	return STATUS_ERR;
+}
+
+/* Block until the client sends WIRE_CLIENT_STARTING, which must have
+ * an empty payload. Returns STATUS_OK or STATUS_ERR.
+ */
+static int wire_server_receive_client_starting(struct wire_server *wire_server)
+{
+	enum wire_op_t received_op = WIRE_INVALID;
+	void *payload = NULL;
+	int payload_len = -1;
+	const char *complaint = NULL;
+
+	if (wire_conn_read(wire_server->wire_conn, &received_op,
+			   &payload, &payload_len))
+		return STATUS_ERR;
+
+	if (received_op != WIRE_CLIENT_STARTING)
+		complaint = "bad wire client: expected WIRE_CLIENT_STARTING\n";
+	else if (payload_len != 0)
+		complaint = "bad wire client: bad WIRE_CLIENT_STARTING length\n";
+
+	if (complaint == NULL)
+		return STATUS_OK;
+
+	fprintf(stderr, "%s", complaint);
+	return STATUS_ERR;
+}
+
+/* Wait for the client request for the server to execute some packet
+ * events. The message must be WIRE_PACKETS_START, carry exactly one
+ * struct wire_packets_start, and report the same number of executed
+ * events as the server has counted so far.
+ * Returns STATUS_OK or STATUS_ERR.
+ */
+static int wire_server_receive_packets_start(struct wire_server *wire_server)
+{
+	enum wire_op_t op = WIRE_INVALID;
+	void *buf = NULL;
+	int buf_len = -1;
+	struct wire_packets_start start;
+	unsigned int client_events;
+
+	if (wire_conn_read(wire_server->wire_conn, &op, &buf, &buf_len))
+		return STATUS_ERR;
+	if (op != WIRE_PACKETS_START) {
+		fprintf(stderr,
+			"bad wire client: expected WIRE_PACKETS_START\n");
+		return STATUS_ERR;
+	}
+	if (buf_len != sizeof(start)) {
+		fprintf(stderr,
+			"bad wire client: bad WIRE_PACKETS_START length\n");
+		return STATUS_ERR;
+	}
+
+	/* Copy out of the connection buffer rather than casting it. */
+	memcpy(&start, buf, sizeof(start));
+	client_events = ntohl(start.num_events);
+	if (client_events != (unsigned int)wire_server->num_events) {
+		/* ntohl() yields an unsigned value, so print it with %u;
+		 * end the line so later diagnostics don't run into it.
+		 */
+		fprintf(stderr,
+			"bad client event count; expected %d but got %u\n",
+			wire_server->num_events, client_events);
+		return STATUS_ERR;
+	}
+
+	return STATUS_OK;
+}
+
+/* Send the client a WIRE_PACKETS_WARN message whose payload is the
+ * human-readable warning text (without its terminating '\0').
+ * Returns STATUS_OK or STATUS_ERR.
+ */
+static int wire_server_send_packet_warning(struct wire_server *wire_server,
+					   const char *warning)
+{
+	int write_failed = wire_conn_write(wire_server->wire_conn,
+					   WIRE_PACKETS_WARN,
+					   warning, strlen(warning));
+
+	if (!write_failed)
+		return STATUS_OK;
+
+	fprintf(stderr, "error sending WIRE_PACKETS_WARN\n");
+	return STATUS_ERR;
+}
+
+/* Tell the client that the server is done executing some packet
+ * events. The WIRE_PACKETS_DONE payload is a struct wire_packets_done
+ * (result and event count in network order) followed by the
+ * '\0'-terminated error text; a NULL error is sent as "".
+ * Returns STATUS_OK or STATUS_ERR.
+ */
+static int wire_server_send_packets_done(struct wire_server *wire_server,
+					 int result,
+					 const char *error)
+{
+	struct wire_packets_done done;
+	int error_len, buf_len, write_failed;
+	char *buf;
+
+	if (error == NULL)
+		error = "";
+	error_len = strlen(error) + 1;	/* +1 for '\0' */
+	buf_len = sizeof(done) + error_len;
+
+	buf = malloc(buf_len);
+	if (buf == NULL) {
+		fprintf(stderr, "out of memory sending WIRE_PACKETS_DONE\n");
+		return STATUS_ERR;
+	}
+
+	done.result = htonl(result);
+	done.num_events = htonl(wire_server->num_events);
+	memcpy(buf, &done, sizeof(done));
+	memcpy(buf + sizeof(done), error, error_len);
+
+	write_failed = wire_conn_write(wire_server->wire_conn,
+				       WIRE_PACKETS_DONE,
+				       buf, buf_len);
+
+	/* The message has been handed to the socket (or has failed), so
+	 * the staging buffer must be released on both paths.
+	 */
+	free(buf);
+
+	if (write_failed) {
+		fprintf(stderr, "error sending WIRE_PACKETS_DONE\n");
+		return STATUS_ERR;
+	}
+
+	return STATUS_OK;
+}
+
+/* Coordinate with the wire client. See wire_client_next_event().
+ *
+ * On entering a run of packet events, wait for the client's
+ * WIRE_PACKETS_START. On leaving one (a non-packet event, or a NULL
+ * event at end of script), send WIRE_PACKETS_DONE with STATUS_OK.
+ * A non-NULL event is then recorded as the last event and counted.
+ * Returns STATUS_OK or STATUS_ERR.
+ */
+static int wire_server_next_event(struct wire_server *wire_server,
+				  struct event *event)
+{
+	const int now_packet = event && (event->type == PACKET_EVENT);
+	const int was_packet =
+		(wire_server->last_event_type == PACKET_EVENT);
+
+	/* Start of a packet run: the client must ask us to begin. */
+	if (now_packet && !was_packet) {
+		if (wire_server_receive_packets_start(wire_server))
+			return STATUS_ERR;
+	}
+
+	/* End of a packet run: report the result of our execution. */
+	if (!now_packet && was_packet) {
+		if (wire_server_send_packets_done(wire_server, STATUS_OK, ""))
+			return STATUS_ERR;
+	}
+
+	if (event == NULL)
+		return STATUS_OK;
+
+	wire_server->last_event_type = event->type;
+	++wire_server->num_events;
+
+	return STATUS_OK;
+}
+
+/* Run the given packet event; send any error or warning back to the client. */
+static int wire_server_run_packet_event(
+ struct wire_server *wire_server, struct event *event,
+ struct packet *packet, char **error)
+{
+ int result = STATUS_OK;
+
+ result = run_packet_event(wire_server->state,
+ event, packet, error);
+ if (result == STATUS_ERR) {
+ /* When we sniff an incorrect packet, don't exit the
+ * process (we're a daemon), just return the error
+ * message via the TCP socket and finish the thread.
+ */
+ DEBUGP("wire_server_run_packet_event: error!\n");
+ if (wire_server_send_packets_done(wire_server, STATUS_ERR,
+ *error))
+ return STATUS_ERR;
+ } else if (result == STATUS_WARN) {
+ /* A non-fatal problem with the packet. Return the
+ * warning message via the TCP socket and keep going.
+ */
+ DEBUGP("wire_server_run_packet_event: warning!\n");
+ if (wire_server_send_packet_warning(wire_server, *error))
+ return STATUS_ERR;
+ }
+ return result;
+}
+
+/* Execute the server-side duties for remote on-the-wire testing using
+ * a real NIC. Basically the server side just needs to send packets
+ * over the wire (to the kernel under test) and sniff and verify
+ * packets on the wire (from the kernel under test). This is analogous
+ * to run_script(), which executes scripts for stand-alone mode,
+ * and also executes the client side for remote on-the-wire testing
+ * using a real NIC.
+ *
+ * Returns STATUS_OK once every event has been processed and the
+ * client has been told about the final packet events. Returns
+ * STATUS_ERR if fetching an event, coordinating with the client, or
+ * a packet event fails.
+ */
+static int wire_server_run_script(struct wire_server *wire_server,
+				  char **error)
+{
+	struct state *state = wire_server->state;
+	struct event *event = NULL;
+
+	DEBUGP("wire_server_run_script\n");
+
+	state->live_start_time_usecs = now_usecs(state);
+	DEBUGP("live_start_time_usecs is %lld\n",
+	       state->live_start_time_usecs);
+
+	while (1) {
+		if (get_next_event(state, error))
+			return STATUS_ERR;
+		event = state->event;
+		if (event == NULL)
+			break;
+
+		if (wire_server_next_event(wire_server, event))
+			return STATUS_ERR;
+
+		/* We adjust relative times after getting notification
+		 * that previous client-side events have completed.
+		 */
+		adjust_relative_event_times(state, event);
+
+		switch (event->type) {
+		case PACKET_EVENT:
+			if (wire_server_run_packet_event(wire_server, event,
+							 event->event.packet,
+							 error) == STATUS_ERR)
+				return STATUS_ERR;
+			break;
+		case SYSCALL_EVENT:
+			DEBUGP("SYSCALL_EVENT happens on client side...\n");
+			break;
+		case COMMAND_EVENT:
+			DEBUGP("COMMAND_EVENT happens on client side...\n");
+			break;
+		case CODE_EVENT:
+			DEBUGP("CODE_EVENT happens on client side...\n");
+			break;
+		case INVALID_EVENT:
+		case NUM_EVENT_TYPES:
+			assert(!"bogus type");
+			break;
+		/* We omit default case so compiler catches missing values. */
+		}
+	}
+
+	/* Tell the client about any outstanding packet events it
+	 * requested. If that final message can't be sent the client
+	 * never hears the result, so don't report success.
+	 */
+	if (wire_server_next_event(wire_server, NULL))
+		return STATUS_ERR;
+
+	DEBUGP("wire_server_run_script: done running\n");
+
+	return STATUS_OK;
+}
+
+/* Handle a wire connection from a client. Thread entry point: arg is
+ * the struct wire_server for this connection, which this function
+ * frees before returning. Always returns NULL.
+ *
+ * The handshake order is fixed: receive args, script path, script and
+ * client hardware address; parse the script; create the netdev and
+ * interpreter state; send WIRE_SERVER_READY; wait for
+ * WIRE_CLIENT_STARTING; then run the script. Any failure jumps to the
+ * common cleanup at error_done.
+ */
+static void *wire_server_thread(void *arg)
+{
+ struct wire_server *wire_server = (struct wire_server *)arg;
+ struct netdev *netdev = NULL;
+ char *error = NULL;
+
+ DEBUGP("wire_server_thread\n");
+
+ set_default_config(&wire_server->config);
+
+ if (wire_server_receive_args(wire_server))
+ goto error_done;
+
+ if (wire_server_receive_script_path(wire_server))
+ goto error_done;
+
+ if (wire_server_receive_script(wire_server))
+ goto error_done;
+
+ if (wire_server_receive_hw_address(wire_server))
+ goto error_done;
+
+ /* Parse using the client's command line plus "--wire_server". */
+ if (parse_script_and_set_config(wire_server->argc,
+ wire_server->argv,
+ &wire_server->config,
+ &wire_server->script,
+ wire_server->script_path,
+ wire_server->script_buffer))
+ goto error_done;
+
+ set_scheduling_priority();
+ lock_memory();
+
+ netdev =
+ wire_server_netdev_new(&wire_server->config,
+ wire_server->wire_server_device,
+ &wire_server->client_ether_addr,
+ &wire_server->server_ether_addr);
+
+ /* NOTE(review): netdev is not freed here; presumably state_free()
+  * releases it along with the state -- confirm.
+  */
+ wire_server->state = state_new(&wire_server->config,
+ &wire_server->script,
+ netdev);
+
+ if (wire_server_send_server_ready(wire_server))
+ goto error_done;
+
+ if (wire_server_receive_client_starting(wire_server))
+ goto error_done;
+
+ if (wire_server_run_script(wire_server, &error))
+ goto error_done;
+
+ DEBUGP("wire_server_thread: finished test successfully\n");
+
+ /* Success falls through into the same cleanup as failure. */
+error_done:
+ if (error != NULL)
+ fprintf(stderr, "%s\n", error);
+
+ /* state stays NULL if we failed before state_new(). */
+ if (wire_server->state != NULL)
+ state_free(wire_server->state);
+
+ DEBUGP("wire_server_thread: connection is done\n");
+ wire_server_free(wire_server);
+ return NULL;
+}
+
+/* Start a detached thread running wire_server_thread() on the given
+ * wire_server. The new thread frees wire_server when it is done. A
+ * failure to create the thread is reported through die().
+ */
+static void start_wire_server_thread(struct wire_server *wire_server)
+{
+	pthread_t thread;	/* pthread thread handle */
+	int rc;
+
+	DEBUGP("start_wire_server_thread\n");
+
+	/* pthread functions return the error number and do not set
+	 * errno, so format the message from the return value.
+	 */
+	rc = pthread_create(&thread, NULL, wire_server_thread,
+			    wire_server);
+	if (rc != 0)
+		die("pthread_create: %s\n", strerror(rc));
+
+	/* Nobody joins these threads; detach so that each thread's
+	 * resources are reclaimed when it returns instead of
+	 * accumulating for every client connection.
+	 */
+	rc = pthread_detach(thread);
+	if (rc != 0)
+		fprintf(stderr, "pthread_detach: %s\n", strerror(rc));
+}
+
+/* Become a wire server: initialize the server netdev layer for
+ * config->wire_server_device, listen on config->wire_server_port,
+ * and then loop forever, starting one thread per accepted client
+ * connection. Does not return.
+ */
+void run_wire_server(const struct config *config)
+{
+	struct wire_conn *listener;
+
+	wire_server_netdev_init(config->wire_server_device);
+
+	listener = wire_conn_new();
+	wire_conn_bind_listen(listener, config->wire_server_port,
+			      config->ip_version);
+
+	for (;;) {
+		struct wire_conn *client_conn = NULL;
+		struct wire_server *server;
+
+		wire_conn_accept(listener, &client_conn);
+
+		server = wire_server_new(client_conn,
+					 config->wire_server_device,
+					 config->wire_server_port,
+					 config->ip_version);
+
+		start_wire_server_thread(server);
+	}
+}
diff --git a/test/packetdrill/wire_server.h b/test/packetdrill/wire_server.h
new file mode 100644
index 0000000..ec3d0e0
--- /dev/null
+++ b/test/packetdrill/wire_server.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Server-side code for remote on-the-wire testing using a real NIC.
+ */
+
+#ifndef __WIRE_SERVER_H__
+#define __WIRE_SERVER_H__
+
+#include "types.h"
+
+#include "config.h"
+
+/* Serve remote on-the-wire tests using a real NIC; never returns. */
+void run_wire_server(const struct config *config);
+
+
+#endif /* __WIRE_SERVER_H__ */
diff --git a/test/packetdrill/wire_server_netdev.c b/test/packetdrill/wire_server_netdev.c
new file mode 100644
index 0000000..cee64e7
--- /dev/null
+++ b/test/packetdrill/wire_server_netdev.c
@@ -0,0 +1,204 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Server-side network device code for remote on-the-wire testing
+ * using a real NIC.
+ */
+
+#include "wire_server_netdev.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <sys/uio.h>
+#include <unistd.h>
+
+#include "logging.h"
+#include "net_utils.h"
+#include "packet.h"
+#include "packet_socket.h"
+#include "packet_parser.h"
+
+struct wire_server_netdev { /* netdev flavor backed by a packet socket */
+ struct netdev netdev; /* "inherit" from netdev; must stay first (casts) */
+
+ char *name; /* copy of the interface name (owned) */
+ struct config *config; /* this test's config (not owned) */
+
+ struct ether_addr client_ether_addr; /* dest MAC of frames we send */
+ struct ether_addr server_ether_addr; /* source MAC of frames we send */
+
+ struct packet_socket *psock; /* for sniffing and sending packets (owned) */
+};
+
+struct netdev_ops wire_server_netdev_ops; /* initialized at end of file */
+
+/* View a generic netdev as the wire_server_netdev that embeds it first. */
+static inline struct wire_server_netdev *
+to_server_netdev(struct netdev *netdev)
+{
+	return (struct wire_server_netdev *)netdev;
+}
+
+/* One-time host preparation for the wire server.  On Linux this tries to
+ * turn off LRO/GRO on eth0-eth2 and to drop outgoing ICMPv6 "destination
+ * unreachable" messages (flushing the ip6tables OUTPUT chain first); on
+ * other platforms it does nothing.  netdev_name is not used yet (see the
+ * TODO below).  The shell commands are fixed strings, so they are passed
+ * straight to system(); their exit status is intentionally ignored.
+ */
+void wire_server_netdev_init(const char *netdev_name)
+{
+#ifdef linux
+	/* If Large Receive Offload (LRO) or Generic Receive Offload
+	 * (GRO) is enabled, then disable them both, so that we are
+	 * sniffing packets as seen on the wire, not packets
+	 * aggregated by LRO or GRO.
+	 *
+	 * TODO(ncardwell): if netdev_name is not a bonding interface,
+	 * then we should just disable LRO/GRO on that interface; if
+	 * netdev_name is a bonding interface then we should
+	 * programmatically figure out all the slave interfaces for
+	 * the given netdev_name, instead of using this overly broad
+	 * approach.
+	 *
+	 * For now, intentionally ignoring errors rather than figuring
+	 * out how many Ethernet interfaces there are. TODO: clean up.
+	 */
+	system("(ethtool --offload eth0 lro off gro off; "
+	       " ethtool --offload eth1 lro off gro off; "
+	       " ethtool --offload eth2 lro off gro off) "
+	       " > /dev/null 2>&1");
+
+	/* Block outgoing IPv6 "destination unreachable" messages, to
+	 * block the "destination unreachable, unreachable route"
+	 * messages we would otherwise send the kernel under test.
+	 * That would cause the kernel under test to delete the TCP
+	 * socket under test and send a RST.
+	 *
+	 * For now, intentionally ignoring errors. TODO: clean up.
+	 */
+	system("ip6tables -F OUTPUT; "
+	       "ip6tables -A OUTPUT -p icmpv6 --icmpv6-type 1 -j DROP");
+#endif
+}
+
+/* Allocate and return a new wire server netdev on NIC wire_server_device.
+ * The name is copied; config is borrowed.  Dies if allocation fails.
+ */
+struct netdev *wire_server_netdev_new(struct config *config,
+	const char *wire_server_device,
+	const struct ether_addr *client_ether_addr,
+	const struct ether_addr *server_ether_addr)
+{
+	struct wire_server_netdev *netdev = calloc(1, sizeof(*netdev));
+
+	DEBUGP("wire_server_netdev_new\n");
+	if (netdev == NULL)
+		die_perror("calloc");
+	netdev->netdev.ops = &wire_server_netdev_ops;
+	netdev->name = strdup(wire_server_device);
+	if (netdev->name == NULL)
+		die_perror("strdup");
+	netdev->config = config;
+	ether_copy(&netdev->client_ether_addr, client_ether_addr);
+	ether_copy(&netdev->server_ether_addr, server_ether_addr);
+
+	/* Add the gateway IP to our NIC, so it answers ARP or neighbor
+	 * discovery requests, so we can receive packets from the client.
+	 * TODO(ncardwell): support multiple concurrent tests, by perhaps
+	 * ref-counting the gateway IPs we need to be using.
+	 * TODO(ncardwell): make sure we don't delete our primary host IP
+	 * (the one matching our hostname).
+	 */
+	net_setup_dev_address(netdev->name, &config->live_gateway_ip,
+			      config->live_prefix_len);
+
+	/* Make sure we only see packets from the machine under test. */
+	netdev->psock = packet_socket_new(netdev->name);
+	packet_socket_set_filter(netdev->psock, client_ether_addr,
+				 &config->live_local_ip); /* client IP */
+	return (struct netdev *)netdev;
+}
+
+/* netdev_ops.free: undo wire_server_netdev_new() and release the netdev. */
+static void wire_server_netdev_free(struct netdev *a_netdev)
+{
+	struct wire_server_netdev *self = to_server_netdev(a_netdev);
+	struct config *cfg = self->config;
+
+	DEBUGP("wire_server_netdev_free\n");
+	/* Take the gateway address back off the device. */
+	net_del_dev_address(self->name, &cfg->live_gateway_ip,
+			    cfg->live_prefix_len);
+	free(self->name);
+	if (self->psock != NULL)
+		packet_socket_free(self->psock);
+
+	memset(self, 0, sizeof(*self));	/* paranoia: wipe before freeing */
+	free(self);
+}
+
+/* netdev_ops.send: frame the packet's IP datagram in an Ethernet header
+ * (server MAC -> client MAC) and write it out on the packet socket.
+ * Returns whatever packet_socket_writev() returns.
+ */
+static int wire_server_netdev_send(struct netdev *a_netdev,
+				   struct packet *packet)
+{
+	struct wire_server_netdev *self = to_server_netdev(a_netdev);
+	const int family = packet_address_family(packet);
+	struct ether_header hdr;
+	struct iovec frame[2];
+
+	DEBUGP("wire_server_netdev_send\n");
+
+	/* Ethernet header: addressed to the client, typed by IP family. */
+	ether_copy(hdr.ether_dhost, &self->client_ether_addr);
+	ether_copy(hdr.ether_shost, &self->server_ether_addr);
+	hdr.ether_type = htons(ether_type_for_family(family));
+
+	/* Piece 0 is the header; piece 1 is the IP datagram itself. */
+	frame[0].iov_base = &hdr;
+	frame[0].iov_len = sizeof(hdr);
+	frame[1].iov_base = packet_start(packet);
+	frame[1].iov_len = packet->ip_bytes;
+
+	return packet_socket_writev(self->psock, frame, ARRAY_SIZE(frame));
+}
+
+/* netdev_ops.receive: hand off to netdev_receive_loop() for inbound L2. */
+static int wire_server_netdev_receive(struct netdev *a_netdev,
+				      struct packet **packet, char **error)
+{
+	struct wire_server_netdev *self = to_server_netdev(a_netdev);
+	int count = 0;	/* out-parameter of the loop; not used afterwards */
+
+	DEBUGP("wire_server_netdev_receive\n");
+
+	return netdev_receive_loop(self->psock, PACKET_LAYER_2_ETHERNET,
+				   DIRECTION_INBOUND, packet, &count, error);
+}
+
+struct netdev_ops wire_server_netdev_ops = { /* handlers behind netdev.ops */
+ .free = wire_server_netdev_free,
+ .send = wire_server_netdev_send,
+ .receive = wire_server_netdev_receive,
+};
diff --git a/test/packetdrill/wire_server_netdev.h b/test/packetdrill/wire_server_netdev.h
new file mode 100644
index 0000000..5389eae
--- /dev/null
+++ b/test/packetdrill/wire_server_netdev.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Server-side network device code for remote on-the-wire testing
+ * using a real NIC.
+ */
+
+#ifndef __WIRE_SERVER_NETDEV_H__
+#define __WIRE_SERVER_NETDEV_H__
+
+#include "types.h"
+
+#include "config.h"
+#include "ethernet.h"
+#include "netdev.h"
+
+struct wire_server_netdev;
+
+/* One-time start-up setup; on Linux it shells out to ethtool/ip6tables. */
+extern void wire_server_netdev_init(const char *netdev_name);
+
+/* Allocate and return a new wire server netdev; config is not owned. */
+extern struct netdev *wire_server_netdev_new(
+ struct config *config,
+ const char *wire_server_device,
+ const struct ether_addr *client_ether_addr,
+ const struct ether_addr *server_ether_addr);
+
+#endif /* __WIRE_SERVER_NETDEV_H__ */
diff --git a/test/packetdrill/wrap.c b/test/packetdrill/wrap.c
new file mode 100644
index 0000000..112af2a
--- /dev/null
+++ b/test/packetdrill/wrap.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Wrappers for making L3-independent syscalls.
+ */
+
+#include "wrap.h"
+
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#include "checksum.h"
+#include "gre.h"
+#include "logging.h"
+#include "netdev.h"
+#include "packet.h"
+#include "packet_checksum.h"
+#include "packet_to_string.h"
+#include "run.h"
+#include "script.h"
+#include "tcp_options_iterator.h"
+#include "tcp_options_to_string.h"
+#include "tcp_packet.h"
+
+/* Create a socket of the given type in the address family that matches
+ * ip_version: AF_INET for IPv4, AF_INET6 for IPv6 and IPv4-mapped IPv6.
+ * Failures are reported through die()/die_perror().
+ */
+int wrap_socket(enum ip_version_t ip_version, int type)
+{
+	int domain;
+	int fd;
+
+	if (ip_version == IP_VERSION_4) {
+		domain = AF_INET;
+	} else if (ip_version == IP_VERSION_4_MAPPED_6 ||
+		   ip_version == IP_VERSION_6) {
+		domain = AF_INET6;
+	} else {
+		die("bad ip_version (%d) in config\n", ip_version);
+		return -1;
+	}
+
+	fd = socket(domain, type, 0);
+	if (fd < 0)
+		die_perror(domain == AF_INET ? "socket(AF_INET)" :
+					       "socket(AF_INET6)");
+	return fd;
+}
+
+/* Bind s to the wildcard address on port (host byte order), listen with
+ * a backlog of 100, and return the port getsockname() reports, in host
+ * byte order.  Failures are reported through die()/die_perror().
+ */
+u16 wrap_bind_listen(int s, enum ip_version_t ip_version, u16 port)
+{
+	const int backlog = 100;
+
+	if (ip_version == IP_VERSION_4) {
+		struct sockaddr_in sin;
+		socklen_t len = sizeof(sin);
+
+		memset(&sin, 0, sizeof(sin));
+#ifndef linux
+		sin.sin_len = len;
+#endif
+		sin.sin_family = AF_INET;
+		sin.sin_port = htons(port);
+		if (bind(s, (struct sockaddr *)&sin, len) < 0)
+			die_perror("bind(AF_INET)");
+
+		/* Read the bound address back to learn the port in use. */
+		memset(&sin, 0, sizeof(sin));
+		if (getsockname(s, (struct sockaddr *)&sin, &len) < 0)
+			die_perror("getsockname(AF_INET)");
+		assert(sin.sin_family == AF_INET);
+
+		if (listen(s, backlog) < 0)
+			die_perror("listen(AF_INET)");
+		return ntohs(sin.sin_port);
+	}
+
+	if (ip_version == IP_VERSION_4_MAPPED_6 ||
+	    ip_version == IP_VERSION_6) {
+		struct sockaddr_in6 sin6;
+		socklen_t len = sizeof(sin6);
+
+		memset(&sin6, 0, sizeof(sin6));
+		sin6.sin6_family = AF_INET6;
+		sin6.sin6_port = htons(port);
+		if (bind(s, (struct sockaddr *)&sin6, len) < 0)
+			die_perror("bind(AF_INET6)");
+
+		memset(&sin6, 0, sizeof(sin6));
+		if (getsockname(s, (struct sockaddr *)&sin6, &len) < 0)
+			die_perror("getsockname(AF_INET6)");
+		assert(sin6.sin6_family == AF_INET6);
+
+		if (listen(s, backlog) < 0)
+			die_perror("listen(AF_INET6)");
+		return ntohs(sin6.sin6_port);
+	}
+
+	die("bad ip_version (%d) in config\n", ip_version);
+	return 0;
+}
diff --git a/test/packetdrill/wrap.h b/test/packetdrill/wrap.h
new file mode 100644
index 0000000..48a78a5
--- /dev/null
+++ b/test/packetdrill/wrap.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Wrappers for making L3-independent syscalls.
+ */
+
+#ifndef __WRAP_H__
+#define __WRAP_H__
+
+#include "config.h"
+#include "types.h"
+
+extern int wrap_socket(enum ip_version_t ip_version, int type);
+extern u16 wrap_bind_listen(int fd, enum ip_version_t ip_version, u16 port);
+
+#endif /* __WRAP_H__ */