From 78c896b3b3127515478090c19447e27dc406427e Mon Sep 17 00:00:00 2001 From: Jianfeng Tan Date: Mon, 18 Nov 2019 06:59:50 +0000 Subject: TLDKv2 Signed-off-by: Jianfeng Tan Signed-off-by: Jielong Zhou Signed-off-by: Jian Zhang Signed-off-by: Chen Zhao Change-Id: I55c39de4c6cd30f991f35631eb507f770230f08e --- .gitreview | 1 + Makefile | 39 +- README | 200 +- afl/lower_constructor_priority.diff | 16 + afl/run.sh | 1 + afl/seeds/seed.txt | Bin 0 -> 90 bytes angora/dpdk_abilist.txt | 1756 ++++++++++ angora/run.sh | 1 + angora/seeds/seed.txt | 1 + dpdk/Makefile | 101 +- ...1-eal-don-t-start-the-interrupt-mp-thread.patch | 35 + .../0002-eal-prioritize-constructor.patch | 25 + .../0003-mbuf-add-single-linked-list.patch | 33 + ...irtio-user-add-rss-update-for-virtio-user.patch | 43 + ...virtio-user-support-raw-socket-as-backend.patch | 645 ++++ .../0006-mempool-add-dynamic-mempool-support.patch | 247 ++ ...007-mbuf-add-dynamic-mbuf-mempool-support.patch | 305 ++ .../0008-mempool-prioritize-constructor.patch | 30 + .../0009-net-virtio-fill-desc-limit.patch | 42 + examples/Makefile | 2 +- examples/l4fwd/main.c | 1 - examples/l4fwd/port.h | 11 - lib/Makefile | 1 + lib/libtle_glue/Makefile | 62 + lib/libtle_glue/arp.c | 935 +++++ lib/libtle_glue/be.c | 256 ++ lib/libtle_glue/config.h | 71 + lib/libtle_glue/ctx.c | 535 +++ lib/libtle_glue/ctx.h | 147 + lib/libtle_glue/epoll.c | 577 ++++ lib/libtle_glue/fd.c | 122 + lib/libtle_glue/fd.h | 113 + lib/libtle_glue/gateway.h | 96 + lib/libtle_glue/icmp.c | 297 ++ lib/libtle_glue/init.c | 155 + lib/libtle_glue/internal.h | 152 + lib/libtle_glue/log.h | 77 + lib/libtle_glue/ndp.h | 33 + lib/libtle_glue/packetdrill.c | 544 +++ lib/libtle_glue/packetdrill.h | 111 + lib/libtle_glue/poll.c | 138 + lib/libtle_glue/port.c | 246 ++ lib/libtle_glue/rxcb.c | 834 +++++ lib/libtle_glue/rxtx.c | 573 ++++ lib/libtle_glue/select.c | 201 ++ lib/libtle_glue/sock.h | 154 + lib/libtle_glue/socket.c | 720 ++++ lib/libtle_glue/sym.c | 129 + 
lib/libtle_glue/sym.h | 118 + lib/libtle_glue/tcp.c | 558 +++ lib/libtle_glue/tle_glue.h | 114 + lib/libtle_glue/udp.c | 419 +++ lib/libtle_glue/util.c | 60 + lib/libtle_glue/util.h | 377 +++ lib/libtle_glue/zerocopy.h | 59 + lib/libtle_l4p/Makefile | 1 + lib/libtle_l4p/ctx.c | 349 +- lib/libtle_l4p/ctx.h | 38 +- lib/libtle_l4p/misc.h | 66 +- lib/libtle_l4p/net_misc.h | 21 + lib/libtle_l4p/port_statmap.h | 127 + lib/libtle_l4p/stream.h | 55 +- lib/libtle_l4p/stream_table.c | 65 +- lib/libtle_l4p/stream_table.h | 490 ++- lib/libtle_l4p/syncookie.h | 4 + lib/libtle_l4p/tcp_ctl.h | 68 +- lib/libtle_l4p/tcp_misc.h | 34 +- lib/libtle_l4p/tcp_ofo.c | 39 - lib/libtle_l4p/tcp_ofo.h | 14 +- lib/libtle_l4p/tcp_rxq.h | 4 + lib/libtle_l4p/tcp_rxtx.c | 1445 +++++--- lib/libtle_l4p/tcp_rxtx.h | 252 ++ lib/libtle_l4p/tcp_stream.c | 395 ++- lib/libtle_l4p/tcp_stream.h | 37 +- lib/libtle_l4p/tcp_timer.h | 40 +- lib/libtle_l4p/tcp_tx_seg.h | 12 +- lib/libtle_l4p/tcp_txq.h | 29 +- lib/libtle_l4p/tle_ctx.h | 41 + lib/libtle_l4p/tle_event.h | 2 +- lib/libtle_l4p/tle_stats.h | 101 + lib/libtle_l4p/tle_tcp.h | 60 + lib/libtle_l4p/tle_udp.h | 49 + lib/libtle_l4p/udp_rxtx.c | 186 +- lib/libtle_l4p/udp_stream.c | 347 +- lib/libtle_l4p/udp_stream.h | 9 + lib/libtle_timer/timer.c | 43 +- mk/tle.app.mk | 4 + mk/tle.lib.mk | 4 + test/Makefile | 4 + test/gtest/Makefile | 1 + test/gtest/test_tle_ctx.cpp | 1 + test/gtest/test_tle_tcp_stream.cpp | 4 +- test/gtest/test_tle_tcp_stream.h | 4 + test/gtest/test_tle_udp_destroy.cpp | 1 + test/gtest/test_tle_udp_stream_gen.cpp | 53 +- test/gtest/test_tle_udp_stream_gen.h | 2 + test/packetdrill/COPYING | 339 ++ test/packetdrill/Makefile | 2 + test/packetdrill/Makefile.FreeBSD | 2 + test/packetdrill/Makefile.Linux | 2 + test/packetdrill/Makefile.NetBSD | 2 + test/packetdrill/Makefile.OpenBSD | 2 + test/packetdrill/Makefile.common | 63 + test/packetdrill/README | 58 + test/packetdrill/assert.h | 10 + test/packetdrill/capability.h | 102 + 
test/packetdrill/checksum.c | 239 ++ test/packetdrill/checksum.h | 54 + test/packetdrill/checksum_test.c | 140 + test/packetdrill/code.c | 777 +++++ test/packetdrill/code.h | 122 + test/packetdrill/config.c | 605 ++++ test/packetdrill/config.h | 204 ++ test/packetdrill/configure | 3 + test/packetdrill/contrib/packetdrill.el | 45 + test/packetdrill/contrib/packetdrill.vim | 125 + test/packetdrill/epoll.c | 55 + test/packetdrill/epoll.h | 62 + test/packetdrill/ethernet.h | 75 + test/packetdrill/fd_state.h | 64 + test/packetdrill/file.c | 55 + test/packetdrill/file.h | 52 + test/packetdrill/fmemopen.c | 81 + test/packetdrill/fmemopen.h | 37 + test/packetdrill/gre.h | 102 + test/packetdrill/gre_packet.c | 56 + test/packetdrill/gre_packet.h | 45 + test/packetdrill/hash.c | 430 +++ test/packetdrill/hash.h | 43 + test/packetdrill/hash_map.c | 162 + test/packetdrill/hash_map.h | 56 + test/packetdrill/header.h | 93 + test/packetdrill/icmp.h | 97 + test/packetdrill/icmp_packet.c | 406 +++ test/packetdrill/icmp_packet.h | 55 + test/packetdrill/icmpv6.h | 81 + test/packetdrill/ip.h | 108 + test/packetdrill/ip_address.c | 379 +++ test/packetdrill/ip_address.h | 131 + test/packetdrill/ip_packet.c | 221 ++ test/packetdrill/ip_packet.h | 80 + test/packetdrill/ip_prefix.c | 148 + test/packetdrill/ip_prefix.h | 69 + test/packetdrill/ipv6.h | 92 + test/packetdrill/lexer.l | 280 ++ test/packetdrill/link_layer.c | 104 + test/packetdrill/link_layer.h | 38 + test/packetdrill/logging.c | 51 + test/packetdrill/logging.h | 46 + test/packetdrill/mpls.h | 113 + test/packetdrill/mpls_packet.c | 77 + test/packetdrill/mpls_packet.h | 57 + test/packetdrill/net_utils.c | 172 + test/packetdrill/net_utils.h | 56 + test/packetdrill/netdev.c | 502 +++ test/packetdrill/netdev.h | 99 + test/packetdrill/open_memstream.c | 142 + test/packetdrill/open_memstream.h | 37 + test/packetdrill/packet.c | 327 ++ test/packetdrill/packet.h | 425 +++ test/packetdrill/packet_checksum.c | 116 + 
test/packetdrill/packet_checksum.h | 33 + test/packetdrill/packet_parser.c | 625 ++++ test/packetdrill/packet_parser.h | 53 + test/packetdrill/packet_parser_test.c | 484 +++ test/packetdrill/packet_socket.h | 69 + test/packetdrill/packet_socket_linux.c | 280 ++ test/packetdrill/packet_socket_pcap.c | 290 ++ test/packetdrill/packet_to_string.c | 303 ++ test/packetdrill/packet_to_string.h | 44 + test/packetdrill/packet_to_string_test.c | 301 ++ test/packetdrill/packetdrill.c | 113 + test/packetdrill/packetdrill.h | 108 + test/packetdrill/parse.h | 62 + test/packetdrill/parser.y | 1739 ++++++++++ test/packetdrill/pipe.c | 55 + test/packetdrill/pipe.h | 54 + test/packetdrill/platforms.h | 121 + test/packetdrill/run.c | 695 ++++ test/packetdrill/run.h | 197 ++ test/packetdrill/run_command.c | 55 + test/packetdrill/run_command.h | 38 + test/packetdrill/run_packet.c | 1934 +++++++++++ test/packetdrill/run_packet.h | 61 + test/packetdrill/run_system_call.c | 3561 ++++++++++++++++++++ test/packetdrill/run_system_call.h | 104 + test/packetdrill/script.c | 745 ++++ test/packetdrill/script.h | 308 ++ test/packetdrill/sctp.h | 40 + test/packetdrill/so_testing.c | 169 + test/packetdrill/so_testing.h | 55 + test/packetdrill/socket.c | 80 + test/packetdrill/socket.h | 311 ++ test/packetdrill/symbols.h | 42 + test/packetdrill/symbols_freebsd.c | 310 ++ test/packetdrill/symbols_linux.c | 502 +++ test/packetdrill/symbols_netbsd.c | 320 ++ test/packetdrill/symbols_openbsd.c | 281 ++ test/packetdrill/system.c | 52 + test/packetdrill/system.h | 35 + test/packetdrill/tcp.h | 339 ++ test/packetdrill/tcp_options.c | 70 + test/packetdrill/tcp_options.h | 129 + test/packetdrill/tcp_options_iterator.c | 169 + test/packetdrill/tcp_options_iterator.h | 53 + test/packetdrill/tcp_options_to_string.c | 167 + test/packetdrill/tcp_options_to_string.h | 40 + test/packetdrill/tcp_packet.c | 166 + test/packetdrill/tcp_packet.h | 51 + .../tests/bsd/fast_retransmit/fr-4pkt-sack-bsd.pkt | 38 + 
test/packetdrill/tests/linux/README | 7 + .../tests/linux/blocking/blocking-accept.pkt | 15 + .../tests/linux/blocking/blocking-read.pkt | 25 + .../tests/linux/close/close-read-data-fin.pkt | 38 + .../close/close-so-linger-onoff-1-linger-0-rst.pkt | 28 + .../tests/linux/close/close-unread-data-rst.pkt | 38 + .../linux/connect/http-get-nonblocking-ts.pkt | 34 + .../early_retransmit/er-delayed-2pkt-sack.pkt | 27 + .../early_retransmit/er-delayed-3pkt-sack.pkt | 28 + .../er-delayed-filled-3pkt-sack.pkt | 31 + .../er-delayed-get-ack-3pkt-sack.pkt | 35 + .../linux/early_retransmit/er-quick-2pkt-sack.pkt | 27 + .../linux/early_retransmit/er-quick-3pkt-sack.pkt | 28 + .../prr-ss-ack-below-snd_una-reno.pkt | 51 + .../linux/fast_retransmit/fr-4pkt-sack-linux.pkt | 35 + .../tests/linux/icmp/icmp-all-types.pkt | 71 + .../linux/inet_diag/inet-diag-ipv4-mapped-ipv6.pkt | 29 + .../tests/linux/inet_diag/inet-diag-ipv4.pkt | 28 + .../tests/linux/inet_diag/inet-diag-ipv6.pkt | 29 + .../tests/linux/init_rto/init_rto_passive_open.pkt | 17 + .../tests/linux/initial_window/iw10-base-case.pkt | 21 + .../linux/initial_window/iw10-short-response.pkt | 21 + .../tests/linux/ioctl/ioctl-siocinq-fin.pkt | 30 + .../tests/linux/listen/listen-incoming-ack.pkt | 20 + .../linux/listen/listen-incoming-no-tcp-flags.pkt | 21 + .../tests/linux/listen/listen-incoming-rst.pkt | 22 + .../tests/linux/listen/listen-incoming-syn-ack.pkt | 20 + .../tests/linux/listen/listen-incoming-syn-rst.pkt | 22 + .../tests/linux/listen/listen-unbound.pkt | 5 + .../mss/mss-getsockopt-tcp_maxseg-client-ts.pkt | 17 + .../linux/mss/mss-getsockopt-tcp_maxseg-client.pkt | 14 + ...ss-getsockopt-tcp_maxseg-server-advmss-ipv4.pkt | 29 + ...getsockopt-tcp_maxseg-server-advmss-ts-ipv4.pkt | 30 + .../mss/mss-getsockopt-tcp_maxseg-server-ts.pkt | 20 + .../linux/mss/mss-getsockopt-tcp_maxseg-server.pkt | 17 + .../linux/mss/mss-setsockopt-tcp_maxseg-client.pkt | 24 + .../linux/mss/mss-setsockopt-tcp_maxseg-server.pkt | 27 + 
.../pmtu_discovery/pmtud-10pkt-1460-to-1160.pkt | 54 + .../pmtu_discovery/pmtud-1pkt-1460-to-1160.pkt | 36 + .../receiver_rtt/rcv-rtt-with-timestamps-new.pkt | 57 + .../rcv-rtt-without-timestamps-new.pkt | 62 + test/packetdrill/tests/linux/run_tests.sh | 6 + .../linux/sack/sack-shift-sacked-1-2-3-fack.pkt | 47 + .../linux/sack/sack-shift-sacked-1-2:6-fack.pkt | 39 + .../tests/linux/shutdown/shutdown-rd-close.pkt | 29 + .../tests/linux/shutdown/shutdown-rd-wr-close.pkt | 45 + .../tests/linux/shutdown/shutdown-rdwr-close.pkt | 26 + .../tests/linux/shutdown/shutdown-wr-close.pkt | 29 + ...undo-fr-ack-then-dsack-on-ack-below-snd_una.pkt | 55 + .../linux/undo/undo-fr-acks-dropped-then-dsack.pkt | 44 + .../tests/tldk/delay_ack/delay-ack-tldk.pkt | 26 + .../tests/tldk/fast_retransmit/fr-4pkt-tldk.pkt | 35 + .../keep_alive/keep-alive-after-accept-tldk.pkt | 50 + .../keep_alive/keep-alive-before-connect-tldk.pkt | 37 + .../keep_alive/keep-alive-enable-disable-tldk.pkt | 26 + .../tldk/out_of_order/ofo-simple-3pkt-tldk.pkt | 27 + .../tests/tldk/tso/tso-segment-split.pkt | 63 + test/packetdrill/tun.h | 117 + test/packetdrill/types.c | 44 + test/packetdrill/types.h | 207 ++ test/packetdrill/uapi_linux.h | 296 ++ test/packetdrill/udp.h | 44 + test/packetdrill/udp_packet.c | 91 + test/packetdrill/udp_packet.h | 44 + test/packetdrill/unaligned.h | 53 + test/packetdrill/wire_client.c | 302 ++ test/packetdrill/wire_client.h | 69 + test/packetdrill/wire_client_netdev.c | 167 + test/packetdrill/wire_client_netdev.h | 37 + test/packetdrill/wire_conn.c | 254 ++ test/packetdrill/wire_conn.h | 88 + test/packetdrill/wire_protocol.c | 49 + test/packetdrill/wire_protocol.h | 66 + test/packetdrill/wire_server.c | 537 +++ test/packetdrill/wire_server.h | 36 + test/packetdrill/wire_server_netdev.c | 204 ++ test/packetdrill/wire_server_netdev.h | 47 + test/packetdrill/wrap.c | 125 + test/packetdrill/wrap.h | 32 + 289 files changed, 45114 insertions(+), 1313 deletions(-) create mode 100644 
afl/lower_constructor_priority.diff create mode 100755 afl/run.sh create mode 100644 afl/seeds/seed.txt create mode 100644 angora/dpdk_abilist.txt create mode 100644 angora/run.sh create mode 100644 angora/seeds/seed.txt create mode 100644 dpdk/dpdk-v18.11_patches/0001-eal-don-t-start-the-interrupt-mp-thread.patch create mode 100644 dpdk/dpdk-v18.11_patches/0002-eal-prioritize-constructor.patch create mode 100644 dpdk/dpdk-v18.11_patches/0003-mbuf-add-single-linked-list.patch create mode 100644 dpdk/dpdk-v18.11_patches/0004-net-virtio-user-add-rss-update-for-virtio-user.patch create mode 100644 dpdk/dpdk-v18.11_patches/0005-net-virtio-user-support-raw-socket-as-backend.patch create mode 100644 dpdk/dpdk-v18.11_patches/0006-mempool-add-dynamic-mempool-support.patch create mode 100644 dpdk/dpdk-v18.11_patches/0007-mbuf-add-dynamic-mbuf-mempool-support.patch create mode 100644 dpdk/dpdk-v18.11_patches/0008-mempool-prioritize-constructor.patch create mode 100644 dpdk/dpdk-v18.11_patches/0009-net-virtio-fill-desc-limit.patch create mode 100644 lib/libtle_glue/Makefile create mode 100644 lib/libtle_glue/arp.c create mode 100644 lib/libtle_glue/be.c create mode 100644 lib/libtle_glue/config.h create mode 100644 lib/libtle_glue/ctx.c create mode 100644 lib/libtle_glue/ctx.h create mode 100644 lib/libtle_glue/epoll.c create mode 100644 lib/libtle_glue/fd.c create mode 100644 lib/libtle_glue/fd.h create mode 100644 lib/libtle_glue/gateway.h create mode 100644 lib/libtle_glue/icmp.c create mode 100644 lib/libtle_glue/init.c create mode 100644 lib/libtle_glue/internal.h create mode 100644 lib/libtle_glue/log.h create mode 100644 lib/libtle_glue/ndp.h create mode 100644 lib/libtle_glue/packetdrill.c create mode 100644 lib/libtle_glue/packetdrill.h create mode 100644 lib/libtle_glue/poll.c create mode 100644 lib/libtle_glue/port.c create mode 100644 lib/libtle_glue/rxcb.c create mode 100644 lib/libtle_glue/rxtx.c create mode 100644 lib/libtle_glue/select.c create mode 100644 
lib/libtle_glue/sock.h create mode 100644 lib/libtle_glue/socket.c create mode 100644 lib/libtle_glue/sym.c create mode 100644 lib/libtle_glue/sym.h create mode 100644 lib/libtle_glue/tcp.c create mode 100644 lib/libtle_glue/tle_glue.h create mode 100644 lib/libtle_glue/udp.c create mode 100644 lib/libtle_glue/util.c create mode 100644 lib/libtle_glue/util.h create mode 100644 lib/libtle_glue/zerocopy.h create mode 100644 lib/libtle_l4p/port_statmap.h create mode 100644 lib/libtle_l4p/tcp_rxtx.h create mode 100644 lib/libtle_l4p/tle_stats.h create mode 100644 test/packetdrill/COPYING create mode 100644 test/packetdrill/Makefile create mode 100644 test/packetdrill/Makefile.FreeBSD create mode 100644 test/packetdrill/Makefile.Linux create mode 100644 test/packetdrill/Makefile.NetBSD create mode 100644 test/packetdrill/Makefile.OpenBSD create mode 100644 test/packetdrill/Makefile.common create mode 100644 test/packetdrill/README create mode 100644 test/packetdrill/assert.h create mode 100644 test/packetdrill/capability.h create mode 100644 test/packetdrill/checksum.c create mode 100644 test/packetdrill/checksum.h create mode 100644 test/packetdrill/checksum_test.c create mode 100644 test/packetdrill/code.c create mode 100644 test/packetdrill/code.h create mode 100644 test/packetdrill/config.c create mode 100644 test/packetdrill/config.h create mode 100755 test/packetdrill/configure create mode 100644 test/packetdrill/contrib/packetdrill.el create mode 100644 test/packetdrill/contrib/packetdrill.vim create mode 100644 test/packetdrill/epoll.c create mode 100644 test/packetdrill/epoll.h create mode 100644 test/packetdrill/ethernet.h create mode 100644 test/packetdrill/fd_state.h create mode 100644 test/packetdrill/file.c create mode 100644 test/packetdrill/file.h create mode 100644 test/packetdrill/fmemopen.c create mode 100644 test/packetdrill/fmemopen.h create mode 100644 test/packetdrill/gre.h create mode 100644 test/packetdrill/gre_packet.c create mode 100644 
test/packetdrill/gre_packet.h create mode 100644 test/packetdrill/hash.c create mode 100644 test/packetdrill/hash.h create mode 100644 test/packetdrill/hash_map.c create mode 100644 test/packetdrill/hash_map.h create mode 100644 test/packetdrill/header.h create mode 100644 test/packetdrill/icmp.h create mode 100644 test/packetdrill/icmp_packet.c create mode 100644 test/packetdrill/icmp_packet.h create mode 100644 test/packetdrill/icmpv6.h create mode 100644 test/packetdrill/ip.h create mode 100644 test/packetdrill/ip_address.c create mode 100644 test/packetdrill/ip_address.h create mode 100644 test/packetdrill/ip_packet.c create mode 100644 test/packetdrill/ip_packet.h create mode 100644 test/packetdrill/ip_prefix.c create mode 100644 test/packetdrill/ip_prefix.h create mode 100644 test/packetdrill/ipv6.h create mode 100644 test/packetdrill/lexer.l create mode 100644 test/packetdrill/link_layer.c create mode 100644 test/packetdrill/link_layer.h create mode 100644 test/packetdrill/logging.c create mode 100644 test/packetdrill/logging.h create mode 100644 test/packetdrill/mpls.h create mode 100644 test/packetdrill/mpls_packet.c create mode 100644 test/packetdrill/mpls_packet.h create mode 100644 test/packetdrill/net_utils.c create mode 100644 test/packetdrill/net_utils.h create mode 100644 test/packetdrill/netdev.c create mode 100644 test/packetdrill/netdev.h create mode 100644 test/packetdrill/open_memstream.c create mode 100644 test/packetdrill/open_memstream.h create mode 100644 test/packetdrill/packet.c create mode 100644 test/packetdrill/packet.h create mode 100644 test/packetdrill/packet_checksum.c create mode 100644 test/packetdrill/packet_checksum.h create mode 100644 test/packetdrill/packet_parser.c create mode 100644 test/packetdrill/packet_parser.h create mode 100644 test/packetdrill/packet_parser_test.c create mode 100644 test/packetdrill/packet_socket.h create mode 100644 test/packetdrill/packet_socket_linux.c create mode 100644 
test/packetdrill/packet_socket_pcap.c create mode 100644 test/packetdrill/packet_to_string.c create mode 100644 test/packetdrill/packet_to_string.h create mode 100644 test/packetdrill/packet_to_string_test.c create mode 100644 test/packetdrill/packetdrill.c create mode 100644 test/packetdrill/packetdrill.h create mode 100644 test/packetdrill/parse.h create mode 100644 test/packetdrill/parser.y create mode 100644 test/packetdrill/pipe.c create mode 100644 test/packetdrill/pipe.h create mode 100644 test/packetdrill/platforms.h create mode 100644 test/packetdrill/run.c create mode 100644 test/packetdrill/run.h create mode 100644 test/packetdrill/run_command.c create mode 100644 test/packetdrill/run_command.h create mode 100644 test/packetdrill/run_packet.c create mode 100644 test/packetdrill/run_packet.h create mode 100644 test/packetdrill/run_system_call.c create mode 100644 test/packetdrill/run_system_call.h create mode 100644 test/packetdrill/script.c create mode 100644 test/packetdrill/script.h create mode 100644 test/packetdrill/sctp.h create mode 100644 test/packetdrill/so_testing.c create mode 100644 test/packetdrill/so_testing.h create mode 100644 test/packetdrill/socket.c create mode 100644 test/packetdrill/socket.h create mode 100644 test/packetdrill/symbols.h create mode 100644 test/packetdrill/symbols_freebsd.c create mode 100644 test/packetdrill/symbols_linux.c create mode 100644 test/packetdrill/symbols_netbsd.c create mode 100644 test/packetdrill/symbols_openbsd.c create mode 100644 test/packetdrill/system.c create mode 100644 test/packetdrill/system.h create mode 100644 test/packetdrill/tcp.h create mode 100644 test/packetdrill/tcp_options.c create mode 100644 test/packetdrill/tcp_options.h create mode 100644 test/packetdrill/tcp_options_iterator.c create mode 100644 test/packetdrill/tcp_options_iterator.h create mode 100644 test/packetdrill/tcp_options_to_string.c create mode 100644 test/packetdrill/tcp_options_to_string.h create mode 100644 
test/packetdrill/tcp_packet.c create mode 100644 test/packetdrill/tcp_packet.h create mode 100644 test/packetdrill/tests/bsd/fast_retransmit/fr-4pkt-sack-bsd.pkt create mode 100644 test/packetdrill/tests/linux/README create mode 100644 test/packetdrill/tests/linux/blocking/blocking-accept.pkt create mode 100644 test/packetdrill/tests/linux/blocking/blocking-read.pkt create mode 100644 test/packetdrill/tests/linux/close/close-read-data-fin.pkt create mode 100644 test/packetdrill/tests/linux/close/close-so-linger-onoff-1-linger-0-rst.pkt create mode 100644 test/packetdrill/tests/linux/close/close-unread-data-rst.pkt create mode 100644 test/packetdrill/tests/linux/connect/http-get-nonblocking-ts.pkt create mode 100644 test/packetdrill/tests/linux/early_retransmit/er-delayed-2pkt-sack.pkt create mode 100644 test/packetdrill/tests/linux/early_retransmit/er-delayed-3pkt-sack.pkt create mode 100644 test/packetdrill/tests/linux/early_retransmit/er-delayed-filled-3pkt-sack.pkt create mode 100644 test/packetdrill/tests/linux/early_retransmit/er-delayed-get-ack-3pkt-sack.pkt create mode 100644 test/packetdrill/tests/linux/early_retransmit/er-quick-2pkt-sack.pkt create mode 100644 test/packetdrill/tests/linux/early_retransmit/er-quick-3pkt-sack.pkt create mode 100644 test/packetdrill/tests/linux/fast_recovery/prr-ss-ack-below-snd_una-reno.pkt create mode 100644 test/packetdrill/tests/linux/fast_retransmit/fr-4pkt-sack-linux.pkt create mode 100644 test/packetdrill/tests/linux/icmp/icmp-all-types.pkt create mode 100644 test/packetdrill/tests/linux/inet_diag/inet-diag-ipv4-mapped-ipv6.pkt create mode 100644 test/packetdrill/tests/linux/inet_diag/inet-diag-ipv4.pkt create mode 100644 test/packetdrill/tests/linux/inet_diag/inet-diag-ipv6.pkt create mode 100644 test/packetdrill/tests/linux/init_rto/init_rto_passive_open.pkt create mode 100755 test/packetdrill/tests/linux/initial_window/iw10-base-case.pkt create mode 100755 
test/packetdrill/tests/linux/initial_window/iw10-short-response.pkt create mode 100644 test/packetdrill/tests/linux/ioctl/ioctl-siocinq-fin.pkt create mode 100644 test/packetdrill/tests/linux/listen/listen-incoming-ack.pkt create mode 100644 test/packetdrill/tests/linux/listen/listen-incoming-no-tcp-flags.pkt create mode 100644 test/packetdrill/tests/linux/listen/listen-incoming-rst.pkt create mode 100644 test/packetdrill/tests/linux/listen/listen-incoming-syn-ack.pkt create mode 100644 test/packetdrill/tests/linux/listen/listen-incoming-syn-rst.pkt create mode 100644 test/packetdrill/tests/linux/listen/listen-unbound.pkt create mode 100644 test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-client-ts.pkt create mode 100644 test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-client.pkt create mode 100644 test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-server-advmss-ipv4.pkt create mode 100644 test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-server-advmss-ts-ipv4.pkt create mode 100644 test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-server-ts.pkt create mode 100644 test/packetdrill/tests/linux/mss/mss-getsockopt-tcp_maxseg-server.pkt create mode 100644 test/packetdrill/tests/linux/mss/mss-setsockopt-tcp_maxseg-client.pkt create mode 100644 test/packetdrill/tests/linux/mss/mss-setsockopt-tcp_maxseg-server.pkt create mode 100644 test/packetdrill/tests/linux/pmtu_discovery/pmtud-10pkt-1460-to-1160.pkt create mode 100644 test/packetdrill/tests/linux/pmtu_discovery/pmtud-1pkt-1460-to-1160.pkt create mode 100644 test/packetdrill/tests/linux/receiver_rtt/rcv-rtt-with-timestamps-new.pkt create mode 100644 test/packetdrill/tests/linux/receiver_rtt/rcv-rtt-without-timestamps-new.pkt create mode 100755 test/packetdrill/tests/linux/run_tests.sh create mode 100644 test/packetdrill/tests/linux/sack/sack-shift-sacked-1-2-3-fack.pkt create mode 100644 test/packetdrill/tests/linux/sack/sack-shift-sacked-1-2:6-fack.pkt create mode 100644 
test/packetdrill/tests/linux/shutdown/shutdown-rd-close.pkt create mode 100644 test/packetdrill/tests/linux/shutdown/shutdown-rd-wr-close.pkt create mode 100644 test/packetdrill/tests/linux/shutdown/shutdown-rdwr-close.pkt create mode 100644 test/packetdrill/tests/linux/shutdown/shutdown-wr-close.pkt create mode 100644 test/packetdrill/tests/linux/undo/undo-fr-ack-then-dsack-on-ack-below-snd_una.pkt create mode 100644 test/packetdrill/tests/linux/undo/undo-fr-acks-dropped-then-dsack.pkt create mode 100644 test/packetdrill/tests/tldk/delay_ack/delay-ack-tldk.pkt create mode 100644 test/packetdrill/tests/tldk/fast_retransmit/fr-4pkt-tldk.pkt create mode 100644 test/packetdrill/tests/tldk/keep_alive/keep-alive-after-accept-tldk.pkt create mode 100644 test/packetdrill/tests/tldk/keep_alive/keep-alive-before-connect-tldk.pkt create mode 100644 test/packetdrill/tests/tldk/keep_alive/keep-alive-enable-disable-tldk.pkt create mode 100644 test/packetdrill/tests/tldk/out_of_order/ofo-simple-3pkt-tldk.pkt create mode 100644 test/packetdrill/tests/tldk/tso/tso-segment-split.pkt create mode 100644 test/packetdrill/tun.h create mode 100644 test/packetdrill/types.c create mode 100644 test/packetdrill/types.h create mode 100644 test/packetdrill/uapi_linux.h create mode 100644 test/packetdrill/udp.h create mode 100644 test/packetdrill/udp_packet.c create mode 100644 test/packetdrill/udp_packet.h create mode 100644 test/packetdrill/unaligned.h create mode 100644 test/packetdrill/wire_client.c create mode 100644 test/packetdrill/wire_client.h create mode 100644 test/packetdrill/wire_client_netdev.c create mode 100644 test/packetdrill/wire_client_netdev.h create mode 100644 test/packetdrill/wire_conn.c create mode 100644 test/packetdrill/wire_conn.h create mode 100644 test/packetdrill/wire_protocol.c create mode 100644 test/packetdrill/wire_protocol.h create mode 100644 test/packetdrill/wire_server.c create mode 100644 test/packetdrill/wire_server.h create mode 100644 
test/packetdrill/wire_server_netdev.c create mode 100644 test/packetdrill/wire_server_netdev.h create mode 100644 test/packetdrill/wrap.c create mode 100644 test/packetdrill/wrap.h diff --git a/.gitreview b/.gitreview index 3559d4a..418bfa7 100644 --- a/.gitreview +++ b/.gitreview @@ -2,3 +2,4 @@ host=gerrit.fd.io port=29418 project=tldk +defaultbranch=dev-next-socket diff --git a/Makefile b/Makefile index 474ada6..10c276d 100644 --- a/Makefile +++ b/Makefile @@ -22,6 +22,7 @@ endif RTE_TARGET ?= x86_64-native-linuxapp-gcc +DIRS-y += dpdk DIRS-y += lib DIRS-y += examples DIRS-y += test @@ -32,11 +33,18 @@ MAKEFLAGS += --no-print-directory O ?= $(TLDK_ROOT)/${RTE_TARGET} BASE_OUTPUT ?= $(abspath $(O)) +DPDK_LIBS_PATH := $(TLDK_ROOT)/dpdk/install/lib +TLDK_LIBS_PATH := $(TLDK_ROOT)/$(RTE_TARGET)/lib +LIBS := + .PHONY: all all: $(DIRS-y) .PHONY: clean -clean: $(DIRS-y) +clean: + @make clean -C test/packetdrill + @rm -rf $(RTE_TARGET) + @rm -rf libtldk.so libtldk.a .PHONY: $(DIRS-y) $(DIRS-y): $(RTE_SDK)/mk/rte.vars.mk @@ -48,8 +56,37 @@ $(DIRS-y): $(RTE_SDK)/mk/rte.vars.mk CUR_SUBDIR=$(CUR_SUBDIR)/$(@) \ S=$(CURDIR)/$(@) \ RTE_TARGET=$(RTE_TARGET) \ + EXTRA_CFLAGS="-fPIC" \ $(filter-out $(DIRS-y),$(MAKECMDGOALS)) +test: libtldk.a libtldk.so + +libtldk.so: lib + $(eval LIBS = $(wildcard $(DPDK_LIBS_PATH)/librte*.a $(TLDK_LIBS_PATH)/*.a)) + @gcc -shared -o libtldk.so -L$(DPDK_LIBS_PATH) -L$(TLDK_LIBS_PATH) \ + -Wl,--whole-archive $(LIBS) -Wl,--no-whole-archive \ + -lpthread -ldl -lnuma + +define repack +@echo -- repack $1 --- +@rm -rf tmpxyz; rm -f $1; mkdir tmpxyz; cd tmpxyz; \ + for f in $(LIBS) ; do \ + fn=$$(basename $$f) ; \ + echo $$fn ; \ + mkdir $$fn"_obj" ; \ + cd $$fn"_obj" ; \ + ar x $$f ; \ + cd .. 
; \ + done; \ +ar cru ../$1 $$(find */*.o | paste -sd " " -); cd ..; rm -rf tmpxyz +endef + +libtldk.a: lib + $(eval LIBS = $(wildcard $(DPDK_LIBS_PATH)/librte*.a)) + $(call repack,libdpdk.a) + $(eval LIBS = $(wildcard $(DPDK_LIBS_PATH)/librte*.a $(TLDK_LIBS_PATH)/*.a)) + $(call repack,libtldk.a) + $(RTE_SDK)/mk/rte.vars.mk: ifeq ($(RTE_SDK),$(LOCAL_RTE_SDK)) @make RTE_TARGET=$(RTE_TARGET) config all -C $(TLDK_ROOT)/dpdk/ diff --git a/README b/README index 2ca150b..792bdef 100644 --- a/README +++ b/README @@ -1,7 +1,5 @@ 1. OVERVIEW - TLDK project scope is as follows: - 1) To implement a set of libraries for L4 protocol processing (UDP, TCP etc.) for both IPv4 and IPv6. @@ -16,8 +14,7 @@ code for setup, manage and perform actual IO over underlying devices are all out of scope of these libraries. - The only information these libraries need to know about the - underlying devices: + The only information these libraries need about the underlying devices: - supported HW offloads - MTU and L3/L2 addresses That allows the libraries to fill L2/L3 headers and mbuf metadata @@ -36,12 +33,22 @@ The library uses siphash logic from the below source https://github.com/veorq/SipHash +2. APIs + + TLDK provides three series of APIs: + - TLDK native APIs, provided by libtle_l4p. + - Posix APIs, provided by libtle_glue with PRELOAD compile macro. + - Posix APIs with changed symbol names, provided by libtle_glue without PRELOAD macro. + + +3. INSTALLATION GUIDE -2. INSTALLATION GUIDE + - Original guide + ---------------- 1) Obtain latest supported DPDK version and build it. (refer to http://dpdk.org for information how to download and build it). - Currently supported(tested) DPDK versions: 18.11 LTS. + Currently supported(tested) DPDK versions: 16.11 LTS, 17.11 LTS, 18.02. 2) Make sure that RTE_SDK and RTE_TARGET DPDK related environment variables are setup correctly. 3) Go to the TLDK root directory and type: 'make all'. 
@@ -58,6 +65,29 @@ make all ./x86_64-native-linuxapp-gcc/app/l4fwd ... + + - For preload use + ----------------- + + Debug: + + $ make DPDK_DEBUG=y EXTRA_CFLAGS="-g -O0 -fPIC -DPRELOAD" all + + Release: + + $ make EXTRA_CFLAGS="-g -fPIC -DPRELOAD" all + + - For TLDK API use + ------------------ + + Debug: + + $ make DPDK_DEBUG=y EXTRA_CFLAGS="-g -O0 -fPIC" all + + Release: + + $ make EXTRA_CFLAGS="-g -O3 -fPIC" all + 3. CONTENTS $(TLDK_ROOT) @@ -74,6 +104,8 @@ | +--libtle_l4p - implementation of the TCP/UDP packet processing | | | +--libtle_timer - implementation of the timer library + | | + | +--libtle_glue - socket glue layer with arp, icmp, epoll, etc | +----examples | | @@ -88,3 +120,159 @@ | | (googletest) | | | +--timer - UT for libtle_timer (standalone app) + | | + | +--packetdrill - UT for stack (standalone app) + + +5. Features + + Done: + - posix interface + - loopback device + - regression test + - multi-thread + - lightweight mem + - tcp_info (partial) + - fd management + - arp request/reply + - icmp reply + - interrupt mode + - blocking recv/send + - TSO + - UFO + + TODO: + - fuzzing + - SACK + - RACK + - zerocopy APIs + - batching APIs + - multi-process + - numa awareness + - context recycle on thread exit + +5. Thread model + + - Multi-process is still not fully supported. 
+ + - Symmetric multi-thread + + (app thread) (app thread) (app thread) + \ \ \ + / / / + \ \ \ + -------------------------------------------------------- + | FD management, Socket APIs (FE) | + -------------------------------------------------------- + + ----------- ----------- ----------- + | | | | | | + | ctx | | ctx | | ctx | + | | | | | | + ----------- ----------- ----------- + \__ | __/ + \__ | __/ + \__ | __/ + \__ | __/ + ------------------------- + | (RSS) NIC (FDIR) | + ------------------------- + + - Lookaside multi-thread + + (app thread) (app thread) (io thread) + \ \ \ + / / / + \ \ \ + ------------------------------------------------------ + | FD management, Socket APIs (FE) | + ------------------------------------------------------ + / + \ + / + ------------------------------------------------------ + | | + | ctx | + | | + ------------------------------------------------------ + | + | + ------------------------- + | NIC | + ------------------------- + +6. How to run + + We have two setups which need their own preparation. + + - virtio-user: test with virtio-user + vhost-kernel; + - physical NIC: test with physical NIC bound to vfio. + + If you are using physical NIC: + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + a. Set Linux boot options (Only needed if you will use physical NIC) + a1. Add below configuration into GRUB_CMDLINE_LINUX in /etc/default/grub + "intel_iommu=on iommu=pt" + + a2. Update grub + $ sudo grub2-mkconfig -o /boot/grub2/grub.cfg + + If you want to use 1GB hugepages, you can also add below content in the + boot cmdline: + "default_hugepagesz=1G hugepagesz=1G hugepages=2" + + b. Adjust RLIMIT_MEMLOCK (Only needed if you will use physical NIC) + Add below two lines into /etc/security/limits.conf + "* soft memlock 4194304 + * hard memlock 4194304" + + c. Reboot system + + d. 
Bind NIC to vfio-pci + + $ sudo modprobe vfio-pci + $ sudo ./usertools/dpdk-devbind.py -b vfio-pci 0000:01:00.1 + $ sudo chmod 666 /dev/vfio/16 (16 is just an example) + + If you are using virtio-user: + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + a. Prepare vhost-kernel + + $ sudo modprobe vhost-net + (if you don't have those modules, you have to compile them yourself) + $ sudo chmod 666 /dev/vhost-net + $ sudo tunctl -u <username> + + b. Prepare the vNIC + + $ export DPDK_VNIC="--vdev=virtio_user0,path=/dev/vhost-net,queue_size=1024,iface=tap0" + + For both cases, we need to: + ~~~~~~~~~~~~~~~~~~~~~~~~~~ + + $ sudo chmod 777 /dev/hugepages + $ export DPDK_IP=1.1.1.1 + + Note: for a specific test example, you can refer to the example's commit log. + +7. How to run packetdrill tests: + + Compile it in LOOK_ASIDE_BACKEND mode: + + $ make PACKETDRILL=y EXTRA_CFLAGS="-g -O0 -fPIC -march=native -DLOOK_ASIDE_BACKEND " all + + To run it: + + $ test/packetdrill/packetdrill --local_ip=192.168.0.2 \ + --remote_ip=192.0.2.1 --so_filename=`pwd`/libtldk.so \ + test/packetdrill/tests/tldk/fast_retransmit/fr-4pkt-tldk.pkt + +8. Tested Examples + + - examples/client + - examples/server + - wget (epoll) + - curl (poll) + - haproxy (multi-thread mode) diff --git a/afl/lower_constructor_priority.diff b/afl/lower_constructor_priority.diff new file mode 100644 index 0000000..b1eba07 --- /dev/null +++ b/afl/lower_constructor_priority.diff @@ -0,0 +1,16 @@ +diff --git a/llvm_mode/afl-llvm-rt.o.c b/llvm_mode/afl-llvm-rt.o.c +index debde20..69e2e4c 100644 +--- a/llvm_mode/afl-llvm-rt.o.c ++++ b/llvm_mode/afl-llvm-rt.o.c +@@ -39,9 +39,9 @@ + the LLVM-generated runtime initialization pass, not before. 
*/ + + #ifdef USE_TRACE_PC +-# define CONST_PRIO 5 ++# define CONST_PRIO 2005 + #else +-# define CONST_PRIO 0 ++# define CONST_PRIO 2000 + #endif /* ^USE_TRACE_PC */ + + #include diff --git a/afl/run.sh b/afl/run.sh new file mode 100755 index 0000000..23213f6 --- /dev/null +++ b/afl/run.sh @@ -0,0 +1 @@ +`pwd`/AFLplusplus/afl-fuzz -m 4096 -i seeds -o output ../x86_64-native-linuxapp-gcc/app/tcp_lo 127.0.0.1 1234 @@ diff --git a/afl/seeds/seed.txt b/afl/seeds/seed.txt new file mode 100644 index 0000000..1ed4d76 Binary files /dev/null and b/afl/seeds/seed.txt differ diff --git a/angora/dpdk_abilist.txt b/angora/dpdk_abilist.txt new file mode 100644 index 0000000..f02f7c7 --- /dev/null +++ b/angora/dpdk_abilist.txt @@ -0,0 +1,1756 @@ +fun:pci_find_max_end_va=uninstrumented +fun:pci_parse_one_sysfs_resource=uninstrumented +fun:pci_update_device=uninstrumented +fun:rte_pci_get_iommu_class=uninstrumented +fun:rte_pci_ioport_map=uninstrumented +fun:rte_pci_ioport_read=uninstrumented +fun:rte_pci_ioport_unmap=uninstrumented +fun:rte_pci_ioport_write=uninstrumented +fun:rte_pci_map_device=uninstrumented +fun:rte_pci_read_config=uninstrumented +fun:rte_pci_scan=uninstrumented +fun:rte_pci_unmap_device=uninstrumented +fun:rte_pci_write_config=uninstrumented +fun:pci_name_set=uninstrumented +fun:rte_pci_add_device=uninstrumented +fun:rte_pci_dump=uninstrumented +fun:rte_pci_get_sysfs_path=uninstrumented +fun:rte_pci_insert_device=uninstrumented +fun:rte_pci_match=uninstrumented +fun:rte_pci_probe=uninstrumented +fun:rte_pci_register=uninstrumented +fun:rte_pci_unregister=uninstrumented +fun:pci_uio_map_resource=uninstrumented +fun:pci_uio_remap_resource=uninstrumented +fun:pci_uio_unmap_resource=uninstrumented +fun:rte_pci_dev_iterate=uninstrumented +fun:pci_uio_alloc_resource=uninstrumented +fun:pci_uio_free_resource=uninstrumented +fun:pci_uio_ioport_map=uninstrumented +fun:pci_uio_ioport_read=uninstrumented +fun:pci_uio_ioport_unmap=uninstrumented 
+fun:pci_uio_ioport_write=uninstrumented +fun:pci_uio_map_resource_by_index=uninstrumented +fun:pci_uio_read_config=uninstrumented +fun:pci_uio_write_config=uninstrumented +fun:pci_vfio_ioport_map=uninstrumented +fun:pci_vfio_ioport_read=uninstrumented +fun:pci_vfio_ioport_unmap=uninstrumented +fun:pci_vfio_ioport_write=uninstrumented +fun:pci_vfio_is_enabled=uninstrumented +fun:pci_vfio_map_resource=uninstrumented +fun:pci_vfio_read_config=uninstrumented +fun:pci_vfio_unmap_resource=uninstrumented +fun:pci_vfio_write_config=uninstrumented +fun:rte_vdev_add_custom_scan=uninstrumented +fun:rte_vdev_find_device=uninstrumented +fun:rte_vdev_init=uninstrumented +fun:rte_vdev_register=uninstrumented +fun:rte_vdev_remove_custom_scan=uninstrumented +fun:rte_vdev_uninit=uninstrumented +fun:rte_vdev_unregister=uninstrumented +fun:rte_vdev_dev_iterate=uninstrumented +fun:cmdline_free=uninstrumented +fun:cmdline_in=uninstrumented +fun:cmdline_interact=uninstrumented +fun:cmdline_new=uninstrumented +fun:cmdline_poll=uninstrumented +fun:cmdline_printf=uninstrumented +fun:cmdline_quit=uninstrumented +fun:cmdline_set_prompt=uninstrumented +fun:cmdline_write_char=uninstrumented +fun:cirbuf_add_buf_head=uninstrumented +fun:cirbuf_add_buf_tail=uninstrumented +fun:cirbuf_add_head=uninstrumented +fun:cirbuf_add_head_safe=uninstrumented +fun:cirbuf_add_tail=uninstrumented +fun:cirbuf_add_tail_safe=uninstrumented +fun:cirbuf_align_left=uninstrumented +fun:cirbuf_align_right=uninstrumented +fun:cirbuf_del_buf_head=uninstrumented +fun:cirbuf_del_buf_tail=uninstrumented +fun:cirbuf_del_head=uninstrumented +fun:cirbuf_del_head_safe=uninstrumented +fun:cirbuf_del_tail=uninstrumented +fun:cirbuf_del_tail_safe=uninstrumented +fun:cirbuf_get_buf_head=uninstrumented +fun:cirbuf_get_buf_tail=uninstrumented +fun:cirbuf_get_head=uninstrumented +fun:cirbuf_get_tail=uninstrumented +fun:cirbuf_init=uninstrumented +fun:cmdline_complete=uninstrumented +fun:cmdline_isendofcommand=uninstrumented 
+fun:cmdline_isendoftoken=uninstrumented +fun:cmdline_parse=uninstrumented +fun:cmdline_get_help_etheraddr=uninstrumented +fun:cmdline_parse_etheraddr=uninstrumented +fun:cmdline_get_help_ipaddr=uninstrumented +fun:cmdline_parse_ipaddr=uninstrumented +fun:cmdline_get_help_num=uninstrumented +fun:cmdline_parse_num=uninstrumented +fun:cmdline_get_help_portlist=uninstrumented +fun:cmdline_parse_portlist=uninstrumented +fun:cmdline_complete_get_elt_string=uninstrumented +fun:cmdline_complete_get_nb_string=uninstrumented +fun:cmdline_get_help_string=uninstrumented +fun:cmdline_parse_string=uninstrumented +fun:rdline_add_history=uninstrumented +fun:rdline_char_in=uninstrumented +fun:rdline_clear_history=uninstrumented +fun:rdline_get_buffer=uninstrumented +fun:rdline_get_history_item=uninstrumented +fun:rdline_init=uninstrumented +fun:rdline_newline=uninstrumented +fun:rdline_quit=uninstrumented +fun:rdline_redisplay=uninstrumented +fun:rdline_reset=uninstrumented +fun:rdline_restart=uninstrumented +fun:rdline_stop=uninstrumented +fun:cmdline_file_new=uninstrumented +fun:cmdline_stdin_exit=uninstrumented +fun:cmdline_stdin_new=uninstrumented +fun:vt100_init=uninstrumented +fun:vt100_parser=uninstrumented +fun:eal_create_runtime_dir=uninstrumented +fun:eal_parse_sysfs_value=uninstrumented +fun:eal_proc_type_detect=uninstrumented +fun:rte_eal_check_module=uninstrumented +fun:rte_eal_cleanup=uninstrumented +fun:rte_eal_create_uio_dev=uninstrumented +fun:rte_eal_get_configuration=uninstrumented +fun:rte_eal_get_runtime_dir=uninstrumented +fun:rte_eal_has_hugepages=uninstrumented +fun:rte_eal_has_pci=uninstrumented +fun:rte_eal_init=uninstrumented +fun:rte_eal_iopl_init=uninstrumented +fun:rte_eal_iova_mode=uninstrumented +fun:rte_eal_lcore_role=uninstrumented +fun:rte_eal_mbuf_user_pool_ops=uninstrumented +fun:rte_eal_process_type=uninstrumented +fun:rte_eal_vfio_intr_mode=uninstrumented +fun:rte_set_application_usage_hook=uninstrumented 
+fun:rte_eal_alarm_cancel=uninstrumented +fun:rte_eal_alarm_init=uninstrumented +fun:rte_eal_alarm_set=uninstrumented +fun:rte_bus_dump=uninstrumented +fun:rte_bus_find=uninstrumented +fun:rte_bus_find_by_device=uninstrumented +fun:rte_bus_find_by_device_name=uninstrumented +fun:rte_bus_find_by_name=uninstrumented +fun:rte_bus_get_iommu_class=uninstrumented +fun:rte_bus_probe=uninstrumented +fun:rte_bus_register=uninstrumented +fun:rte_bus_scan=uninstrumented +fun:rte_bus_sigbus_handler=uninstrumented +fun:rte_bus_unregister=uninstrumented +fun:rte_class_find=uninstrumented +fun:rte_class_find_by_name=uninstrumented +fun:rte_class_register=uninstrumented +fun:rte_class_unregister=uninstrumented +fun:rte_cpu_check_supported=uninstrumented +fun:rte_cpu_is_supported=uninstrumented +fun:local_dev_probe=uninstrumented +fun:local_dev_remove=uninstrumented +fun:rte_dev_event_callback_process=uninstrumented +fun:rte_dev_event_callback_register=uninstrumented +fun:rte_dev_event_callback_unregister=uninstrumented +fun:rte_dev_is_probed=uninstrumented +fun:rte_dev_iterator_init=uninstrumented +fun:rte_dev_iterator_next=uninstrumented +fun:rte_dev_probe=uninstrumented +fun:rte_dev_remove=uninstrumented +fun:rte_eal_hotplug_add=uninstrumented +fun:rte_eal_hotplug_remove=uninstrumented +fun:rte_devargs_add=uninstrumented +fun:rte_devargs_dump=uninstrumented +fun:rte_devargs_insert=uninstrumented +fun:rte_devargs_layers_parse=uninstrumented +fun:rte_devargs_next=uninstrumented +fun:rte_devargs_parse=uninstrumented +fun:rte_devargs_parsef=uninstrumented +fun:rte_devargs_remove=uninstrumented +fun:rte_devargs_type_count=uninstrumented +fun:rte_strerror=uninstrumented +fun:rte_fbarray_attach=uninstrumented +fun:rte_fbarray_destroy=uninstrumented +fun:rte_fbarray_detach=uninstrumented +fun:rte_fbarray_dump_metadata=uninstrumented +fun:rte_fbarray_find_contig_free=uninstrumented +fun:rte_fbarray_find_contig_used=uninstrumented +fun:rte_fbarray_find_idx=uninstrumented 
+fun:rte_fbarray_find_next_free=uninstrumented +fun:rte_fbarray_find_next_n_free=uninstrumented +fun:rte_fbarray_find_next_n_used=uninstrumented +fun:rte_fbarray_find_next_used=uninstrumented +fun:rte_fbarray_find_prev_free=uninstrumented +fun:rte_fbarray_find_prev_n_free=uninstrumented +fun:rte_fbarray_find_prev_n_used=uninstrumented +fun:rte_fbarray_find_prev_used=uninstrumented +fun:rte_fbarray_find_rev_contig_free=uninstrumented +fun:rte_fbarray_find_rev_contig_used=uninstrumented +fun:rte_fbarray_get=uninstrumented +fun:rte_fbarray_init=uninstrumented +fun:rte_fbarray_is_used=uninstrumented +fun:rte_fbarray_set_free=uninstrumented +fun:rte_fbarray_set_used=uninstrumented +fun:rte_hexdump=uninstrumented +fun:rte_memdump=uninstrumented +fun:rte_hypervisor_get_name=uninstrumented +fun:rte_eal_get_lcore_state=uninstrumented +fun:rte_eal_mp_remote_launch=uninstrumented +fun:rte_eal_mp_wait_lcore=uninstrumented +fun:rte_eal_wait_lcore=uninstrumented +fun:rte_eal_cpu_init=uninstrumented +fun:rte_socket_count=uninstrumented +fun:rte_socket_id_by_idx=uninstrumented +fun:eal_log_set_default=uninstrumented +fun:rte_log=uninstrumented +fun:rte_log_cur_msg_loglevel=uninstrumented +fun:rte_log_cur_msg_logtype=uninstrumented +fun:rte_log_dump=uninstrumented +fun:rte_log_get_global_level=uninstrumented +fun:rte_log_get_level=uninstrumented +fun:rte_log_register=uninstrumented +fun:rte_log_register_type_and_pick_level=uninstrumented +fun:rte_log_save_pattern=uninstrumented +fun:rte_log_save_regexp=uninstrumented +fun:rte_log_set_global_level=uninstrumented +fun:rte_log_set_level=uninstrumented +fun:rte_log_set_level_pattern=uninstrumented +fun:rte_log_set_level_regexp=uninstrumented +fun:rte_openlog_stream=uninstrumented +fun:rte_vlog=uninstrumented +fun:eal_memalloc_is_contig=uninstrumented +fun:eal_memalloc_mem_alloc_validate=uninstrumented +fun:eal_memalloc_mem_alloc_validator_register=uninstrumented +fun:eal_memalloc_mem_alloc_validator_unregister=uninstrumented 
+fun:eal_memalloc_mem_event_callback_register=uninstrumented +fun:eal_memalloc_mem_event_callback_unregister=uninstrumented +fun:eal_memalloc_mem_event_notify=uninstrumented +fun:eal_get_virtual_area=uninstrumented +fun:rte_dump_physmem_layout=uninstrumented +fun:rte_eal_get_physmem_size=uninstrumented +fun:rte_eal_memory_init=uninstrumented +fun:rte_mem_alloc_validator_register=uninstrumented +fun:rte_mem_alloc_validator_unregister=uninstrumented +fun:rte_mem_check_dma_mask=uninstrumented +fun:rte_mem_check_dma_mask_thread_unsafe=uninstrumented +fun:rte_mem_event_callback_register=uninstrumented +fun:rte_mem_event_callback_unregister=uninstrumented +fun:rte_mem_iova2virt=uninstrumented +fun:rte_mem_lock_page=uninstrumented +fun:rte_mem_set_dma_mask=uninstrumented +fun:rte_mem_virt2memseg=uninstrumented +fun:rte_mem_virt2memseg_list=uninstrumented +fun:rte_memory_get_nchannel=uninstrumented +fun:rte_memory_get_nrank=uninstrumented +fun:rte_memseg_contig_walk=uninstrumented +fun:rte_memseg_contig_walk_thread_unsafe=uninstrumented +fun:rte_memseg_get_fd=uninstrumented +fun:rte_memseg_get_fd_offset=uninstrumented +fun:rte_memseg_get_fd_offset_thread_unsafe=uninstrumented +fun:rte_memseg_get_fd_thread_unsafe=uninstrumented +fun:rte_memseg_list_walk=uninstrumented +fun:rte_memseg_list_walk_thread_unsafe=uninstrumented +fun:rte_memseg_walk=uninstrumented +fun:rte_memseg_walk_thread_unsafe=uninstrumented +fun:rte_eal_memzone_init=uninstrumented +fun:rte_memzone_dump=uninstrumented +fun:rte_memzone_free=uninstrumented +fun:rte_memzone_lookup=uninstrumented +fun:rte_memzone_reserve=uninstrumented +fun:rte_memzone_reserve_aligned=uninstrumented +fun:rte_memzone_reserve_bounded=uninstrumented +fun:rte_memzone_walk=uninstrumented +fun:eal_adjust_config=uninstrumented +fun:eal_check_common_options=uninstrumented +fun:eal_common_usage=uninstrumented +fun:eal_option_device_parse=uninstrumented +fun:eal_parse_common_option=uninstrumented +fun:eal_plugins_init=uninstrumented 
+fun:eal_reset_internal_config=uninstrumented +fun:rte_eal_primary_proc_alive=uninstrumented +fun:rte_mp_action_register=uninstrumented +fun:rte_mp_action_unregister=uninstrumented +fun:rte_mp_channel_init=uninstrumented +fun:rte_mp_reply=uninstrumented +fun:rte_mp_request_async=uninstrumented +fun:rte_mp_request_sync=uninstrumented +fun:rte_mp_sendmsg=uninstrumented +fun:rte_strscpy=uninstrumented +fun:rte_strsplit=uninstrumented +fun:rte_dump_tailq=uninstrumented +fun:rte_eal_tailq_lookup=uninstrumented +fun:rte_eal_tailq_register=uninstrumented +fun:rte_eal_tailqs_init=uninstrumented +fun:eal_cpuset_socket_id=uninstrumented +fun:eal_thread_dump_affinity=uninstrumented +fun:rte_ctrl_thread_create=uninstrumented +fun:rte_lcore_has_role=uninstrumented +fun:rte_socket_id=uninstrumented +fun:rte_thread_get_affinity=uninstrumented +fun:rte_thread_set_affinity=uninstrumented +fun:rte_delay_us_block=uninstrumented +fun:rte_delay_us_callback_register=uninstrumented +fun:rte_delay_us_sleep=uninstrumented +fun:rte_get_tsc_hz=uninstrumented +fun:set_tsc_freq=uninstrumented +fun:rte_uuid_compare=uninstrumented +fun:rte_uuid_is_null=uninstrumented +fun:rte_uuid_parse=uninstrumented +fun:rte_uuid_unparse=uninstrumented +fun:rte_cpu_getauxval=uninstrumented +fun:rte_cpu_strcmp_auxval=uninstrumented +fun:__rte_panic=uninstrumented +fun:rte_dump_registers=uninstrumented +fun:rte_dump_stack=uninstrumented +fun:rte_exit=uninstrumented +fun:dev_sigbus_handler_register=uninstrumented +fun:dev_sigbus_handler_unregister=uninstrumented +fun:rte_dev_event_monitor_start=uninstrumented +fun:rte_dev_event_monitor_stop=uninstrumented +fun:rte_dev_hotplug_handle_disable=uninstrumented +fun:rte_dev_hotplug_handle_enable=uninstrumented +fun:eal_hugepage_info_init=uninstrumented +fun:eal_hugepage_info_read=uninstrumented +fun:rte_eal_intr_init=uninstrumented +fun:rte_epoll_ctl=uninstrumented +fun:rte_epoll_wait=uninstrumented +fun:rte_intr_allow_others=uninstrumented 
+fun:rte_intr_callback_register=uninstrumented +fun:rte_intr_callback_unregister=uninstrumented +fun:rte_intr_cap_multiple=uninstrumented +fun:rte_intr_disable=uninstrumented +fun:rte_intr_dp_is_en=uninstrumented +fun:rte_intr_efd_disable=uninstrumented +fun:rte_intr_efd_enable=uninstrumented +fun:rte_intr_enable=uninstrumented +fun:rte_intr_free_epoll_fd=uninstrumented +fun:rte_intr_rx_ctl=uninstrumented +fun:rte_intr_tls_epfd=uninstrumented +fun:eal_cpu_core_id=uninstrumented +fun:eal_cpu_detected=uninstrumented +fun:eal_cpu_socket_id=uninstrumented +fun:rte_eal_log_init=uninstrumented +fun:eal_memalloc_alloc_seg=uninstrumented +fun:eal_memalloc_alloc_seg_bulk=uninstrumented +fun:eal_memalloc_free_seg=uninstrumented +fun:eal_memalloc_free_seg_bulk=uninstrumented +fun:eal_memalloc_get_seg_fd=uninstrumented +fun:eal_memalloc_get_seg_fd_offset=uninstrumented +fun:eal_memalloc_init=uninstrumented +fun:eal_memalloc_set_seg_fd=uninstrumented +fun:eal_memalloc_sync_with_primary=uninstrumented +fun:rte_eal_hugepage_attach=uninstrumented +fun:rte_eal_hugepage_init=uninstrumented +fun:rte_eal_memseg_init=uninstrumented +fun:rte_eal_using_phys_addrs=uninstrumented +fun:rte_mem_virt2iova=uninstrumented +fun:rte_mem_virt2phy=uninstrumented +fun:eal_thread_init_master=uninstrumented +fun:eal_thread_loop=uninstrumented +fun:rte_eal_remote_launch=uninstrumented +fun:rte_sys_gettid=uninstrumented +fun:rte_thread_setname=uninstrumented +fun:get_tsc_freq=uninstrumented +fun:rte_eal_hpet_init=uninstrumented +fun:rte_eal_timer_init=uninstrumented +fun:rte_get_hpet_cycles=uninstrumented +fun:rte_get_hpet_hz=uninstrumented +fun:rte_vfio_clear_group=uninstrumented +fun:rte_vfio_container_create=uninstrumented +fun:rte_vfio_container_destroy=uninstrumented +fun:rte_vfio_container_dma_map=uninstrumented +fun:rte_vfio_container_dma_unmap=uninstrumented +fun:rte_vfio_container_group_bind=uninstrumented +fun:rte_vfio_container_group_unbind=uninstrumented +fun:rte_vfio_dma_map=uninstrumented 
+fun:rte_vfio_dma_unmap=uninstrumented +fun:rte_vfio_enable=uninstrumented +fun:rte_vfio_get_container_fd=uninstrumented +fun:rte_vfio_get_group_fd=uninstrumented +fun:rte_vfio_get_group_num=uninstrumented +fun:rte_vfio_is_enabled=uninstrumented +fun:rte_vfio_noiommu_is_enabled=uninstrumented +fun:rte_vfio_release_device=uninstrumented +fun:rte_vfio_setup_device=uninstrumented +fun:vfio_get_default_container_fd=uninstrumented +fun:vfio_has_supported_extensions=uninstrumented +fun:vfio_set_iommu_type=uninstrumented +fun:vfio_mp_sync_setup=uninstrumented +fun:eal_dev_hotplug_request_to_primary=uninstrumented +fun:eal_dev_hotplug_request_to_secondary=uninstrumented +fun:rte_mp_dev_hotplug_init=uninstrumented +fun:malloc_elem_alloc=uninstrumented +fun:malloc_elem_can_hold=uninstrumented +fun:malloc_elem_dump=uninstrumented +fun:malloc_elem_find_max_iova_contig=uninstrumented +fun:malloc_elem_free=uninstrumented +fun:malloc_elem_free_list_index=uninstrumented +fun:malloc_elem_free_list_insert=uninstrumented +fun:malloc_elem_free_list_remove=uninstrumented +fun:malloc_elem_hide_region=uninstrumented +fun:malloc_elem_init=uninstrumented +fun:malloc_elem_insert=uninstrumented +fun:malloc_elem_join_adjacent_free=uninstrumented +fun:malloc_elem_resize=uninstrumented +fun:alloc_pages_on_heap=uninstrumented +fun:malloc_heap_add_external_memory=uninstrumented +fun:malloc_heap_alloc=uninstrumented +fun:malloc_heap_alloc_biggest=uninstrumented +fun:malloc_heap_create=uninstrumented +fun:malloc_heap_destroy=uninstrumented +fun:malloc_heap_dump=uninstrumented +fun:malloc_heap_free=uninstrumented +fun:malloc_heap_free_pages=uninstrumented +fun:malloc_heap_get_stats=uninstrumented +fun:malloc_heap_remove_external_memory=uninstrumented +fun:malloc_heap_resize=uninstrumented +fun:malloc_socket_to_heap_id=uninstrumented +fun:rollback_expand_heap=uninstrumented +fun:rte_eal_malloc_heap_init=uninstrumented +fun:register_mp_requests=uninstrumented +fun:request_sync=uninstrumented 
+fun:request_to_primary=uninstrumented +fun:rte_cpu_get_flag_enabled=uninstrumented +fun:rte_cpu_get_flag_name=uninstrumented +fun:get_tsc_freq_arch=uninstrumented +fun:rte_hypervisor_get=uninstrumented +fun:rte_keepalive_create=uninstrumented +fun:rte_keepalive_dispatch_pings=uninstrumented +fun:rte_keepalive_mark_alive=uninstrumented +fun:rte_keepalive_mark_sleep=uninstrumented +fun:rte_keepalive_register_core=uninstrumented +fun:rte_keepalive_register_relay_callback=uninstrumented +fun:rte_calloc=uninstrumented +fun:rte_calloc_socket=uninstrumented +fun:rte_free=uninstrumented +fun:rte_malloc=uninstrumented +fun:rte_malloc_dump_heaps=uninstrumented +fun:rte_malloc_dump_stats=uninstrumented +fun:rte_malloc_get_socket_stats=uninstrumented +fun:rte_malloc_heap_create=uninstrumented +fun:rte_malloc_heap_destroy=uninstrumented +fun:rte_malloc_heap_get_socket=uninstrumented +fun:rte_malloc_heap_memory_add=uninstrumented +fun:rte_malloc_heap_memory_attach=uninstrumented +fun:rte_malloc_heap_memory_detach=uninstrumented +fun:rte_malloc_heap_memory_remove=uninstrumented +fun:rte_malloc_heap_socket_is_external=uninstrumented +fun:rte_malloc_set_limit=uninstrumented +fun:rte_malloc_socket=uninstrumented +fun:rte_malloc_validate=uninstrumented +fun:rte_malloc_virt2iova=uninstrumented +fun:rte_realloc=uninstrumented +fun:rte_zmalloc=uninstrumented +fun:rte_zmalloc_socket=uninstrumented +fun:rte_option_init=uninstrumented +fun:rte_option_parse=uninstrumented +fun:rte_option_register=uninstrumented +fun:rte_reciprocal_value=uninstrumented +fun:rte_reciprocal_value_u64=uninstrumented +fun:rte_service_attr_get=uninstrumented +fun:rte_service_attr_reset_all=uninstrumented +fun:rte_service_component_register=uninstrumented +fun:rte_service_component_runstate_set=uninstrumented +fun:rte_service_component_unregister=uninstrumented +fun:rte_service_dump=uninstrumented +fun:rte_service_finalize=uninstrumented +fun:rte_service_get_by_name=uninstrumented 
+fun:rte_service_get_count=uninstrumented +fun:rte_service_get_name=uninstrumented +fun:rte_service_init=uninstrumented +fun:rte_service_lcore_add=uninstrumented +fun:rte_service_lcore_attr_get=uninstrumented +fun:rte_service_lcore_attr_reset_all=uninstrumented +fun:rte_service_lcore_count=uninstrumented +fun:rte_service_lcore_count_services=uninstrumented +fun:rte_service_lcore_del=uninstrumented +fun:rte_service_lcore_list=uninstrumented +fun:rte_service_lcore_reset_all=uninstrumented +fun:rte_service_lcore_start=uninstrumented +fun:rte_service_lcore_stop=uninstrumented +fun:rte_service_map_lcore_get=uninstrumented +fun:rte_service_map_lcore_set=uninstrumented +fun:rte_service_may_be_active=uninstrumented +fun:rte_service_probe_capability=uninstrumented +fun:rte_service_run_iter_on_app_lcore=uninstrumented +fun:rte_service_runstate_get=uninstrumented +fun:rte_service_runstate_set=uninstrumented +fun:rte_service_set_runstate_mapped_check=uninstrumented +fun:rte_service_set_stats_enable=uninstrumented +fun:rte_service_start_with_defaults=uninstrumented +fun:eth_dev_to_id=uninstrumented +fun:eth_find_device=uninstrumented +fun:rte_eth_devargs_parse_list=uninstrumented +fun:rte_eth_devargs_parse_representor_ports=uninstrumented +fun:__rte_eth_dev_profile_init=uninstrumented +fun:_rte_eth_dev_callback_process=uninstrumented +fun:_rte_eth_dev_reset=uninstrumented +fun:rte_eth_add_first_rx_callback=uninstrumented +fun:rte_eth_add_rx_callback=uninstrumented +fun:rte_eth_add_tx_callback=uninstrumented +fun:rte_eth_allmulticast_disable=uninstrumented +fun:rte_eth_allmulticast_enable=uninstrumented +fun:rte_eth_allmulticast_get=uninstrumented +fun:rte_eth_dev_adjust_nb_rx_tx_desc=uninstrumented +fun:rte_eth_dev_allocate=uninstrumented +fun:rte_eth_dev_allocated=uninstrumented +fun:rte_eth_dev_attach_secondary=uninstrumented +fun:rte_eth_dev_callback_register=uninstrumented +fun:rte_eth_dev_callback_unregister=uninstrumented +fun:rte_eth_dev_close=uninstrumented 
+fun:rte_eth_dev_configure=uninstrumented +fun:rte_eth_dev_count=uninstrumented +fun:rte_eth_dev_count_avail=uninstrumented +fun:rte_eth_dev_count_total=uninstrumented +fun:rte_eth_dev_create=uninstrumented +fun:rte_eth_dev_default_mac_addr_set=uninstrumented +fun:rte_eth_dev_destroy=uninstrumented +fun:rte_eth_dev_filter_ctrl=uninstrumented +fun:rte_eth_dev_filter_supported=uninstrumented +fun:rte_eth_dev_flow_ctrl_get=uninstrumented +fun:rte_eth_dev_flow_ctrl_set=uninstrumented +fun:rte_eth_dev_fw_version_get=uninstrumented +fun:rte_eth_dev_get_dcb_info=uninstrumented +fun:rte_eth_dev_get_eeprom=uninstrumented +fun:rte_eth_dev_get_eeprom_length=uninstrumented +fun:rte_eth_dev_get_module_eeprom=uninstrumented +fun:rte_eth_dev_get_module_info=uninstrumented +fun:rte_eth_dev_get_mtu=uninstrumented +fun:rte_eth_dev_get_name_by_port=uninstrumented +fun:rte_eth_dev_get_port_by_name=uninstrumented +fun:rte_eth_dev_get_reg_info=uninstrumented +fun:rte_eth_dev_get_sec_ctx=uninstrumented +fun:rte_eth_dev_get_supported_ptypes=uninstrumented +fun:rte_eth_dev_get_vlan_offload=uninstrumented +fun:rte_eth_dev_info_get=uninstrumented +fun:rte_eth_dev_is_removed=uninstrumented +fun:rte_eth_dev_is_valid_port=uninstrumented +fun:rte_eth_dev_l2_tunnel_eth_type_conf=uninstrumented +fun:rte_eth_dev_l2_tunnel_offload_set=uninstrumented +fun:rte_eth_dev_mac_addr_add=uninstrumented +fun:rte_eth_dev_mac_addr_remove=uninstrumented +fun:rte_eth_dev_owner_delete=uninstrumented +fun:rte_eth_dev_owner_get=uninstrumented +fun:rte_eth_dev_owner_new=uninstrumented +fun:rte_eth_dev_owner_set=uninstrumented +fun:rte_eth_dev_owner_unset=uninstrumented +fun:rte_eth_dev_pool_ops_supported=uninstrumented +fun:rte_eth_dev_priority_flow_ctrl_set=uninstrumented +fun:rte_eth_dev_probing_finish=uninstrumented +fun:rte_eth_dev_release_port=uninstrumented +fun:rte_eth_dev_reset=uninstrumented +fun:rte_eth_dev_rss_hash_conf_get=uninstrumented +fun:rte_eth_dev_rss_hash_update=uninstrumented 
+fun:rte_eth_dev_rss_reta_query=uninstrumented +fun:rte_eth_dev_rss_reta_update=uninstrumented +fun:rte_eth_dev_rx_intr_ctl=uninstrumented +fun:rte_eth_dev_rx_intr_ctl_q=uninstrumented +fun:rte_eth_dev_rx_intr_ctl_q_get_fd=uninstrumented +fun:rte_eth_dev_rx_intr_disable=uninstrumented +fun:rte_eth_dev_rx_intr_enable=uninstrumented +fun:rte_eth_dev_rx_offload_name=uninstrumented +fun:rte_eth_dev_rx_queue_start=uninstrumented +fun:rte_eth_dev_rx_queue_stop=uninstrumented +fun:rte_eth_dev_set_eeprom=uninstrumented +fun:rte_eth_dev_set_link_down=uninstrumented +fun:rte_eth_dev_set_link_up=uninstrumented +fun:rte_eth_dev_set_mc_addr_list=uninstrumented +fun:rte_eth_dev_set_mtu=uninstrumented +fun:rte_eth_dev_set_rx_queue_stats_mapping=uninstrumented +fun:rte_eth_dev_set_tx_queue_stats_mapping=uninstrumented +fun:rte_eth_dev_set_vlan_ether_type=uninstrumented +fun:rte_eth_dev_set_vlan_offload=uninstrumented +fun:rte_eth_dev_set_vlan_pvid=uninstrumented +fun:rte_eth_dev_set_vlan_strip_on_queue=uninstrumented +fun:rte_eth_dev_socket_id=uninstrumented +fun:rte_eth_dev_start=uninstrumented +fun:rte_eth_dev_stop=uninstrumented +fun:rte_eth_dev_tx_offload_name=uninstrumented +fun:rte_eth_dev_tx_queue_start=uninstrumented +fun:rte_eth_dev_tx_queue_stop=uninstrumented +fun:rte_eth_dev_uc_all_hash_table_set=uninstrumented +fun:rte_eth_dev_uc_hash_table_set=uninstrumented +fun:rte_eth_dev_udp_tunnel_port_add=uninstrumented +fun:rte_eth_dev_udp_tunnel_port_delete=uninstrumented +fun:rte_eth_dev_vlan_filter=uninstrumented +fun:rte_eth_devargs_parse=uninstrumented +fun:rte_eth_dma_zone_reserve=uninstrumented +fun:rte_eth_find_next=uninstrumented +fun:rte_eth_find_next_owned_by=uninstrumented +fun:rte_eth_iterator_cleanup=uninstrumented +fun:rte_eth_iterator_init=uninstrumented +fun:rte_eth_iterator_next=uninstrumented +fun:rte_eth_led_off=uninstrumented +fun:rte_eth_led_on=uninstrumented +fun:rte_eth_link_get=uninstrumented +fun:rte_eth_link_get_nowait=uninstrumented 
+fun:rte_eth_macaddr_get=uninstrumented +fun:rte_eth_mirror_rule_reset=uninstrumented +fun:rte_eth_mirror_rule_set=uninstrumented +fun:rte_eth_promiscuous_disable=uninstrumented +fun:rte_eth_promiscuous_enable=uninstrumented +fun:rte_eth_promiscuous_get=uninstrumented +fun:rte_eth_remove_rx_callback=uninstrumented +fun:rte_eth_remove_tx_callback=uninstrumented +fun:rte_eth_rx_queue_info_get=uninstrumented +fun:rte_eth_rx_queue_setup=uninstrumented +fun:rte_eth_set_queue_rate_limit=uninstrumented +fun:rte_eth_speed_bitflag=uninstrumented +fun:rte_eth_stats_get=uninstrumented +fun:rte_eth_stats_reset=uninstrumented +fun:rte_eth_switch_domain_alloc=uninstrumented +fun:rte_eth_switch_domain_free=uninstrumented +fun:rte_eth_timesync_adjust_time=uninstrumented +fun:rte_eth_timesync_disable=uninstrumented +fun:rte_eth_timesync_enable=uninstrumented +fun:rte_eth_timesync_read_rx_timestamp=uninstrumented +fun:rte_eth_timesync_read_time=uninstrumented +fun:rte_eth_timesync_read_tx_timestamp=uninstrumented +fun:rte_eth_timesync_write_time=uninstrumented +fun:rte_eth_tx_buffer_count_callback=uninstrumented +fun:rte_eth_tx_buffer_drop_callback=uninstrumented +fun:rte_eth_tx_buffer_init=uninstrumented +fun:rte_eth_tx_buffer_set_err_callback=uninstrumented +fun:rte_eth_tx_done_cleanup=uninstrumented +fun:rte_eth_tx_queue_info_get=uninstrumented +fun:rte_eth_tx_queue_setup=uninstrumented +fun:rte_eth_xstats_get=uninstrumented +fun:rte_eth_xstats_get_by_id=uninstrumented +fun:rte_eth_xstats_get_id_by_name=uninstrumented +fun:rte_eth_xstats_get_names=uninstrumented +fun:rte_eth_xstats_get_names_by_id=uninstrumented +fun:rte_eth_xstats_reset=uninstrumented +fun:rte_flow_conv=uninstrumented +fun:rte_flow_copy=uninstrumented +fun:rte_flow_create=uninstrumented +fun:rte_flow_destroy=uninstrumented +fun:rte_flow_error_set=uninstrumented +fun:rte_flow_expand_rss=uninstrumented +fun:rte_flow_flush=uninstrumented +fun:rte_flow_isolate=uninstrumented +fun:rte_flow_ops_get=uninstrumented 
+fun:rte_flow_query=uninstrumented +fun:rte_flow_validate=uninstrumented +fun:rte_mtr_capabilities_get=uninstrumented +fun:rte_mtr_create=uninstrumented +fun:rte_mtr_destroy=uninstrumented +fun:rte_mtr_meter_disable=uninstrumented +fun:rte_mtr_meter_dscp_table_update=uninstrumented +fun:rte_mtr_meter_enable=uninstrumented +fun:rte_mtr_meter_profile_add=uninstrumented +fun:rte_mtr_meter_profile_delete=uninstrumented +fun:rte_mtr_meter_profile_update=uninstrumented +fun:rte_mtr_ops_get=uninstrumented +fun:rte_mtr_policer_actions_update=uninstrumented +fun:rte_mtr_stats_read=uninstrumented +fun:rte_mtr_stats_update=uninstrumented +fun:rte_tm_capabilities_get=uninstrumented +fun:rte_tm_get_number_of_leaf_nodes=uninstrumented +fun:rte_tm_hierarchy_commit=uninstrumented +fun:rte_tm_level_capabilities_get=uninstrumented +fun:rte_tm_mark_ip_dscp=uninstrumented +fun:rte_tm_mark_ip_ecn=uninstrumented +fun:rte_tm_mark_vlan_dei=uninstrumented +fun:rte_tm_node_add=uninstrumented +fun:rte_tm_node_capabilities_get=uninstrumented +fun:rte_tm_node_cman_update=uninstrumented +fun:rte_tm_node_delete=uninstrumented +fun:rte_tm_node_parent_update=uninstrumented +fun:rte_tm_node_resume=uninstrumented +fun:rte_tm_node_shaper_update=uninstrumented +fun:rte_tm_node_shared_shaper_update=uninstrumented +fun:rte_tm_node_shared_wred_context_update=uninstrumented +fun:rte_tm_node_stats_read=uninstrumented +fun:rte_tm_node_stats_update=uninstrumented +fun:rte_tm_node_suspend=uninstrumented +fun:rte_tm_node_type_get=uninstrumented +fun:rte_tm_node_wfq_weight_mode_update=uninstrumented +fun:rte_tm_node_wred_context_update=uninstrumented +fun:rte_tm_ops_get=uninstrumented +fun:rte_tm_shaper_profile_add=uninstrumented +fun:rte_tm_shaper_profile_delete=uninstrumented +fun:rte_tm_shared_shaper_add_update=uninstrumented +fun:rte_tm_shared_shaper_delete=uninstrumented +fun:rte_tm_shared_wred_context_add_update=uninstrumented +fun:rte_tm_shared_wred_context_delete=uninstrumented 
+fun:rte_tm_wred_profile_add=uninstrumented +fun:rte_tm_wred_profile_delete=uninstrumented +fun:gro_tcp4_reassemble=uninstrumented +fun:gro_tcp4_tbl_create=uninstrumented +fun:gro_tcp4_tbl_destroy=uninstrumented +fun:gro_tcp4_tbl_pkt_count=uninstrumented +fun:gro_tcp4_tbl_timeout_flush=uninstrumented +fun:gro_vxlan_tcp4_reassemble=uninstrumented +fun:gro_vxlan_tcp4_tbl_create=uninstrumented +fun:gro_vxlan_tcp4_tbl_destroy=uninstrumented +fun:gro_vxlan_tcp4_tbl_pkt_count=uninstrumented +fun:gro_vxlan_tcp4_tbl_timeout_flush=uninstrumented +fun:rte_gro_ctx_create=uninstrumented +fun:rte_gro_ctx_destroy=uninstrumented +fun:rte_gro_get_pkt_count=uninstrumented +fun:rte_gro_reassemble=uninstrumented +fun:rte_gro_reassemble_burst=uninstrumented +fun:rte_gro_timeout_flush=uninstrumented +fun:rte_hash_add_key=uninstrumented +fun:rte_hash_add_key_data=uninstrumented +fun:rte_hash_add_key_with_hash=uninstrumented +fun:rte_hash_add_key_with_hash_data=uninstrumented +fun:rte_hash_count=uninstrumented +fun:rte_hash_create=uninstrumented +fun:rte_hash_del_key=uninstrumented +fun:rte_hash_del_key_with_hash=uninstrumented +fun:rte_hash_find_existing=uninstrumented +fun:rte_hash_free=uninstrumented +fun:rte_hash_free_key_with_position=uninstrumented +fun:rte_hash_get_key_with_position=uninstrumented +fun:rte_hash_hash=uninstrumented +fun:rte_hash_iterate=uninstrumented +fun:rte_hash_lookup=uninstrumented +fun:rte_hash_lookup_bulk=uninstrumented +fun:rte_hash_lookup_bulk_data=uninstrumented +fun:rte_hash_lookup_data=uninstrumented +fun:rte_hash_lookup_with_hash=uninstrumented +fun:rte_hash_lookup_with_hash_data=uninstrumented +fun:rte_hash_reset=uninstrumented +fun:rte_hash_set_cmp_func=uninstrumented +fun:rte_fbk_hash_create=uninstrumented +fun:rte_fbk_hash_find_existing=uninstrumented +fun:rte_fbk_hash_free=uninstrumented +fun:ip_frag_find=uninstrumented +fun:ip_frag_lookup=uninstrumented +fun:ip_frag_process=uninstrumented +fun:rte_frag_table_del_expired_entries=uninstrumented 
+fun:rte_ip_frag_free_death_row=uninstrumented +fun:rte_ip_frag_table_create=uninstrumented +fun:rte_ip_frag_table_destroy=uninstrumented +fun:rte_ip_frag_table_statistics_dump=uninstrumented +fun:rte_ipv4_fragment_packet=uninstrumented +fun:ipv4_frag_reassemble=uninstrumented +fun:rte_ipv4_frag_reassemble_packet=uninstrumented +fun:rte_ipv6_fragment_packet=uninstrumented +fun:ipv6_frag_reassemble=uninstrumented +fun:rte_ipv6_frag_reassemble_packet=uninstrumented +fun:rte_kvargs_count=uninstrumented +fun:rte_kvargs_free=uninstrumented +fun:rte_kvargs_parse=uninstrumented +fun:rte_kvargs_parse_delim=uninstrumented +fun:rte_kvargs_process=uninstrumented +fun:rte_kvargs_strcmp=uninstrumented +fun:__rte_pktmbuf_read=uninstrumented +fun:rte_get_rx_ol_flag_list=uninstrumented +fun:rte_get_rx_ol_flag_name=uninstrumented +fun:rte_get_tx_ol_flag_list=uninstrumented +fun:rte_get_tx_ol_flag_name=uninstrumented +fun:rte_mbuf_sanity_check=uninstrumented +fun:rte_pktmbuf_dump=uninstrumented +fun:rte_pktmbuf_dynamic_pool_create=uninstrumented +fun:rte_pktmbuf_init=uninstrumented +fun:rte_pktmbuf_pool_create=uninstrumented +fun:rte_pktmbuf_pool_create_by_ops=uninstrumented +fun:rte_pktmbuf_pool_init=uninstrumented +fun:rte_mbuf_best_mempool_ops=uninstrumented +fun:rte_mbuf_platform_mempool_ops=uninstrumented +fun:rte_mbuf_set_platform_mempool_ops=uninstrumented +fun:rte_mbuf_set_user_mempool_ops=uninstrumented +fun:rte_mbuf_user_mempool_ops=uninstrumented +fun:rte_get_ptype_inner_l2_name=uninstrumented +fun:rte_get_ptype_inner_l3_name=uninstrumented +fun:rte_get_ptype_inner_l4_name=uninstrumented +fun:rte_get_ptype_l2_name=uninstrumented +fun:rte_get_ptype_l3_name=uninstrumented +fun:rte_get_ptype_l4_name=uninstrumented +fun:rte_get_ptype_name=uninstrumented +fun:rte_get_ptype_tunnel_name=uninstrumented +fun:rte_mempool_audit=uninstrumented +fun:rte_mempool_avail_count=uninstrumented +fun:rte_mempool_cache_create=uninstrumented +fun:rte_mempool_cache_free=uninstrumented 
+fun:rte_mempool_calc_obj_size=uninstrumented +fun:rte_mempool_check_cookies=uninstrumented +fun:rte_mempool_contig_blocks_check_cookies=uninstrumented +fun:rte_mempool_create=uninstrumented +fun:rte_mempool_create_empty=uninstrumented +fun:rte_mempool_dump=uninstrumented +fun:rte_mempool_free=uninstrumented +fun:rte_mempool_in_use_count=uninstrumented +fun:rte_mempool_list_dump=uninstrumented +fun:rte_mempool_lookup=uninstrumented +fun:rte_mempool_mem_iter=uninstrumented +fun:rte_mempool_obj_iter=uninstrumented +fun:rte_mempool_populate_anon=uninstrumented +fun:rte_mempool_populate_default=uninstrumented +fun:rte_mempool_populate_iova=uninstrumented +fun:rte_mempool_populate_virt=uninstrumented +fun:rte_mempool_walk=uninstrumented +fun:rte_mempool_ops_alloc=uninstrumented +fun:rte_mempool_ops_calc_mem_size=uninstrumented +fun:rte_mempool_ops_free=uninstrumented +fun:rte_mempool_ops_get_count=uninstrumented +fun:rte_mempool_ops_get_info=uninstrumented +fun:rte_mempool_ops_populate=uninstrumented +fun:rte_mempool_register_ops=uninstrumented +fun:rte_mempool_set_ops_byname=uninstrumented +fun:rte_mempool_op_calc_mem_size_default=uninstrumented +fun:rte_mempool_op_populate_default=uninstrumented +fun:rte_metrics_get_names=uninstrumented +fun:rte_metrics_get_values=uninstrumented +fun:rte_metrics_init=uninstrumented +fun:rte_metrics_reg_name=uninstrumented +fun:rte_metrics_reg_names=uninstrumented +fun:rte_metrics_update_value=uninstrumented +fun:rte_metrics_update_values=uninstrumented +fun:rte_net_make_rarp_packet=uninstrumented +fun:rte_net_get_ptype=uninstrumented +fun:rte_net_skip_ip6_ext=uninstrumented +fun:rte_net_crc_calc=uninstrumented +fun:rte_net_crc_set_alg=uninstrumented +fun:eal_parse_pci_BDF=uninstrumented +fun:eal_parse_pci_DomBDF=uninstrumented +fun:pci_map_resource=uninstrumented +fun:pci_unmap_resource=uninstrumented +fun:rte_eal_compare_pci_addr=uninstrumented +fun:rte_pci_addr_cmp=uninstrumented +fun:rte_pci_addr_parse=uninstrumented 
+fun:rte_pci_device_name=uninstrumented +fun:rte_eth_from_ring=uninstrumented +fun:rte_eth_from_rings=uninstrumented +fun:sock_support_features=uninstrumented +fun:vhost_kernel_open_sock=uninstrumented +fun:vhost_kernel_set_sock=uninstrumented +fun:tap_support_features=uninstrumented +fun:vhost_kernel_open_tap=uninstrumented +fun:eth_virtio_dev_init=uninstrumented +fun:virtio_dev_pause=uninstrumented +fun:virtio_dev_resume=uninstrumented +fun:virtio_inject_pkts=uninstrumented +fun:virtio_interrupt_handler=uninstrumented +fun:vtpci_get_status=uninstrumented +fun:vtpci_init=uninstrumented +fun:vtpci_isr=uninstrumented +fun:vtpci_msix_detect=uninstrumented +fun:vtpci_negotiate_features=uninstrumented +fun:vtpci_read_dev_config=uninstrumented +fun:vtpci_reinit_complete=uninstrumented +fun:vtpci_reset=uninstrumented +fun:vtpci_set_status=uninstrumented +fun:vtpci_write_dev_config=uninstrumented +fun:virtio_dev_cq_start=uninstrumented +fun:virtio_dev_rx_queue_done=uninstrumented +fun:virtio_dev_rx_queue_setup=uninstrumented +fun:virtio_dev_rx_queue_setup_finish=uninstrumented +fun:virtio_dev_tx_queue_setup=uninstrumented +fun:virtio_dev_tx_queue_setup_finish=uninstrumented +fun:virtio_recv_mergeable_pkts=uninstrumented +fun:virtio_recv_mergeable_pkts_inorder=uninstrumented +fun:virtio_recv_pkts=uninstrumented +fun:virtio_xmit_pkts=uninstrumented +fun:virtio_xmit_pkts_inorder=uninstrumented +fun:vq_ring_free_chain=uninstrumented +fun:vq_ring_free_inorder=uninstrumented +fun:virtio_rxq_vec_setup=uninstrumented +fun:virtio_recv_pkts_vec=uninstrumented +fun:is_vhost_user_by_type=uninstrumented +fun:virtio_user_dev_init=uninstrumented +fun:virtio_user_dev_uninit=uninstrumented +fun:virtio_user_handle_cq=uninstrumented +fun:virtio_user_handle_mq=uninstrumented +fun:virtio_user_start_device=uninstrumented +fun:virtio_user_stop_device=uninstrumented +fun:virtqueue_detach_unused=uninstrumented +fun:virtqueue_rxvq_flush=uninstrumented +fun:rte_ring_create=uninstrumented 
+fun:rte_ring_dump=uninstrumented +fun:rte_ring_free=uninstrumented +fun:rte_ring_get_memsize=uninstrumented +fun:rte_ring_init=uninstrumented +fun:rte_ring_list_dump=uninstrumented +fun:rte_ring_lookup=uninstrumented +fun:rte_timer_dump_stats=uninstrumented +fun:rte_timer_init=uninstrumented +fun:rte_timer_manage=uninstrumented +fun:rte_timer_pending=uninstrumented +fun:rte_timer_reset=uninstrumented +fun:rte_timer_reset_sync=uninstrumented +fun:rte_timer_stop=uninstrumented +fun:rte_timer_stop_sync=uninstrumented +fun:rte_timer_subsystem_init=uninstrumented +fun:pci_find_max_end_va=discard +fun:pci_parse_one_sysfs_resource=discard +fun:pci_update_device=discard +fun:rte_pci_get_iommu_class=discard +fun:rte_pci_ioport_map=discard +fun:rte_pci_ioport_read=discard +fun:rte_pci_ioport_unmap=discard +fun:rte_pci_ioport_write=discard +fun:rte_pci_map_device=discard +fun:rte_pci_read_config=discard +fun:rte_pci_scan=discard +fun:rte_pci_unmap_device=discard +fun:rte_pci_write_config=discard +fun:pci_name_set=discard +fun:rte_pci_add_device=discard +fun:rte_pci_dump=discard +fun:rte_pci_get_sysfs_path=discard +fun:rte_pci_insert_device=discard +fun:rte_pci_match=discard +fun:rte_pci_probe=discard +fun:rte_pci_register=discard +fun:rte_pci_unregister=discard +fun:pci_uio_map_resource=discard +fun:pci_uio_remap_resource=discard +fun:pci_uio_unmap_resource=discard +fun:rte_pci_dev_iterate=discard +fun:pci_uio_alloc_resource=discard +fun:pci_uio_free_resource=discard +fun:pci_uio_ioport_map=discard +fun:pci_uio_ioport_read=discard +fun:pci_uio_ioport_unmap=discard +fun:pci_uio_ioport_write=discard +fun:pci_uio_map_resource_by_index=discard +fun:pci_uio_read_config=discard +fun:pci_uio_write_config=discard +fun:pci_vfio_ioport_map=discard +fun:pci_vfio_ioport_read=discard +fun:pci_vfio_ioport_unmap=discard +fun:pci_vfio_ioport_write=discard +fun:pci_vfio_is_enabled=discard +fun:pci_vfio_map_resource=discard +fun:pci_vfio_read_config=discard 
+fun:pci_vfio_unmap_resource=discard +fun:pci_vfio_write_config=discard +fun:rte_vdev_add_custom_scan=discard +fun:rte_vdev_find_device=discard +fun:rte_vdev_init=discard +fun:rte_vdev_register=discard +fun:rte_vdev_remove_custom_scan=discard +fun:rte_vdev_uninit=discard +fun:rte_vdev_unregister=discard +fun:rte_vdev_dev_iterate=discard +fun:cmdline_free=discard +fun:cmdline_in=discard +fun:cmdline_interact=discard +fun:cmdline_new=discard +fun:cmdline_poll=discard +fun:cmdline_printf=discard +fun:cmdline_quit=discard +fun:cmdline_set_prompt=discard +fun:cmdline_write_char=discard +fun:cirbuf_add_buf_head=discard +fun:cirbuf_add_buf_tail=discard +fun:cirbuf_add_head=discard +fun:cirbuf_add_head_safe=discard +fun:cirbuf_add_tail=discard +fun:cirbuf_add_tail_safe=discard +fun:cirbuf_align_left=discard +fun:cirbuf_align_right=discard +fun:cirbuf_del_buf_head=discard +fun:cirbuf_del_buf_tail=discard +fun:cirbuf_del_head=discard +fun:cirbuf_del_head_safe=discard +fun:cirbuf_del_tail=discard +fun:cirbuf_del_tail_safe=discard +fun:cirbuf_get_buf_head=discard +fun:cirbuf_get_buf_tail=discard +fun:cirbuf_get_head=discard +fun:cirbuf_get_tail=discard +fun:cirbuf_init=discard +fun:cmdline_complete=discard +fun:cmdline_isendofcommand=discard +fun:cmdline_isendoftoken=discard +fun:cmdline_parse=discard +fun:cmdline_get_help_etheraddr=discard +fun:cmdline_parse_etheraddr=discard +fun:cmdline_get_help_ipaddr=discard +fun:cmdline_parse_ipaddr=discard +fun:cmdline_get_help_num=discard +fun:cmdline_parse_num=discard +fun:cmdline_get_help_portlist=discard +fun:cmdline_parse_portlist=discard +fun:cmdline_complete_get_elt_string=discard +fun:cmdline_complete_get_nb_string=discard +fun:cmdline_get_help_string=discard +fun:cmdline_parse_string=discard +fun:rdline_add_history=discard +fun:rdline_char_in=discard +fun:rdline_clear_history=discard +fun:rdline_get_buffer=discard +fun:rdline_get_history_item=discard +fun:rdline_init=discard +fun:rdline_newline=discard +fun:rdline_quit=discard 
+fun:rdline_redisplay=discard +fun:rdline_reset=discard +fun:rdline_restart=discard +fun:rdline_stop=discard +fun:cmdline_file_new=discard +fun:cmdline_stdin_exit=discard +fun:cmdline_stdin_new=discard +fun:vt100_init=discard +fun:vt100_parser=discard +fun:eal_create_runtime_dir=discard +fun:eal_parse_sysfs_value=discard +fun:eal_proc_type_detect=discard +fun:rte_eal_check_module=discard +fun:rte_eal_cleanup=discard +fun:rte_eal_create_uio_dev=discard +fun:rte_eal_get_configuration=discard +fun:rte_eal_get_runtime_dir=discard +fun:rte_eal_has_hugepages=discard +fun:rte_eal_has_pci=discard +fun:rte_eal_init=discard +fun:rte_eal_iopl_init=discard +fun:rte_eal_iova_mode=discard +fun:rte_eal_lcore_role=discard +fun:rte_eal_mbuf_user_pool_ops=discard +fun:rte_eal_process_type=discard +fun:rte_eal_vfio_intr_mode=discard +fun:rte_set_application_usage_hook=discard +fun:rte_eal_alarm_cancel=discard +fun:rte_eal_alarm_init=discard +fun:rte_eal_alarm_set=discard +fun:rte_bus_dump=discard +fun:rte_bus_find=discard +fun:rte_bus_find_by_device=discard +fun:rte_bus_find_by_device_name=discard +fun:rte_bus_find_by_name=discard +fun:rte_bus_get_iommu_class=discard +fun:rte_bus_probe=discard +fun:rte_bus_register=discard +fun:rte_bus_scan=discard +fun:rte_bus_sigbus_handler=discard +fun:rte_bus_unregister=discard +fun:rte_class_find=discard +fun:rte_class_find_by_name=discard +fun:rte_class_register=discard +fun:rte_class_unregister=discard +fun:rte_cpu_check_supported=discard +fun:rte_cpu_is_supported=discard +fun:local_dev_probe=discard +fun:local_dev_remove=discard +fun:rte_dev_event_callback_process=discard +fun:rte_dev_event_callback_register=discard +fun:rte_dev_event_callback_unregister=discard +fun:rte_dev_is_probed=discard +fun:rte_dev_iterator_init=discard +fun:rte_dev_iterator_next=discard +fun:rte_dev_probe=discard +fun:rte_dev_remove=discard +fun:rte_eal_hotplug_add=discard +fun:rte_eal_hotplug_remove=discard +fun:rte_devargs_add=discard +fun:rte_devargs_dump=discard 
+fun:rte_devargs_insert=discard +fun:rte_devargs_layers_parse=discard +fun:rte_devargs_next=discard +fun:rte_devargs_parse=discard +fun:rte_devargs_parsef=discard +fun:rte_devargs_remove=discard +fun:rte_devargs_type_count=discard +fun:rte_strerror=discard +fun:rte_fbarray_attach=discard +fun:rte_fbarray_destroy=discard +fun:rte_fbarray_detach=discard +fun:rte_fbarray_dump_metadata=discard +fun:rte_fbarray_find_contig_free=discard +fun:rte_fbarray_find_contig_used=discard +fun:rte_fbarray_find_idx=discard +fun:rte_fbarray_find_next_free=discard +fun:rte_fbarray_find_next_n_free=discard +fun:rte_fbarray_find_next_n_used=discard +fun:rte_fbarray_find_next_used=discard +fun:rte_fbarray_find_prev_free=discard +fun:rte_fbarray_find_prev_n_free=discard +fun:rte_fbarray_find_prev_n_used=discard +fun:rte_fbarray_find_prev_used=discard +fun:rte_fbarray_find_rev_contig_free=discard +fun:rte_fbarray_find_rev_contig_used=discard +fun:rte_fbarray_get=discard +fun:rte_fbarray_init=discard +fun:rte_fbarray_is_used=discard +fun:rte_fbarray_set_free=discard +fun:rte_fbarray_set_used=discard +fun:rte_hexdump=discard +fun:rte_memdump=discard +fun:rte_hypervisor_get_name=discard +fun:rte_eal_get_lcore_state=discard +fun:rte_eal_mp_remote_launch=discard +fun:rte_eal_mp_wait_lcore=discard +fun:rte_eal_wait_lcore=discard +fun:rte_eal_cpu_init=discard +fun:rte_socket_count=discard +fun:rte_socket_id_by_idx=discard +fun:eal_log_set_default=discard +fun:rte_log=discard +fun:rte_log_cur_msg_loglevel=discard +fun:rte_log_cur_msg_logtype=discard +fun:rte_log_dump=discard +fun:rte_log_get_global_level=discard +fun:rte_log_get_level=discard +fun:rte_log_register=discard +fun:rte_log_register_type_and_pick_level=discard +fun:rte_log_save_pattern=discard +fun:rte_log_save_regexp=discard +fun:rte_log_set_global_level=discard +fun:rte_log_set_level=discard +fun:rte_log_set_level_pattern=discard +fun:rte_log_set_level_regexp=discard +fun:rte_openlog_stream=discard +fun:rte_vlog=discard 
+fun:eal_memalloc_is_contig=discard +fun:eal_memalloc_mem_alloc_validate=discard +fun:eal_memalloc_mem_alloc_validator_register=discard +fun:eal_memalloc_mem_alloc_validator_unregister=discard +fun:eal_memalloc_mem_event_callback_register=discard +fun:eal_memalloc_mem_event_callback_unregister=discard +fun:eal_memalloc_mem_event_notify=discard +fun:eal_get_virtual_area=discard +fun:rte_dump_physmem_layout=discard +fun:rte_eal_get_physmem_size=discard +fun:rte_eal_memory_init=discard +fun:rte_mem_alloc_validator_register=discard +fun:rte_mem_alloc_validator_unregister=discard +fun:rte_mem_check_dma_mask=discard +fun:rte_mem_check_dma_mask_thread_unsafe=discard +fun:rte_mem_event_callback_register=discard +fun:rte_mem_event_callback_unregister=discard +fun:rte_mem_iova2virt=discard +fun:rte_mem_lock_page=discard +fun:rte_mem_set_dma_mask=discard +fun:rte_mem_virt2memseg=discard +fun:rte_mem_virt2memseg_list=discard +fun:rte_memory_get_nchannel=discard +fun:rte_memory_get_nrank=discard +fun:rte_memseg_contig_walk=discard +fun:rte_memseg_contig_walk_thread_unsafe=discard +fun:rte_memseg_get_fd=discard +fun:rte_memseg_get_fd_offset=discard +fun:rte_memseg_get_fd_offset_thread_unsafe=discard +fun:rte_memseg_get_fd_thread_unsafe=discard +fun:rte_memseg_list_walk=discard +fun:rte_memseg_list_walk_thread_unsafe=discard +fun:rte_memseg_walk=discard +fun:rte_memseg_walk_thread_unsafe=discard +fun:rte_eal_memzone_init=discard +fun:rte_memzone_dump=discard +fun:rte_memzone_free=discard +fun:rte_memzone_lookup=discard +fun:rte_memzone_reserve=discard +fun:rte_memzone_reserve_aligned=discard +fun:rte_memzone_reserve_bounded=discard +fun:rte_memzone_walk=discard +fun:eal_adjust_config=discard +fun:eal_check_common_options=discard +fun:eal_common_usage=discard +fun:eal_option_device_parse=discard +fun:eal_parse_common_option=discard +fun:eal_plugins_init=discard +fun:eal_reset_internal_config=discard +fun:rte_eal_primary_proc_alive=discard +fun:rte_mp_action_register=discard 
+fun:rte_mp_action_unregister=discard +fun:rte_mp_channel_init=discard +fun:rte_mp_reply=discard +fun:rte_mp_request_async=discard +fun:rte_mp_request_sync=discard +fun:rte_mp_sendmsg=discard +fun:rte_strscpy=discard +fun:rte_strsplit=discard +fun:rte_dump_tailq=discard +fun:rte_eal_tailq_lookup=discard +fun:rte_eal_tailq_register=discard +fun:rte_eal_tailqs_init=discard +fun:eal_cpuset_socket_id=discard +fun:eal_thread_dump_affinity=discard +fun:rte_ctrl_thread_create=discard +fun:rte_lcore_has_role=discard +fun:rte_socket_id=discard +fun:rte_thread_get_affinity=discard +fun:rte_thread_set_affinity=discard +fun:rte_delay_us_block=discard +fun:rte_delay_us_callback_register=discard +fun:rte_delay_us_sleep=discard +fun:rte_get_tsc_hz=discard +fun:set_tsc_freq=discard +fun:rte_uuid_compare=discard +fun:rte_uuid_is_null=discard +fun:rte_uuid_parse=discard +fun:rte_uuid_unparse=discard +fun:rte_cpu_getauxval=discard +fun:rte_cpu_strcmp_auxval=discard +fun:__rte_panic=discard +fun:rte_dump_registers=discard +fun:rte_dump_stack=discard +fun:rte_exit=discard +fun:dev_sigbus_handler_register=discard +fun:dev_sigbus_handler_unregister=discard +fun:rte_dev_event_monitor_start=discard +fun:rte_dev_event_monitor_stop=discard +fun:rte_dev_hotplug_handle_disable=discard +fun:rte_dev_hotplug_handle_enable=discard +fun:eal_hugepage_info_init=discard +fun:eal_hugepage_info_read=discard +fun:rte_eal_intr_init=discard +fun:rte_epoll_ctl=discard +fun:rte_epoll_wait=discard +fun:rte_intr_allow_others=discard +fun:rte_intr_callback_register=discard +fun:rte_intr_callback_unregister=discard +fun:rte_intr_cap_multiple=discard +fun:rte_intr_disable=discard +fun:rte_intr_dp_is_en=discard +fun:rte_intr_efd_disable=discard +fun:rte_intr_efd_enable=discard +fun:rte_intr_enable=discard +fun:rte_intr_free_epoll_fd=discard +fun:rte_intr_rx_ctl=discard +fun:rte_intr_tls_epfd=discard +fun:eal_cpu_core_id=discard +fun:eal_cpu_detected=discard +fun:eal_cpu_socket_id=discard 
+fun:rte_eal_log_init=discard +fun:eal_memalloc_alloc_seg=discard +fun:eal_memalloc_alloc_seg_bulk=discard +fun:eal_memalloc_free_seg=discard +fun:eal_memalloc_free_seg_bulk=discard +fun:eal_memalloc_get_seg_fd=discard +fun:eal_memalloc_get_seg_fd_offset=discard +fun:eal_memalloc_init=discard +fun:eal_memalloc_set_seg_fd=discard +fun:eal_memalloc_sync_with_primary=discard +fun:rte_eal_hugepage_attach=discard +fun:rte_eal_hugepage_init=discard +fun:rte_eal_memseg_init=discard +fun:rte_eal_using_phys_addrs=discard +fun:rte_mem_virt2iova=discard +fun:rte_mem_virt2phy=discard +fun:eal_thread_init_master=discard +fun:eal_thread_loop=discard +fun:rte_eal_remote_launch=discard +fun:rte_sys_gettid=discard +fun:rte_thread_setname=discard +fun:get_tsc_freq=discard +fun:rte_eal_hpet_init=discard +fun:rte_eal_timer_init=discard +fun:rte_get_hpet_cycles=discard +fun:rte_get_hpet_hz=discard +fun:rte_vfio_clear_group=discard +fun:rte_vfio_container_create=discard +fun:rte_vfio_container_destroy=discard +fun:rte_vfio_container_dma_map=discard +fun:rte_vfio_container_dma_unmap=discard +fun:rte_vfio_container_group_bind=discard +fun:rte_vfio_container_group_unbind=discard +fun:rte_vfio_dma_map=discard +fun:rte_vfio_dma_unmap=discard +fun:rte_vfio_enable=discard +fun:rte_vfio_get_container_fd=discard +fun:rte_vfio_get_group_fd=discard +fun:rte_vfio_get_group_num=discard +fun:rte_vfio_is_enabled=discard +fun:rte_vfio_noiommu_is_enabled=discard +fun:rte_vfio_release_device=discard +fun:rte_vfio_setup_device=discard +fun:vfio_get_default_container_fd=discard +fun:vfio_has_supported_extensions=discard +fun:vfio_set_iommu_type=discard +fun:vfio_mp_sync_setup=discard +fun:eal_dev_hotplug_request_to_primary=discard +fun:eal_dev_hotplug_request_to_secondary=discard +fun:rte_mp_dev_hotplug_init=discard +fun:malloc_elem_alloc=discard +fun:malloc_elem_can_hold=discard +fun:malloc_elem_dump=discard +fun:malloc_elem_find_max_iova_contig=discard +fun:malloc_elem_free=discard 
+fun:malloc_elem_free_list_index=discard +fun:malloc_elem_free_list_insert=discard +fun:malloc_elem_free_list_remove=discard +fun:malloc_elem_hide_region=discard +fun:malloc_elem_init=discard +fun:malloc_elem_insert=discard +fun:malloc_elem_join_adjacent_free=discard +fun:malloc_elem_resize=discard +fun:alloc_pages_on_heap=discard +fun:malloc_heap_add_external_memory=discard +fun:malloc_heap_alloc=discard +fun:malloc_heap_alloc_biggest=discard +fun:malloc_heap_create=discard +fun:malloc_heap_destroy=discard +fun:malloc_heap_dump=discard +fun:malloc_heap_free=discard +fun:malloc_heap_free_pages=discard +fun:malloc_heap_get_stats=discard +fun:malloc_heap_remove_external_memory=discard +fun:malloc_heap_resize=discard +fun:malloc_socket_to_heap_id=discard +fun:rollback_expand_heap=discard +fun:rte_eal_malloc_heap_init=discard +fun:register_mp_requests=discard +fun:request_sync=discard +fun:request_to_primary=discard +fun:rte_cpu_get_flag_enabled=discard +fun:rte_cpu_get_flag_name=discard +fun:get_tsc_freq_arch=discard +fun:rte_hypervisor_get=discard +fun:rte_keepalive_create=discard +fun:rte_keepalive_dispatch_pings=discard +fun:rte_keepalive_mark_alive=discard +fun:rte_keepalive_mark_sleep=discard +fun:rte_keepalive_register_core=discard +fun:rte_keepalive_register_relay_callback=discard +fun:rte_calloc=discard +fun:rte_calloc_socket=discard +fun:rte_free=discard +fun:rte_malloc=discard +fun:rte_malloc_dump_heaps=discard +fun:rte_malloc_dump_stats=discard +fun:rte_malloc_get_socket_stats=discard +fun:rte_malloc_heap_create=discard +fun:rte_malloc_heap_destroy=discard +fun:rte_malloc_heap_get_socket=discard +fun:rte_malloc_heap_memory_add=discard +fun:rte_malloc_heap_memory_attach=discard +fun:rte_malloc_heap_memory_detach=discard +fun:rte_malloc_heap_memory_remove=discard +fun:rte_malloc_heap_socket_is_external=discard +fun:rte_malloc_set_limit=discard +fun:rte_malloc_socket=discard +fun:rte_malloc_validate=discard +fun:rte_malloc_virt2iova=discard 
+fun:rte_realloc=discard +fun:rte_zmalloc=discard +fun:rte_zmalloc_socket=discard +fun:rte_option_init=discard +fun:rte_option_parse=discard +fun:rte_option_register=discard +fun:rte_reciprocal_value=discard +fun:rte_reciprocal_value_u64=discard +fun:rte_service_attr_get=discard +fun:rte_service_attr_reset_all=discard +fun:rte_service_component_register=discard +fun:rte_service_component_runstate_set=discard +fun:rte_service_component_unregister=discard +fun:rte_service_dump=discard +fun:rte_service_finalize=discard +fun:rte_service_get_by_name=discard +fun:rte_service_get_count=discard +fun:rte_service_get_name=discard +fun:rte_service_init=discard +fun:rte_service_lcore_add=discard +fun:rte_service_lcore_attr_get=discard +fun:rte_service_lcore_attr_reset_all=discard +fun:rte_service_lcore_count=discard +fun:rte_service_lcore_count_services=discard +fun:rte_service_lcore_del=discard +fun:rte_service_lcore_list=discard +fun:rte_service_lcore_reset_all=discard +fun:rte_service_lcore_start=discard +fun:rte_service_lcore_stop=discard +fun:rte_service_map_lcore_get=discard +fun:rte_service_map_lcore_set=discard +fun:rte_service_may_be_active=discard +fun:rte_service_probe_capability=discard +fun:rte_service_run_iter_on_app_lcore=discard +fun:rte_service_runstate_get=discard +fun:rte_service_runstate_set=discard +fun:rte_service_set_runstate_mapped_check=discard +fun:rte_service_set_stats_enable=discard +fun:rte_service_start_with_defaults=discard +fun:eth_dev_to_id=discard +fun:eth_find_device=discard +fun:rte_eth_devargs_parse_list=discard +fun:rte_eth_devargs_parse_representor_ports=discard +fun:__rte_eth_dev_profile_init=discard +fun:_rte_eth_dev_callback_process=discard +fun:_rte_eth_dev_reset=discard +fun:rte_eth_add_first_rx_callback=discard +fun:rte_eth_add_rx_callback=discard +fun:rte_eth_add_tx_callback=discard +fun:rte_eth_allmulticast_disable=discard +fun:rte_eth_allmulticast_enable=discard +fun:rte_eth_allmulticast_get=discard 
+fun:rte_eth_dev_adjust_nb_rx_tx_desc=discard +fun:rte_eth_dev_allocate=discard +fun:rte_eth_dev_allocated=discard +fun:rte_eth_dev_attach_secondary=discard +fun:rte_eth_dev_callback_register=discard +fun:rte_eth_dev_callback_unregister=discard +fun:rte_eth_dev_close=discard +fun:rte_eth_dev_configure=discard +fun:rte_eth_dev_count=discard +fun:rte_eth_dev_count_avail=discard +fun:rte_eth_dev_count_total=discard +fun:rte_eth_dev_create=discard +fun:rte_eth_dev_default_mac_addr_set=discard +fun:rte_eth_dev_destroy=discard +fun:rte_eth_dev_filter_ctrl=discard +fun:rte_eth_dev_filter_supported=discard +fun:rte_eth_dev_flow_ctrl_get=discard +fun:rte_eth_dev_flow_ctrl_set=discard +fun:rte_eth_dev_fw_version_get=discard +fun:rte_eth_dev_get_dcb_info=discard +fun:rte_eth_dev_get_eeprom=discard +fun:rte_eth_dev_get_eeprom_length=discard +fun:rte_eth_dev_get_module_eeprom=discard +fun:rte_eth_dev_get_module_info=discard +fun:rte_eth_dev_get_mtu=discard +fun:rte_eth_dev_get_name_by_port=discard +fun:rte_eth_dev_get_port_by_name=discard +fun:rte_eth_dev_get_reg_info=discard +fun:rte_eth_dev_get_sec_ctx=discard +fun:rte_eth_dev_get_supported_ptypes=discard +fun:rte_eth_dev_get_vlan_offload=discard +fun:rte_eth_dev_info_get=discard +fun:rte_eth_dev_is_removed=discard +fun:rte_eth_dev_is_valid_port=discard +fun:rte_eth_dev_l2_tunnel_eth_type_conf=discard +fun:rte_eth_dev_l2_tunnel_offload_set=discard +fun:rte_eth_dev_mac_addr_add=discard +fun:rte_eth_dev_mac_addr_remove=discard +fun:rte_eth_dev_owner_delete=discard +fun:rte_eth_dev_owner_get=discard +fun:rte_eth_dev_owner_new=discard +fun:rte_eth_dev_owner_set=discard +fun:rte_eth_dev_owner_unset=discard +fun:rte_eth_dev_pool_ops_supported=discard +fun:rte_eth_dev_priority_flow_ctrl_set=discard +fun:rte_eth_dev_probing_finish=discard +fun:rte_eth_dev_release_port=discard +fun:rte_eth_dev_reset=discard +fun:rte_eth_dev_rss_hash_conf_get=discard +fun:rte_eth_dev_rss_hash_update=discard +fun:rte_eth_dev_rss_reta_query=discard 
+fun:rte_eth_dev_rss_reta_update=discard +fun:rte_eth_dev_rx_intr_ctl=discard +fun:rte_eth_dev_rx_intr_ctl_q=discard +fun:rte_eth_dev_rx_intr_ctl_q_get_fd=discard +fun:rte_eth_dev_rx_intr_disable=discard +fun:rte_eth_dev_rx_intr_enable=discard +fun:rte_eth_dev_rx_offload_name=discard +fun:rte_eth_dev_rx_queue_start=discard +fun:rte_eth_dev_rx_queue_stop=discard +fun:rte_eth_dev_set_eeprom=discard +fun:rte_eth_dev_set_link_down=discard +fun:rte_eth_dev_set_link_up=discard +fun:rte_eth_dev_set_mc_addr_list=discard +fun:rte_eth_dev_set_mtu=discard +fun:rte_eth_dev_set_rx_queue_stats_mapping=discard +fun:rte_eth_dev_set_tx_queue_stats_mapping=discard +fun:rte_eth_dev_set_vlan_ether_type=discard +fun:rte_eth_dev_set_vlan_offload=discard +fun:rte_eth_dev_set_vlan_pvid=discard +fun:rte_eth_dev_set_vlan_strip_on_queue=discard +fun:rte_eth_dev_socket_id=discard +fun:rte_eth_dev_start=discard +fun:rte_eth_dev_stop=discard +fun:rte_eth_dev_tx_offload_name=discard +fun:rte_eth_dev_tx_queue_start=discard +fun:rte_eth_dev_tx_queue_stop=discard +fun:rte_eth_dev_uc_all_hash_table_set=discard +fun:rte_eth_dev_uc_hash_table_set=discard +fun:rte_eth_dev_udp_tunnel_port_add=discard +fun:rte_eth_dev_udp_tunnel_port_delete=discard +fun:rte_eth_dev_vlan_filter=discard +fun:rte_eth_devargs_parse=discard +fun:rte_eth_dma_zone_reserve=discard +fun:rte_eth_find_next=discard +fun:rte_eth_find_next_owned_by=discard +fun:rte_eth_iterator_cleanup=discard +fun:rte_eth_iterator_init=discard +fun:rte_eth_iterator_next=discard +fun:rte_eth_led_off=discard +fun:rte_eth_led_on=discard +fun:rte_eth_link_get=discard +fun:rte_eth_link_get_nowait=discard +fun:rte_eth_macaddr_get=discard +fun:rte_eth_mirror_rule_reset=discard +fun:rte_eth_mirror_rule_set=discard +fun:rte_eth_promiscuous_disable=discard +fun:rte_eth_promiscuous_enable=discard +fun:rte_eth_promiscuous_get=discard +fun:rte_eth_remove_rx_callback=discard +fun:rte_eth_remove_tx_callback=discard +fun:rte_eth_rx_queue_info_get=discard 
+fun:rte_eth_rx_queue_setup=discard +fun:rte_eth_set_queue_rate_limit=discard +fun:rte_eth_speed_bitflag=discard +fun:rte_eth_stats_get=discard +fun:rte_eth_stats_reset=discard +fun:rte_eth_switch_domain_alloc=discard +fun:rte_eth_switch_domain_free=discard +fun:rte_eth_timesync_adjust_time=discard +fun:rte_eth_timesync_disable=discard +fun:rte_eth_timesync_enable=discard +fun:rte_eth_timesync_read_rx_timestamp=discard +fun:rte_eth_timesync_read_time=discard +fun:rte_eth_timesync_read_tx_timestamp=discard +fun:rte_eth_timesync_write_time=discard +fun:rte_eth_tx_buffer_count_callback=discard +fun:rte_eth_tx_buffer_drop_callback=discard +fun:rte_eth_tx_buffer_init=discard +fun:rte_eth_tx_buffer_set_err_callback=discard +fun:rte_eth_tx_done_cleanup=discard +fun:rte_eth_tx_queue_info_get=discard +fun:rte_eth_tx_queue_setup=discard +fun:rte_eth_xstats_get=discard +fun:rte_eth_xstats_get_by_id=discard +fun:rte_eth_xstats_get_id_by_name=discard +fun:rte_eth_xstats_get_names=discard +fun:rte_eth_xstats_get_names_by_id=discard +fun:rte_eth_xstats_reset=discard +fun:rte_flow_conv=discard +fun:rte_flow_copy=discard +fun:rte_flow_create=discard +fun:rte_flow_destroy=discard +fun:rte_flow_error_set=discard +fun:rte_flow_expand_rss=discard +fun:rte_flow_flush=discard +fun:rte_flow_isolate=discard +fun:rte_flow_ops_get=discard +fun:rte_flow_query=discard +fun:rte_flow_validate=discard +fun:rte_mtr_capabilities_get=discard +fun:rte_mtr_create=discard +fun:rte_mtr_destroy=discard +fun:rte_mtr_meter_disable=discard +fun:rte_mtr_meter_dscp_table_update=discard +fun:rte_mtr_meter_enable=discard +fun:rte_mtr_meter_profile_add=discard +fun:rte_mtr_meter_profile_delete=discard +fun:rte_mtr_meter_profile_update=discard +fun:rte_mtr_ops_get=discard +fun:rte_mtr_policer_actions_update=discard +fun:rte_mtr_stats_read=discard +fun:rte_mtr_stats_update=discard +fun:rte_tm_capabilities_get=discard +fun:rte_tm_get_number_of_leaf_nodes=discard +fun:rte_tm_hierarchy_commit=discard 
+fun:rte_tm_level_capabilities_get=discard +fun:rte_tm_mark_ip_dscp=discard +fun:rte_tm_mark_ip_ecn=discard +fun:rte_tm_mark_vlan_dei=discard +fun:rte_tm_node_add=discard +fun:rte_tm_node_capabilities_get=discard +fun:rte_tm_node_cman_update=discard +fun:rte_tm_node_delete=discard +fun:rte_tm_node_parent_update=discard +fun:rte_tm_node_resume=discard +fun:rte_tm_node_shaper_update=discard +fun:rte_tm_node_shared_shaper_update=discard +fun:rte_tm_node_shared_wred_context_update=discard +fun:rte_tm_node_stats_read=discard +fun:rte_tm_node_stats_update=discard +fun:rte_tm_node_suspend=discard +fun:rte_tm_node_type_get=discard +fun:rte_tm_node_wfq_weight_mode_update=discard +fun:rte_tm_node_wred_context_update=discard +fun:rte_tm_ops_get=discard +fun:rte_tm_shaper_profile_add=discard +fun:rte_tm_shaper_profile_delete=discard +fun:rte_tm_shared_shaper_add_update=discard +fun:rte_tm_shared_shaper_delete=discard +fun:rte_tm_shared_wred_context_add_update=discard +fun:rte_tm_shared_wred_context_delete=discard +fun:rte_tm_wred_profile_add=discard +fun:rte_tm_wred_profile_delete=discard +fun:gro_tcp4_reassemble=discard +fun:gro_tcp4_tbl_create=discard +fun:gro_tcp4_tbl_destroy=discard +fun:gro_tcp4_tbl_pkt_count=discard +fun:gro_tcp4_tbl_timeout_flush=discard +fun:gro_vxlan_tcp4_reassemble=discard +fun:gro_vxlan_tcp4_tbl_create=discard +fun:gro_vxlan_tcp4_tbl_destroy=discard +fun:gro_vxlan_tcp4_tbl_pkt_count=discard +fun:gro_vxlan_tcp4_tbl_timeout_flush=discard +fun:rte_gro_ctx_create=discard +fun:rte_gro_ctx_destroy=discard +fun:rte_gro_get_pkt_count=discard +fun:rte_gro_reassemble=discard +fun:rte_gro_reassemble_burst=discard +fun:rte_gro_timeout_flush=discard +fun:rte_hash_add_key=discard +fun:rte_hash_add_key_data=discard +fun:rte_hash_add_key_with_hash=discard +fun:rte_hash_add_key_with_hash_data=discard +fun:rte_hash_count=discard +fun:rte_hash_create=discard +fun:rte_hash_del_key=discard +fun:rte_hash_del_key_with_hash=discard +fun:rte_hash_find_existing=discard 
+fun:rte_hash_free=discard +fun:rte_hash_free_key_with_position=discard +fun:rte_hash_get_key_with_position=discard +fun:rte_hash_hash=discard +fun:rte_hash_iterate=discard +fun:rte_hash_lookup=discard +fun:rte_hash_lookup_bulk=discard +fun:rte_hash_lookup_bulk_data=discard +fun:rte_hash_lookup_data=discard +fun:rte_hash_lookup_with_hash=discard +fun:rte_hash_lookup_with_hash_data=discard +fun:rte_hash_reset=discard +fun:rte_hash_set_cmp_func=discard +fun:rte_fbk_hash_create=discard +fun:rte_fbk_hash_find_existing=discard +fun:rte_fbk_hash_free=discard +fun:ip_frag_find=discard +fun:ip_frag_lookup=discard +fun:ip_frag_process=discard +fun:rte_frag_table_del_expired_entries=discard +fun:rte_ip_frag_free_death_row=discard +fun:rte_ip_frag_table_create=discard +fun:rte_ip_frag_table_destroy=discard +fun:rte_ip_frag_table_statistics_dump=discard +fun:rte_ipv4_fragment_packet=discard +fun:ipv4_frag_reassemble=discard +fun:rte_ipv4_frag_reassemble_packet=discard +fun:rte_ipv6_fragment_packet=discard +fun:ipv6_frag_reassemble=discard +fun:rte_ipv6_frag_reassemble_packet=discard +fun:rte_kvargs_count=discard +fun:rte_kvargs_free=discard +fun:rte_kvargs_parse=discard +fun:rte_kvargs_parse_delim=discard +fun:rte_kvargs_process=discard +fun:rte_kvargs_strcmp=discard +fun:__rte_pktmbuf_read=discard +fun:rte_get_rx_ol_flag_list=discard +fun:rte_get_rx_ol_flag_name=discard +fun:rte_get_tx_ol_flag_list=discard +fun:rte_get_tx_ol_flag_name=discard +fun:rte_mbuf_sanity_check=discard +fun:rte_pktmbuf_dump=discard +fun:rte_pktmbuf_dynamic_pool_create=discard +fun:rte_pktmbuf_init=discard +fun:rte_pktmbuf_pool_create=discard +fun:rte_pktmbuf_pool_create_by_ops=discard +fun:rte_pktmbuf_pool_init=discard +fun:rte_mbuf_best_mempool_ops=discard +fun:rte_mbuf_platform_mempool_ops=discard +fun:rte_mbuf_set_platform_mempool_ops=discard +fun:rte_mbuf_set_user_mempool_ops=discard +fun:rte_mbuf_user_mempool_ops=discard +fun:rte_get_ptype_inner_l2_name=discard 
+fun:rte_get_ptype_inner_l3_name=discard +fun:rte_get_ptype_inner_l4_name=discard +fun:rte_get_ptype_l2_name=discard +fun:rte_get_ptype_l3_name=discard +fun:rte_get_ptype_l4_name=discard +fun:rte_get_ptype_name=discard +fun:rte_get_ptype_tunnel_name=discard +fun:rte_mempool_audit=discard +fun:rte_mempool_avail_count=discard +fun:rte_mempool_cache_create=discard +fun:rte_mempool_cache_free=discard +fun:rte_mempool_calc_obj_size=discard +fun:rte_mempool_check_cookies=discard +fun:rte_mempool_contig_blocks_check_cookies=discard +fun:rte_mempool_create=discard +fun:rte_mempool_create_empty=discard +fun:rte_mempool_dump=discard +fun:rte_mempool_free=discard +fun:rte_mempool_in_use_count=discard +fun:rte_mempool_list_dump=discard +fun:rte_mempool_lookup=discard +fun:rte_mempool_mem_iter=discard +fun:rte_mempool_obj_iter=discard +fun:rte_mempool_populate_anon=discard +fun:rte_mempool_populate_default=discard +fun:rte_mempool_populate_iova=discard +fun:rte_mempool_populate_virt=discard +fun:rte_mempool_walk=discard +fun:rte_mempool_ops_alloc=discard +fun:rte_mempool_ops_calc_mem_size=discard +fun:rte_mempool_ops_free=discard +fun:rte_mempool_ops_get_count=discard +fun:rte_mempool_ops_get_info=discard +fun:rte_mempool_ops_populate=discard +fun:rte_mempool_register_ops=discard +fun:rte_mempool_set_ops_byname=discard +fun:rte_mempool_op_calc_mem_size_default=discard +fun:rte_mempool_op_populate_default=discard +fun:rte_metrics_get_names=discard +fun:rte_metrics_get_values=discard +fun:rte_metrics_init=discard +fun:rte_metrics_reg_name=discard +fun:rte_metrics_reg_names=discard +fun:rte_metrics_update_value=discard +fun:rte_metrics_update_values=discard +fun:rte_net_make_rarp_packet=discard +fun:rte_net_get_ptype=discard +fun:rte_net_skip_ip6_ext=discard +fun:rte_net_crc_calc=discard +fun:rte_net_crc_set_alg=discard +fun:eal_parse_pci_BDF=discard +fun:eal_parse_pci_DomBDF=discard +fun:pci_map_resource=discard +fun:pci_unmap_resource=discard 
+fun:rte_eal_compare_pci_addr=discard +fun:rte_pci_addr_cmp=discard +fun:rte_pci_addr_parse=discard +fun:rte_pci_device_name=discard +fun:rte_eth_from_ring=discard +fun:rte_eth_from_rings=discard +fun:sock_support_features=discard +fun:vhost_kernel_open_sock=discard +fun:vhost_kernel_set_sock=discard +fun:tap_support_features=discard +fun:vhost_kernel_open_tap=discard +fun:eth_virtio_dev_init=discard +fun:virtio_dev_pause=discard +fun:virtio_dev_resume=discard +fun:virtio_inject_pkts=discard +fun:virtio_interrupt_handler=discard +fun:vtpci_get_status=discard +fun:vtpci_init=discard +fun:vtpci_isr=discard +fun:vtpci_msix_detect=discard +fun:vtpci_negotiate_features=discard +fun:vtpci_read_dev_config=discard +fun:vtpci_reinit_complete=discard +fun:vtpci_reset=discard +fun:vtpci_set_status=discard +fun:vtpci_write_dev_config=discard +fun:virtio_dev_cq_start=discard +fun:virtio_dev_rx_queue_done=discard +fun:virtio_dev_rx_queue_setup=discard +fun:virtio_dev_rx_queue_setup_finish=discard +fun:virtio_dev_tx_queue_setup=discard +fun:virtio_dev_tx_queue_setup_finish=discard +fun:virtio_recv_mergeable_pkts=discard +fun:virtio_recv_mergeable_pkts_inorder=discard +fun:virtio_recv_pkts=discard +fun:virtio_xmit_pkts=discard +fun:virtio_xmit_pkts_inorder=discard +fun:vq_ring_free_chain=discard +fun:vq_ring_free_inorder=discard +fun:virtio_rxq_vec_setup=discard +fun:virtio_recv_pkts_vec=discard +fun:is_vhost_user_by_type=discard +fun:virtio_user_dev_init=discard +fun:virtio_user_dev_uninit=discard +fun:virtio_user_handle_cq=discard +fun:virtio_user_handle_mq=discard +fun:virtio_user_start_device=discard +fun:virtio_user_stop_device=discard +fun:virtqueue_detach_unused=discard +fun:virtqueue_rxvq_flush=discard +fun:rte_ring_create=discard +fun:rte_ring_dump=discard +fun:rte_ring_free=discard +fun:rte_ring_get_memsize=discard +fun:rte_ring_init=discard +fun:rte_ring_list_dump=discard +fun:rte_ring_lookup=discard +fun:rte_timer_dump_stats=discard +fun:rte_timer_init=discard 
+fun:rte_timer_manage=discard +fun:rte_timer_pending=discard +fun:rte_timer_reset=discard +fun:rte_timer_reset_sync=discard +fun:rte_timer_stop=discard +fun:rte_timer_stop_sync=discard +fun:rte_timer_subsystem_init=discard diff --git a/angora/run.sh b/angora/run.sh new file mode 100644 index 0000000..be215f1 --- /dev/null +++ b/angora/run.sh @@ -0,0 +1 @@ +~/git/Angora/angora_fuzzer -M 2048 -i seeds -o output -t /root/git/uss/angora/tcp_lo.taint -- /root/git/uss/angora/tcp_lo.fast 127.0.0.1 1234 @@ diff --git a/angora/seeds/seed.txt b/angora/seeds/seed.txt new file mode 100644 index 0000000..f534deb --- /dev/null +++ b/angora/seeds/seed.txt @@ -0,0 +1 @@ +Hello World. diff --git a/dpdk/Makefile b/dpdk/Makefile index 15204fa..5d92719 100644 --- a/dpdk/Makefile +++ b/dpdk/Makefile @@ -21,10 +21,12 @@ DPDK_PKTMBUF_HEADROOM ?= 128 DPDK_MARCH ?= native DPDK_TUNE ?= generic DPDK_DEBUG ?= n +DPDK_DESTDIR ?= $(CURDIR)/install +PACKETDRILL ?= n B := $(DPDK_BUILD_DIR) I := $(DPDK_INSTALL_DIR) -DPDK_GIT_REPO ?= http://dpdk.org/git/dpdk +DPDK_GIT_REPO ?= http://dpdk.org/git/dpdk -b v18.11 DPDK_SOURCE := $(B)/dpdk ifneq (,$(findstring clang,$(CC))) @@ -40,8 +42,8 @@ endif JOBS := $(shell grep processor /proc/cpuinfo | wc -l) # compiler/linker custom arguments -DPDK_CPU_CFLAGS := -pie -fPIC -DPDK_CPU_LDFLAGS := +DPDK_CPU_CFLAGS := -fPIC +DPDK_CPU_LDFLAGS := -r DPDK_EXTRA_LDFLAGS := -g ifeq ($(DPDK_DEBUG),n) @@ -78,6 +80,7 @@ DPDK_MAKE_ARGS := -C $(DPDK_SOURCE) -j $(JOBS) \ EXTRA_LDFLAGS="$(DPDK_EXTRA_LDFLAGS)" \ CPU_CFLAGS="$(DPDK_CPU_CFLAGS)" \ CPU_LDFLAGS="$(DPDK_CPU_LDFLAGS)" \ + DESTDIR="$(DPDK_DESTDIR)" \ $(DPDK_MAKE_EXTRA_ARGS) DPDK_SOURCE_FILES := $(shell [ -e $(DPDK_SOURCE) ] && \ @@ -102,7 +105,7 @@ $(B)/custom-config: $(B)/.patch.ok Makefile $(call set,RTE_MAX_LCORE,256) $(call set,RTE_PKTMBUF_HEADROOM,$(DPDK_PKTMBUF_HEADROOM)) $(call set,RTE_LIBEAL_USE_HPET,y) - $(call set,RTE_BUILD_COMBINE_LIBS,y) + $(call set,RTE_BUILD_COMBINE_LIBS,n) $(call 
set,RTE_LIBRTE_I40E_16BYTE_RX_DESC,y) $(call set,RTE_LIBRTE_I40E_ITR_INTERVAL,16) $(call set,RTE_LIBRTE_PMD_PCAP,y) @@ -115,13 +118,101 @@ $(B)/custom-config: $(B)/.patch.ok Makefile $(call set,RTE_LIBRTE_PMD_BOND,y) $(call set,RTE_LIBRTE_IP_FRAG,y) @# not needed + $(call set,RTE_LIBRTE_TIMER,y) $(call set,RTE_LIBRTE_CFGFILE,n) + $(call set,RTE_LIBRTE_LPM,y) + $(call set,RTE_LIBRTE_ACL,n) $(call set,RTE_LIBRTE_POWER,n) $(call set,RTE_LIBRTE_DISTRIBUTOR,n) $(call set,RTE_LIBRTE_REORDER,n) + $(call set,RTE_LIBRTE_PORT,n) + $(call set,RTE_LIBRTE_TABLE,n) + $(call set,RTE_LIBRTE_PIPELINE,n) $(call set,RTE_LIBRTE_FLOW_CLASSIFY,n) $(call set,RTE_LIBRTE_PMD_CRYPTO_SCHEDULER,n) $(call set,RTE_KNI_KMOD,n) + $(call set,RTE_LIBRTE_ENA_PMD,n) + $(call set,RTE_LIBRTE_FM10K_PMD,n) + $(call set,RTE_LIBRTE_CXGBE_PMD,n) + $(call set,RTE_LIBRTE_ENIC_PMD,n) + $(call set,RTE_LIBRTE_BNXT_PMD,n) + $(call set,RTE_LIBRTE_SFC_EFX_PMD,n) + $(call set,RTE_LIBRTE_PMD_SOFTNIC,n) + $(call set,RTE_LIBRTE_THUNDERX_NICVF_PMD,n) + $(call set,RTE_LIBRTE_LIO_PMD,n) + $(call set,RTE_LIBRTE_OCTEONTX_PMD,n) + $(call set,RTE_LIBRTE_VMXNET3_PMD,n) + $(call set,RTE_LIBRTE_QEDE_PMD,n) + $(call set,RTE_LIBRTE_ARK_PMD,n) + $(call set,RTE_LIBRTE_PMD_NULL,n) + $(call set,RTE_LIBRTE_CRYPTODEV,n) + $(call set,RTE_LIBRTE_PMD_NULL_CRYPTO,n) + $(call set,RTE_LIBRTE_SECURITY,n) + $(call set,RTE_LIBRTE_EVENTDEV,n) + $(call set,RTE_LIBRTE_PMD_SKELETON_EVENTDEV,n) + $(call set,RTE_LIBRTE_PMD_OCTEONTX_SSOVF,n) + $(call set,RTE_LIBRTE_OCTEONTX_MEMPOOL,n) + $(call set,RTE_LIBRTE_EFD,n) + $(call set,RTE_LIBRTE_MEMBER,n) + $(call set,RTE_LIBRTE_JOBSTATS,n) + $(call set,RTE_LIBRTE_METER,n) + $(call set,RTE_LIBRTE_SCHED,n) + $(call set,RTE_APP_TEST,n) + $(call set,RTE_APP_CRYPTO_PERF,n) + $(call set,RTE_APP_EVENTDEV,n) + $(call set,RTE_LIBRTE_PMD_FAILSAFE,n) + $(call set,RTE_LIBRTE_EM_PMD,n) + $(call set,RTE_LIBRTE_IGB_PMD,n) + $(call set,RTE_LIBRTE_LATENCY_STATS,n) + $(call set,RTE_EAL_IGB_UIO,n) + $(call 
set,RTE_LIBRTE_KNI,n) + $(call set,RTE_LIBRTE_PMD_KNI,n) + $(call set,RTE_KNI_KMOD,n) + $(call set,RTE_KNI_KMOD_ETHTOOL,n) + $(call set,RTE_LIBRTE_BITRATE,n) + $(call set,RTE_LIBRTE_METRICS,y) + $(call set,RTE_LIBRTE_AVP_PMD,n) + $(call set,RTE_LIBRTE_NFP_PMD,n) + $(call set,RTE_LIBRTE_PMD_TAP,n) + $(call set,RTE_LIBRTE_VHOST,$(PACKETDRILL)) + $(call set,RTE_LIBRTE_IFC_PMD,n) + $(call set,RTE_LIBRTE_PMD_VHOST,n) + $(call set,RTE_PROC_INFO,n) + $(call set,RTE_TEST_PMD,n) + $(call set,RTE_LIBRTE_FSLMC_BUS,n) + $(call set,RTE_LIBRTE_DPAA_BUS,n) + $(call set,RTE_LIBRTE_VMBUS,n) + $(call set,RTE_LIBRTE_IFPGA_BUS,n) + $(call set,RTE_LIBRTE_BPF,n) + $(call set,RTE_LIBRTE_COMPRESSDEV,n) + $(call set,RTE_LIBRTE_VDEV_NETVSC_PMD,n) + $(call set,RTE_LIBRTE_NETVSC_PMD,n) + $(call set,RTE_LIBRTE_RAWDEV,n) + $(call set,RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT,n) + $(call set,RTE_LIBRTE_AXGBE_PMD,n) + $(call set,RTE_LIBRTE_AVF_PMD,n) + $(call set,RTE_LIBRTE_BBDEV,n) + $(call set,RTE_LIBRTE_IP_FRAG_MAX_FRAG,48) + $(call set,RTE_MAX_NUMA_NODES,2) + $(call set,RTE_MAX_ETHPORTS,4) + $(call set,RTE_MAX_QUEUES_PER_PORT,8) + $(call set,RTE_LIBRTE_I40E_PMD,n) + $(call set,RTE_LIBRTE_IXGBE_PMD,n) + $(call set,RTE_LIBRTE_ENETC_PMD,n) + $(call set,RTE_LIBRTE_PMD_BOND,n) + $(call set,RTE_LIBRTE_ATLANTIC_PMD,n) + $(call set,RTE_LIBRTE_GSO,n) + $(call set,RTE_MAX_VFIO_GROUPS,4) + $(call set,RTE_MAX_VFIO_CONTAINERS,4) + $(call set,RTE_LIBRTE_COMMON_DPAAX,n) + $(call set,RTE_LIBRTE_PMD_OCTEONTX_CRYPTO,n) + $(call set,RTE_EAL_NUMA_AWARE_HUGEPAGES,n) + $(call set,RTE_DRIVER_MEMPOOL_STACK,y) + $(call set,RTE_DRIVER_MEMPOOL_BUCKET,n) + $(call set,RTE_LIBRTE_PMD_QAT,n) + $(call set,RTE_LIBRTE_PMD_AF_PACKET,n) + $(call set,RTE_MAX_MEM_MB,1024) + $(call set,RTE_LIBRTE_PDUMP,n) @rm -f .config.ok $(B)/.download.ok: @@ -165,4 +256,4 @@ build: $(B)/.build.ok .PHONY: clean clean: - @rm -rf $(B) $(I) + @rm -rf $(DPDK_BUILD_DIR) $(DPDK_DESTDIR) diff --git 
a/dpdk/dpdk-v18.11_patches/0001-eal-don-t-start-the-interrupt-mp-thread.patch b/dpdk/dpdk-v18.11_patches/0001-eal-don-t-start-the-interrupt-mp-thread.patch new file mode 100644 index 0000000..770bf05 --- /dev/null +++ b/dpdk/dpdk-v18.11_patches/0001-eal-don-t-start-the-interrupt-mp-thread.patch @@ -0,0 +1,35 @@ +From f68558b0ccbddb4cc81aca36befa0a7730ee051c Mon Sep 17 00:00:00 2001 +From: Jianfeng Tan +Date: Wed, 29 Aug 2018 14:24:01 +0000 +Subject: [PATCH 7/9] eal: don't start the interrupt mp thread + +--- + lib/librte_eal/common/eal_common_proc.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c +index 9fcb91219..79d9e6bbe 100644 +--- a/lib/librte_eal/common/eal_common_proc.c ++++ b/lib/librte_eal/common/eal_common_proc.c +@@ -615,6 +615,7 @@ rte_mp_channel_init(void) + return -1; + } + ++#if 0 + if (rte_ctrl_thread_create(&mp_handle_tid, "rte_mp_handle", + NULL, mp_handle, NULL) < 0) { + RTE_LOG(ERR, EAL, "failed to create mp thead: %s\n", +@@ -624,6 +625,10 @@ rte_mp_channel_init(void) + mp_fd = -1; + return -1; + } ++#else ++ RTE_SET_USED(mp_handle); ++ RTE_SET_USED(mp_handle_tid); ++#endif + + /* unlock the directory */ + flock(dir_fd, LOCK_UN); +-- +2.17.1 + diff --git a/dpdk/dpdk-v18.11_patches/0002-eal-prioritize-constructor.patch b/dpdk/dpdk-v18.11_patches/0002-eal-prioritize-constructor.patch new file mode 100644 index 0000000..9d2959f --- /dev/null +++ b/dpdk/dpdk-v18.11_patches/0002-eal-prioritize-constructor.patch @@ -0,0 +1,25 @@ +From 7fe32567994a8ce782fa8406613bade1d2100dca Mon Sep 17 00:00:00 2001 +From: Jianfeng Tan +Date: Wed, 29 Aug 2018 14:14:09 +0000 +Subject: [PATCH 2/9] eal: prioritize constructor + +--- + lib/librte_eal/common/include/rte_common.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/lib/librte_eal/common/include/rte_common.h b/lib/librte_eal/common/include/rte_common.h +index 069c13ec7..a635f5be4 100644 +--- 
a/lib/librte_eal/common/include/rte_common.h ++++ b/lib/librte_eal/common/include/rte_common.h +@@ -84,7 +84,7 @@ typedef uint16_t unaligned_uint16_t; + #define RTE_PRIORITY_LOG 101 + #define RTE_PRIORITY_BUS 110 + #define RTE_PRIORITY_CLASS 120 +-#define RTE_PRIORITY_LAST 65535 ++#define RTE_PRIORITY_LAST 130 + + #define RTE_PRIO(prio) \ + RTE_PRIORITY_ ## prio +-- +2.17.1 + diff --git a/dpdk/dpdk-v18.11_patches/0003-mbuf-add-single-linked-list.patch b/dpdk/dpdk-v18.11_patches/0003-mbuf-add-single-linked-list.patch new file mode 100644 index 0000000..7430d1e --- /dev/null +++ b/dpdk/dpdk-v18.11_patches/0003-mbuf-add-single-linked-list.patch @@ -0,0 +1,33 @@ +From 1416ff5de58922dc32eb2fb9ce2b9b970282136c Mon Sep 17 00:00:00 2001 +From: Jianfeng Tan +Date: Wed, 29 Aug 2018 14:18:13 +0000 +Subject: [PATCH 3/9] mbuf: add single linked list + +--- + lib/librte_mbuf/rte_mbuf.h | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h +index 9ce5d76d7..0081bd6d7 100644 +--- a/lib/librte_mbuf/rte_mbuf.h ++++ b/lib/librte_mbuf/rte_mbuf.h +@@ -593,6 +593,8 @@ struct rte_mbuf { + */ + struct rte_mbuf_ext_shared_info *shinfo; + ++ struct rte_mbuf *next_pkt; ++ + } __rte_cache_aligned; + + /** +@@ -1237,6 +1239,7 @@ static inline void rte_pktmbuf_reset_headroom(struct rte_mbuf *m) + static inline void rte_pktmbuf_reset(struct rte_mbuf *m) + { + m->next = NULL; ++ m->next_pkt = NULL; + m->pkt_len = 0; + m->tx_offload = 0; + m->vlan_tci = 0; +-- +2.17.1 + diff --git a/dpdk/dpdk-v18.11_patches/0004-net-virtio-user-add-rss-update-for-virtio-user.patch b/dpdk/dpdk-v18.11_patches/0004-net-virtio-user-add-rss-update-for-virtio-user.patch new file mode 100644 index 0000000..e4eb8e7 --- /dev/null +++ b/dpdk/dpdk-v18.11_patches/0004-net-virtio-user-add-rss-update-for-virtio-user.patch @@ -0,0 +1,43 @@ +From 9bbe20eda858fd7fcbd8f137e5f96f51d571a556 Mon Sep 17 00:00:00 2001 +From: Jianfeng Tan +Date: Wed, 29 Aug 2018 14:20:51 +0000 
+Subject: [PATCH 4/9] net/virtio-user: add rss update for virtio-user + +--- + drivers/net/virtio/virtio_ethdev.c | 13 +++++++++++++ + 1 file changed, 13 insertions(+) + +diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c +index 614357da7..e7336cde9 100644 +--- a/drivers/net/virtio/virtio_ethdev.c ++++ b/drivers/net/virtio/virtio_ethdev.c +@@ -738,6 +738,18 @@ virtio_dev_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id) + return 0; + } + ++static int ++virtio_rss_hash_update(struct rte_eth_dev *dev, ++ struct rte_eth_rss_conf *rss_conf __rte_unused) ++{ ++ struct virtio_hw *hw = dev->data->dev_private; ++ ++ if (hw->virtio_user_dev) ++ return 0; ++ ++ return -1; ++} ++ + /* + * dev_ops for virtio, bare necessities for basic operation + */ +@@ -772,6 +784,7 @@ static const struct eth_dev_ops virtio_eth_dev_ops = { + .mac_addr_add = virtio_mac_addr_add, + .mac_addr_remove = virtio_mac_addr_remove, + .mac_addr_set = virtio_mac_addr_set, ++ .rss_hash_update = virtio_rss_hash_update, + }; + + static void +-- +2.17.1 + diff --git a/dpdk/dpdk-v18.11_patches/0005-net-virtio-user-support-raw-socket-as-backend.patch b/dpdk/dpdk-v18.11_patches/0005-net-virtio-user-support-raw-socket-as-backend.patch new file mode 100644 index 0000000..1d950c5 --- /dev/null +++ b/dpdk/dpdk-v18.11_patches/0005-net-virtio-user-support-raw-socket-as-backend.patch @@ -0,0 +1,645 @@ +From 307f7debe0f2143e70659b7a082537077b20d185 Mon Sep 17 00:00:00 2001 +From: Jianfeng Tan +Date: Thu, 19 Jul 2018 11:25:22 +0000 +Subject: [PATCH] net/virtio-user: support raw socket as backend + +We will support tapfd or raw socket fd opened by application and +passed into virtio-user for initialization. + +Note if there are multiple queue pairs, users are still supposed +to pass down the iface name with the first queue pair fd passed +through this parameter. 
+ +Signed-off-by: Jianfeng Tan +--- + drivers/net/virtio/Makefile | 1 + + drivers/net/virtio/virtio_user/vhost_kernel.c | 78 ++++++--- + drivers/net/virtio/virtio_user/vhost_kernel.h | 15 ++ + .../virtio/virtio_user/vhost_kernel_sock.c | 156 ++++++++++++++++++ + .../net/virtio/virtio_user/vhost_kernel_tap.c | 64 ++++++- + .../net/virtio/virtio_user/vhost_kernel_tap.h | 39 ----- + .../net/virtio/virtio_user/virtio_user_dev.c | 16 +- + .../net/virtio/virtio_user/virtio_user_dev.h | 3 +- + drivers/net/virtio/virtio_user_ethdev.c | 20 ++- + 9 files changed, 318 insertions(+), 74 deletions(-) + create mode 100644 drivers/net/virtio/virtio_user/vhost_kernel.h + create mode 100644 drivers/net/virtio/virtio_user/vhost_kernel_sock.c + delete mode 100644 drivers/net/virtio/virtio_user/vhost_kernel_tap.h + +diff --git a/drivers/net/virtio/Makefile b/drivers/net/virtio/Makefile +index 6c2c9967b..2e1fc9b5e 100644 +--- a/drivers/net/virtio/Makefile ++++ b/drivers/net/virtio/Makefile +@@ -41,6 +41,7 @@ ifeq ($(CONFIG_RTE_VIRTIO_USER),y) + SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_user.c + SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_kernel.c + SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_kernel_tap.c ++SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_kernel_sock.c + SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/virtio_user_dev.c + SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user_ethdev.c + endif +diff --git a/drivers/net/virtio/virtio_user/vhost_kernel.c b/drivers/net/virtio/virtio_user/vhost_kernel.c +index 6b19180d7..fa84287f5 100644 +--- a/drivers/net/virtio/virtio_user/vhost_kernel.c ++++ b/drivers/net/virtio/virtio_user/vhost_kernel.c +@@ -6,13 +6,14 @@ + #include + #include + #include ++#include + + #include + #include + + #include "vhost.h" + #include "virtio_user_dev.h" +-#include "vhost_kernel_tap.h" ++#include "vhost_kernel.h" + + struct vhost_memory_kernel { + uint32_t nregions; +@@ -152,27 +153,25 @@ 
prepare_vhost_memory_kernel(void) + (1ULL << VIRTIO_NET_F_HOST_TSO6) | \ + (1ULL << VIRTIO_NET_F_CSUM)) + +-static unsigned int +-tap_support_features(void) ++#define PATH_SYS_CLASS_NET "/sys/class/net" ++ ++static int ++vhost_kernel_is_tap(struct virtio_user_dev *dev) + { +- int tapfd; +- unsigned int tap_features; ++ char path[128]; + +- tapfd = open(PATH_NET_TUN, O_RDWR); +- if (tapfd < 0) { +- PMD_DRV_LOG(ERR, "fail to open %s: %s", +- PATH_NET_TUN, strerror(errno)); +- return -1; +- } ++ if (dev->ifname == NULL) ++ return 0; + +- if (ioctl(tapfd, TUNGETFEATURES, &tap_features) == -1) { +- PMD_DRV_LOG(ERR, "TUNGETFEATURES failed: %s", strerror(errno)); +- close(tapfd); +- return -1; +- } ++ snprintf(path, 128, PATH_SYS_CLASS_NET"/%s", dev->ifname); ++ if(access(path, F_OK) == -1) ++ return 1; + +- close(tapfd); +- return tap_features; ++ snprintf(path, 128, PATH_SYS_CLASS_NET"/%s/tun_flags", dev->ifname); ++ if(access(path, F_OK) != -1) ++ return 1; ++ ++ return 0; + } + + static int +@@ -186,7 +185,6 @@ vhost_kernel_ioctl(struct virtio_user_dev *dev, + struct vhost_memory_kernel *vm = NULL; + int vhostfd; + unsigned int queue_sel; +- unsigned int features; + + PMD_DRV_LOG(INFO, "%s", vhost_msg_strings[req]); + +@@ -240,21 +238,36 @@ vhost_kernel_ioctl(struct virtio_user_dev *dev, + } + + if (!ret && req_kernel == VHOST_GET_FEATURES) { +- features = tap_support_features(); +- /* with tap as the backend, all these features are supported ++ int vnet_hdr, mq; ++ ++ if (vhost_kernel_is_tap(dev)) ++ tap_support_features(&vnet_hdr, &mq); ++ else ++ sock_support_features(dev->be_fd, &vnet_hdr, &mq); ++ ++ /* with kernel vhost, all these features are supported + * but not claimed by vhost-net, so we add them back when + * reporting to upper layer. 
+ */ +- if (features & IFF_VNET_HDR) { ++ if (vnet_hdr) { + *((uint64_t *)arg) |= VHOST_KERNEL_GUEST_OFFLOADS_MASK; + *((uint64_t *)arg) |= VHOST_KERNEL_HOST_OFFLOADS_MASK; + } + +- /* vhost_kernel will not declare this feature, but it does ++ /* kernel vhost will not declare this feature, but it does + * support multi-queue. + */ +- if (features & IFF_MULTI_QUEUE) ++ if (mq) + *(uint64_t *)arg |= (1ull << VIRTIO_NET_F_MQ); ++ ++ /* raw socket only supports vnet header size of 10, so we must ++ * eliminate below features. ++ */ ++ if (!vhost_kernel_is_tap(dev) && ++ vnet_hdr == sizeof(struct virtio_net_hdr)) { ++ *((uint64_t *)arg) &= ~(1ull << VIRTIO_NET_F_MRG_RXBUF); ++ *((uint64_t *)arg) &= ~(1ull << VIRTIO_F_VERSION_1); ++ } + } + + if (vm) +@@ -333,7 +346,8 @@ vhost_kernel_enable_queue_pair(struct virtio_user_dev *dev, + + if (!enable) { + if (dev->tapfds[pair_idx] >= 0) { +- close(dev->tapfds[pair_idx]); ++ if (dev->be_fd < 0) ++ close(dev->tapfds[pair_idx]); + dev->tapfds[pair_idx] = -1; + } + return vhost_kernel_set_backend(vhostfd, -1); +@@ -347,8 +361,18 @@ vhost_kernel_enable_queue_pair(struct virtio_user_dev *dev, + else + hdr_size = sizeof(struct virtio_net_hdr); + +- tapfd = vhost_kernel_open_tap(&dev->ifname, hdr_size, req_mq, +- (char *)dev->mac_addr, dev->features); ++ if (vhost_kernel_is_tap(dev)) { ++ tapfd = vhost_kernel_open_tap(&dev->ifname, hdr_size, ++ req_mq, (char *)dev->mac_addr, dev->features); ++ } else { ++ if (pair_idx == 0 && dev->be_fd >= 0) ++ tapfd = vhost_kernel_set_sock(dev->be_fd, ++ hdr_size, req_mq); ++ else ++ tapfd = vhost_kernel_open_sock(dev->ifname, ++ hdr_size, dev->mac_addr, req_mq); ++ } ++ + if (tapfd < 0) { + PMD_DRV_LOG(ERR, "fail to open tap for vhost kernel"); + return -1; +diff --git a/drivers/net/virtio/virtio_user/vhost_kernel.h b/drivers/net/virtio/virtio_user/vhost_kernel.h +new file mode 100644 +index 000000000..75d6c5bf6 +--- /dev/null ++++ b/drivers/net/virtio/virtio_user/vhost_kernel.h +@@ -0,0 +1,15 @@ 
++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright(c) 2016 Intel Corporation ++ */ ++ ++int vhost_kernel_open_tap(char **p_ifname, int hdr_size, int req_mq, ++ const char *mac, uint64_t features); ++ ++void tap_support_features(int *vnet_hdr, int *mq); ++ ++int vhost_kernel_open_sock(char *ifname, int hdr_size, ++ uint8_t *mac, int req_mq); ++ ++int vhost_kernel_set_sock(int sockfd, int hdr_size, int req_mq); ++ ++void sock_support_features(int fd, int *vnet_hdr, int *mq); +diff --git a/drivers/net/virtio/virtio_user/vhost_kernel_sock.c b/drivers/net/virtio/virtio_user/vhost_kernel_sock.c +new file mode 100644 +index 000000000..7c2ace294 +--- /dev/null ++++ b/drivers/net/virtio/virtio_user/vhost_kernel_sock.c +@@ -0,0 +1,156 @@ ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright(c) 2018 Alibaba Group ++ * Copyright(c) 2018 Ant Financial Services Group ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include "../virtqueue.h" ++#include "../virtio_logs.h" ++#include "vhost_kernel.h" ++ ++#ifndef PACKET_VNET_HDR ++#define PACKET_VNET_HDR 15 ++#endif ++ ++#ifndef PACKET_FANOUT ++#define PACKET_FANOUT 18 ++#endif ++ ++#ifndef PACKET_VNET_HDR_SZ ++#define PACKET_VNET_HDR_SZ 128 ++#endif ++ ++void ++sock_support_features(int fd, int *vnet_hdr, int *mq) ++{ ++ int hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf); ++ int local_fd = 0; ++ ++ if (fd < 0) { ++ fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); ++ if (fd < 0) { ++ *mq = 0; ++ *vnet_hdr = 0; ++ return; ++ } ++ local_fd = 1; ++ } ++ ++ *mq = 1; ++ ++ if (setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR_SZ, ++ (void *)&hdr_size, sizeof(hdr_size))) { ++ *vnet_hdr = sizeof(struct virtio_net_hdr); ++ } else ++ *vnet_hdr = hdr_size; ++ ++ if (local_fd) ++ close(fd); ++} ++ ++int ++vhost_kernel_set_sock(int sockfd, int hdr_size, int req_mq) ++{ ++ int ret; ++ int fanout_type = 
0; /* PACKET_FANOUT_HASH */ ++ ++ if (hdr_size == sizeof(struct virtio_net_hdr)) ++ ret = setsockopt(sockfd, SOL_PACKET, PACKET_VNET_HDR, ++ (void *)&hdr_size, sizeof(hdr_size)); ++ else ++ ret = setsockopt(sockfd, SOL_PACKET, PACKET_VNET_HDR_SZ, ++ (void *)&hdr_size, sizeof(hdr_size)); ++ if (ret) { ++ PMD_DRV_LOG(ERR, "failed to set vnet hdr (%d): %s", ++ hdr_size, strerror(errno)); ++ close(sockfd); ++ return -1; ++ } ++ ++ if (fcntl(sockfd, F_SETFL, fcntl(sockfd, F_GETFL) | O_NONBLOCK)) ++ { ++ PMD_DRV_LOG(ERR, "fcntl O_NONBLOCK failed! %s", ++ strerror(errno)); ++ close(sockfd); ++ return -1; ++ } ++ ++ if (req_mq) { ++ if (setsockopt(sockfd, SOL_PACKET, PACKET_FANOUT, ++ (void *)&fanout_type, sizeof(fanout_type))) { ++ PMD_DRV_LOG(ERR, "PACKET_FANOUT failed! %s", ++ strerror(errno)); ++ close(sockfd); ++ return -1; ++ } ++ } ++ ++ return sockfd; ++} ++ ++int ++vhost_kernel_open_sock(char *ifname, int hdr_size, ++ uint8_t *mac, int req_mq) ++{ ++ int sockfd; ++ struct ifreq ifr; ++ struct sockaddr_ll addr_ll; ++ ++ sockfd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); ++ if (sockfd < 0) { ++ PMD_DRV_LOG(ERR, "socket failed: %s", strerror(errno)); ++ return -1; ++ } ++ ++ memset(&ifr, 0, sizeof(ifr)); ++ strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1); ++ ++ if (ioctl(sockfd, SIOCGIFINDEX, (void*)&ifr)) { ++ PMD_DRV_LOG(ERR, "SIOCGIFINDEX failed: %s", strerror(errno)); ++ close(sockfd); ++ return -1; ++ } ++ ++ memset(&addr_ll, 0, sizeof(addr_ll)); ++ addr_ll.sll_ifindex = ifr.ifr_ifindex; ++ addr_ll.sll_family = AF_PACKET; ++ addr_ll.sll_protocol = htons(ETH_P_ALL); ++ addr_ll.sll_hatype = 0; ++ //addr_ll.sll_pkttype = PACKET_HOST; ++ //addr_ll.sll_halen = ETH_ALEN; ++ if (bind(sockfd, (struct sockaddr*)&addr_ll, sizeof(addr_ll))) { ++ PMD_DRV_LOG(ERR, "bind failed: %s", strerror(errno)); ++ close(sockfd); ++ return -1; ++ } ++ ++ ifr.ifr_flags |= IFF_PROMISC | IFF_UP; ++ ++ if (ioctl(sockfd, SIOCSIFFLAGS, (char*)&ifr)) { ++ PMD_DRV_LOG(ERR, "SIOCSIFFLAGS 
failed: %s", strerror(errno)); ++ close(sockfd); ++ return -1; ++ } ++ ++ ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER; ++ if (ioctl(sockfd, SIOCGIFHWADDR, &ifr) == 0) ++ memcpy(mac, ifr.ifr_hwaddr.sa_data, ETHER_ADDR_LEN); ++ ++ return vhost_kernel_set_sock(sockfd, hdr_size, req_mq); ++} +diff --git a/drivers/net/virtio/virtio_user/vhost_kernel_tap.c b/drivers/net/virtio/virtio_user/vhost_kernel_tap.c +index a3faf1d0c..85dd24dd6 100644 +--- a/drivers/net/virtio/virtio_user/vhost_kernel_tap.c ++++ b/drivers/net/virtio/virtio_user/vhost_kernel_tap.c +@@ -11,13 +11,75 @@ + #include + #include + #include ++#include + + #include + +-#include "vhost_kernel_tap.h" ++#include "vhost_kernel.h" + #include "../virtio_logs.h" + #include "../virtio_pci.h" + ++/* TUN ioctls */ ++#define TUNSETIFF _IOW('T', 202, int) ++#define TUNGETFEATURES _IOR('T', 207, unsigned int) ++#define TUNSETOFFLOAD _IOW('T', 208, unsigned int) ++#define TUNGETIFF _IOR('T', 210, unsigned int) ++#define TUNSETSNDBUF _IOW('T', 212, int) ++#define TUNGETVNETHDRSZ _IOR('T', 215, int) ++#define TUNSETVNETHDRSZ _IOW('T', 216, int) ++#define TUNSETQUEUE _IOW('T', 217, int) ++#define TUNSETVNETLE _IOW('T', 220, int) ++#define TUNSETVNETBE _IOW('T', 222, int) ++ ++/* TUNSETIFF ifr flags */ ++#define IFF_TAP 0x0002 ++#define IFF_NO_PI 0x1000 ++#define IFF_ONE_QUEUE 0x2000 ++#define IFF_VNET_HDR 0x4000 ++#define IFF_MULTI_QUEUE 0x0100 ++#define IFF_ATTACH_QUEUE 0x0200 ++#define IFF_DETACH_QUEUE 0x0400 ++ ++/* Features for GSO (TUNSETOFFLOAD). */ ++#define TUN_F_CSUM 0x01 /* You can hand me unchecksummed packets. */ ++#define TUN_F_TSO4 0x02 /* I can handle TSO for IPv4 packets */ ++#define TUN_F_TSO6 0x04 /* I can handle TSO for IPv6 packets */ ++#define TUN_F_TSO_ECN 0x08 /* I can handle TSO with ECN bits. 
*/ ++#define TUN_F_UFO 0x10 /* I can handle UFO packets */ ++ ++/* Constants */ ++#define PATH_NET_TUN "/dev/net/tun" ++ ++void ++tap_support_features(int *vnet_hdr, int *mq) ++{ ++ int tapfd; ++ unsigned int tap_features; ++ ++ *vnet_hdr = 0; ++ *mq = 0; ++ ++ tapfd = open(PATH_NET_TUN, O_RDWR); ++ if (tapfd < 0) { ++ PMD_DRV_LOG(ERR, "fail to open %s: %s", ++ PATH_NET_TUN, strerror(errno)); ++ return; ++ } ++ ++ if (ioctl(tapfd, TUNGETFEATURES, &tap_features) == -1) { ++ PMD_DRV_LOG(ERR, "TUNGETFEATURES failed: %s", strerror(errno)); ++ close(tapfd); ++ return; ++ } ++ ++ close(tapfd); ++ ++ if (tap_features & IFF_VNET_HDR) ++ *vnet_hdr = 1; ++ if (tap_features & IFF_MULTI_QUEUE) ++ *mq = 1; ++} ++ + static int + vhost_kernel_tap_set_offload(int fd, uint64_t features) + { +diff --git a/drivers/net/virtio/virtio_user/vhost_kernel_tap.h b/drivers/net/virtio/virtio_user/vhost_kernel_tap.h +deleted file mode 100644 +index e0e95b4f5..000000000 +--- a/drivers/net/virtio/virtio_user/vhost_kernel_tap.h ++++ /dev/null +@@ -1,39 +0,0 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright(c) 2016 Intel Corporation +- */ +- +-#include +- +-/* TUN ioctls */ +-#define TUNSETIFF _IOW('T', 202, int) +-#define TUNGETFEATURES _IOR('T', 207, unsigned int) +-#define TUNSETOFFLOAD _IOW('T', 208, unsigned int) +-#define TUNGETIFF _IOR('T', 210, unsigned int) +-#define TUNSETSNDBUF _IOW('T', 212, int) +-#define TUNGETVNETHDRSZ _IOR('T', 215, int) +-#define TUNSETVNETHDRSZ _IOW('T', 216, int) +-#define TUNSETQUEUE _IOW('T', 217, int) +-#define TUNSETVNETLE _IOW('T', 220, int) +-#define TUNSETVNETBE _IOW('T', 222, int) +- +-/* TUNSETIFF ifr flags */ +-#define IFF_TAP 0x0002 +-#define IFF_NO_PI 0x1000 +-#define IFF_ONE_QUEUE 0x2000 +-#define IFF_VNET_HDR 0x4000 +-#define IFF_MULTI_QUEUE 0x0100 +-#define IFF_ATTACH_QUEUE 0x0200 +-#define IFF_DETACH_QUEUE 0x0400 +- +-/* Features for GSO (TUNSETOFFLOAD). */ +-#define TUN_F_CSUM 0x01 /* You can hand me unchecksummed packets. 
*/ +-#define TUN_F_TSO4 0x02 /* I can handle TSO for IPv4 packets */ +-#define TUN_F_TSO6 0x04 /* I can handle TSO for IPv6 packets */ +-#define TUN_F_TSO_ECN 0x08 /* I can handle TSO with ECN bits. */ +-#define TUN_F_UFO 0x10 /* I can handle UFO packets */ +- +-/* Constants */ +-#define PATH_NET_TUN "/dev/net/tun" +- +-int vhost_kernel_open_tap(char **p_ifname, int hdr_size, int req_mq, +- const char *mac, uint64_t features); +diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.c b/drivers/net/virtio/virtio_user/virtio_user_dev.c +index 20816c936..7e655a0d5 100644 +--- a/drivers/net/virtio/virtio_user/virtio_user_dev.c ++++ b/drivers/net/virtio/virtio_user/virtio_user_dev.c +@@ -294,7 +294,7 @@ virtio_user_fill_intr_handle(struct virtio_user_dev *dev) + eth_dev->intr_handle->max_intr = dev->max_queue_pairs + 1; + eth_dev->intr_handle->type = RTE_INTR_HANDLE_VDEV; + /* For virtio vdev, no need to read counter for clean */ +- eth_dev->intr_handle->efd_counter_size = 0; ++ eth_dev->intr_handle->efd_counter_size = 8; + eth_dev->intr_handle->fd = -1; + if (dev->vhostfd >= 0) + eth_dev->intr_handle->fd = dev->vhostfd; +@@ -312,7 +312,9 @@ virtio_user_mem_event_cb(enum rte_mem_event type __rte_unused, + { + struct virtio_user_dev *dev = arg; + struct rte_memseg_list *msl; ++#if 0 + uint16_t i; ++#endif + + /* ignore externally allocated memory */ + msl = rte_mem_virt2memseg_list(addr); +@@ -325,15 +327,19 @@ virtio_user_mem_event_cb(enum rte_mem_event type __rte_unused, + goto exit; + + /* Step 1: pause the active queues */ ++#if 0 + for (i = 0; i < dev->queue_pairs; i++) + dev->ops->enable_qp(dev, i, 0); ++#endif + + /* Step 2: update memory regions */ + dev->ops->send_request(dev, VHOST_USER_SET_MEM_TABLE, NULL); + + /* Step 3: resume the active queues */ ++#if 0 + for (i = 0; i < dev->queue_pairs; i++) + dev->ops->enable_qp(dev, i, 1); ++#endif + + exit: + pthread_mutex_unlock(&dev->mutex); +@@ -412,7 +418,7 @@ virtio_user_dev_setup(struct virtio_user_dev 
*dev) + int + virtio_user_dev_init(struct virtio_user_dev *dev, char *path, int queues, + int cq, int queue_size, const char *mac, char **ifname, +- int mrg_rxbuf, int in_order) ++ int mrg_rxbuf, int in_order, int fd) + { + pthread_mutex_init(&dev->mutex, NULL); + snprintf(dev->path, PATH_MAX, "%s", path); +@@ -435,6 +441,12 @@ virtio_user_dev_init(struct virtio_user_dev *dev, char *path, int queues, + return -1; + } + ++ if (fd >= 0) { ++ dev->be_fd = fd; ++ } else { ++ dev->be_fd = -1; ++ } ++ + if (!dev->is_server) { + if (dev->ops->send_request(dev, VHOST_USER_SET_OWNER, + NULL) < 0) { +diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.h b/drivers/net/virtio/virtio_user/virtio_user_dev.h +index c42ce5d4b..575c21e3b 100644 +--- a/drivers/net/virtio/virtio_user/virtio_user_dev.h ++++ b/drivers/net/virtio/virtio_user/virtio_user_dev.h +@@ -21,6 +21,7 @@ struct virtio_user_dev { + char *ifname; + int *vhostfds; + int *tapfds; ++ int be_fd; + + /* for both vhost_user and vhost_kernel */ + int callfds[VIRTIO_MAX_VIRTQUEUES]; +@@ -50,7 +51,7 @@ int virtio_user_start_device(struct virtio_user_dev *dev); + int virtio_user_stop_device(struct virtio_user_dev *dev); + int virtio_user_dev_init(struct virtio_user_dev *dev, char *path, int queues, + int cq, int queue_size, const char *mac, char **ifname, +- int mrg_rxbuf, int in_order); ++ int mrg_rxbuf, int in_order, int fd); + void virtio_user_dev_uninit(struct virtio_user_dev *dev); + void virtio_user_handle_cq(struct virtio_user_dev *dev, uint16_t queue_idx); + uint8_t virtio_user_handle_mq(struct virtio_user_dev *dev, uint16_t q_pairs); +diff --git a/drivers/net/virtio/virtio_user_ethdev.c b/drivers/net/virtio/virtio_user_ethdev.c +index f8791391a..d5e87b24c 100644 +--- a/drivers/net/virtio/virtio_user_ethdev.c ++++ b/drivers/net/virtio/virtio_user_ethdev.c +@@ -221,8 +221,7 @@ virtio_user_get_features(struct virtio_hw *hw) + { + struct virtio_user_dev *dev = virtio_user_get_dev(hw); + +- /* unmask feature bits 
defined in vhost user protocol */ +- return dev->device_features & VIRTIO_PMD_SUPPORTED_GUEST_FEATURES; ++ return dev->device_features; + } + + static void +@@ -361,6 +360,8 @@ static const char *valid_args[] = { + VIRTIO_USER_ARG_MRG_RXBUF, + #define VIRTIO_USER_ARG_IN_ORDER "in_order" + VIRTIO_USER_ARG_IN_ORDER, ++#define VIRTIO_USER_ARG_FD "fd" ++ VIRTIO_USER_ARG_FD, + NULL + }; + +@@ -464,6 +465,7 @@ virtio_user_pmd_probe(struct rte_vdev_device *dev) + uint64_t server_mode = VIRTIO_USER_DEF_SERVER_MODE; + uint64_t mrg_rxbuf = 1; + uint64_t in_order = 1; ++ uint64_t fd = -1; + char *path = NULL; + char *ifname = NULL; + char *mac_addr = NULL; +@@ -581,6 +583,15 @@ virtio_user_pmd_probe(struct rte_vdev_device *dev) + } + } + ++ if (rte_kvargs_count(kvlist, VIRTIO_USER_ARG_FD) == 1) { ++ if (rte_kvargs_process(kvlist, VIRTIO_USER_ARG_FD, ++ &get_integer_arg, &fd) < 0) { ++ PMD_INIT_LOG(ERR, "error to parse %s", ++ VIRTIO_USER_ARG_FD); ++ goto end; ++ } ++ } ++ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + struct virtio_user_dev *vu_dev; + +@@ -598,7 +609,7 @@ virtio_user_pmd_probe(struct rte_vdev_device *dev) + vu_dev->is_server = false; + if (virtio_user_dev_init(hw->virtio_user_dev, path, queues, cq, + queue_size, mac_addr, &ifname, mrg_rxbuf, +- in_order) < 0) { ++ in_order, fd) < 0) { + PMD_INIT_LOG(ERR, "virtio_user_dev_init fails"); + virtio_user_eth_dev_free(eth_dev); + goto end; +@@ -677,4 +688,5 @@ RTE_PMD_REGISTER_PARAM_STRING(net_virtio_user, + "iface= " + "server=<0|1> " + "mrg_rxbuf=<0|1> " +- "in_order=<0|1>"); ++ "in_order=<0|1>" ++ "fd="); +-- +2.17.1 + diff --git a/dpdk/dpdk-v18.11_patches/0006-mempool-add-dynamic-mempool-support.patch b/dpdk/dpdk-v18.11_patches/0006-mempool-add-dynamic-mempool-support.patch new file mode 100644 index 0000000..bcc9743 --- /dev/null +++ b/dpdk/dpdk-v18.11_patches/0006-mempool-add-dynamic-mempool-support.patch @@ -0,0 +1,247 @@ +From 9d2ddfe6012b37297bc84f6ddcce810232162e5b Mon Sep 17 00:00:00 2001 +From: 
Jianfeng Tan +Date: Wed, 26 Dec 2018 14:39:24 +0000 +Subject: [PATCH 1/2] mempool: add dynamic mempool support + +Signed-off-by: Jianfeng Tan +--- + drivers/mempool/ring/rte_mempool_ring.c | 26 +++++++---- + lib/librte_mempool/rte_mempool.c | 27 +++++++++-- + lib/librte_mempool/rte_mempool.h | 62 ++++++++++++++++++++----- + 3 files changed, 92 insertions(+), 23 deletions(-) + +diff --git a/drivers/mempool/ring/rte_mempool_ring.c b/drivers/mempool/ring/rte_mempool_ring.c +index bc123fc52..e8fec9119 100644 +--- a/drivers/mempool/ring/rte_mempool_ring.c ++++ b/drivers/mempool/ring/rte_mempool_ring.c +@@ -49,30 +49,40 @@ common_ring_get_count(const struct rte_mempool *mp) + static int + common_ring_alloc(struct rte_mempool *mp) + { ++ int n; + int rg_flags = 0, ret; + char rg_name[RTE_RING_NAMESIZE]; + struct rte_ring *r; + +- ret = snprintf(rg_name, sizeof(rg_name), +- RTE_MEMPOOL_MZ_FORMAT, mp->name); +- if (ret < 0 || ret >= (int)sizeof(rg_name)) { +- rte_errno = ENAMETOOLONG; +- return -rte_errno; +- } +- + /* ring flags */ + if (mp->flags & MEMPOOL_F_SP_PUT) + rg_flags |= RING_F_SP_ENQ; + if (mp->flags & MEMPOOL_F_SC_GET) + rg_flags |= RING_F_SC_DEQ; + ++ if (mp->flags & MEMPOOL_F_DYNAMIC) { ++ n = RTE_MIN(mp->size, mp->populated_size + mp->dynamic_size); ++ ++ ret = snprintf(rg_name, sizeof(rg_name), ++ RTE_MEMPOOL_MZ_FORMAT"_%x", mp->name, n); ++ } else { ++ n = mp->size; ++ ret = snprintf(rg_name, sizeof(rg_name), ++ RTE_MEMPOOL_MZ_FORMAT, mp->name); ++ } ++ ++ if (ret < 0 || ret >= (int)sizeof(rg_name)) { ++ rte_errno = ENAMETOOLONG; ++ return -rte_errno; ++ } ++ + /* + * Allocate the ring that will be used to store objects. + * Ring functions will return appropriate errors if we are + * running as a secondary process etc., so no checks made + * in this function for that condition. 
+ */ +- r = rte_ring_create(rg_name, rte_align32pow2(mp->size + 1), ++ r = rte_ring_create(rg_name, rte_align32pow2(n + 1), + mp->socket_id, rg_flags); + if (r == NULL) + return -rte_errno; +diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c +index 683b216f9..70039f6c3 100644 +--- a/lib/librte_mempool/rte_mempool.c ++++ b/lib/librte_mempool/rte_mempool.c +@@ -152,6 +152,8 @@ mempool_add_elem(struct rte_mempool *mp, __rte_unused void *opaque, + hdr->mp = mp; + hdr->iova = iova; + STAILQ_INSERT_TAIL(&mp->elt_list, hdr, next); ++ if (mp->flags & MEMPOOL_F_DYNAMIC && mp->dyn_obj_cb) ++ mp->dyn_obj_cb(mp, NULL, obj, mp->populated_size); + mp->populated_size++; + + #ifdef RTE_LIBRTE_MEMPOOL_DEBUG +@@ -426,9 +428,10 @@ rte_mempool_populate_default(struct rte_mempool *mp) + ssize_t mem_size; + size_t align, pg_sz, pg_shift; + rte_iova_t iova; +- unsigned mz_id, n; ++ unsigned mz_id, n, avail; + int ret; + bool no_contig, try_contig, no_pageshift, external; ++ bool dynamic = (mp->flags & MEMPOOL_F_DYNAMIC) ? 
true : false; + + ret = mempool_ops_alloc_once(mp); + if (ret != 0) +@@ -441,7 +444,7 @@ rte_mempool_populate_default(struct rte_mempool *mp) + external = ret; + + /* mempool must not be populated */ +- if (mp->nb_mem_chunks != 0) ++ if (mp->nb_mem_chunks != 0 && !dynamic) + return -EEXIST; + + no_contig = mp->flags & MEMPOOL_F_NO_IOVA_CONTIG; +@@ -512,7 +515,16 @@ rte_mempool_populate_default(struct rte_mempool *mp) + pg_shift = rte_bsf32(pg_sz); + } + +- for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) { ++ n = mp->size; ++ if (dynamic) { ++ n = RTE_MIN(mp->size - mp->populated_size, mp->dynamic_size); ++ if (mp->nb_mem_chunks != 0 && rte_mempool_ops_alloc(mp) != 0) ++ return -ENOMEM; ++ } ++ ++ avail = 0; ++ mz_id = mp->nb_mem_chunks; ++ for (; n > 0; mz_id++, n -= ret, avail += ret) { + size_t min_chunk_size; + unsigned int flags; + +@@ -607,9 +619,16 @@ rte_mempool_populate_default(struct rte_mempool *mp) + } + } + +- return mp->size; ++ return avail; + + fail: ++ if (dynamic) { ++ if (avail) ++ return avail; ++ ++ return ret; ++ } ++ + rte_mempool_free_memchunks(mp); + return ret; + } +diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h +index 7c9cd9a2f..0886b19f2 100644 +--- a/lib/librte_mempool/rte_mempool.h ++++ b/lib/librte_mempool/rte_mempool.h +@@ -207,6 +207,16 @@ struct rte_mempool_info { + unsigned int contig_block_size; + } __rte_cache_aligned; + ++struct rte_mempool; ++/** ++ * An object callback function for mempool. ++ * ++ * Used by rte_mempool_create() and rte_mempool_obj_iter(). ++ */ ++typedef void (rte_mempool_obj_cb_t)(struct rte_mempool *mp, ++ void *opaque, void *obj, unsigned obj_idx); ++typedef rte_mempool_obj_cb_t rte_mempool_obj_ctor_t; /* compat */ ++ + /** + * The RTE mempool structure. + */ +@@ -247,6 +257,8 @@ struct rte_mempool { + struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */ + + uint32_t populated_size; /**< Number of populated objects. 
*/ ++ uint32_t dynamic_size; /**< Number of dynamic populated objects. */ ++ rte_mempool_obj_cb_t *dyn_obj_cb; /**< elem cb for dynamic populated objects. */ + struct rte_mempool_objhdr_list elt_list; /**< List of objects in pool */ + uint32_t nb_mem_chunks; /**< Number of memory chunks */ + struct rte_mempool_memhdr_list mem_list; /**< List of memory chunks */ +@@ -264,6 +276,8 @@ struct rte_mempool { + #define MEMPOOL_F_POOL_CREATED 0x0010 /**< Internal: pool is created. */ + #define MEMPOOL_F_NO_IOVA_CONTIG 0x0020 /**< Don't need IOVA contiguous objs. */ + #define MEMPOOL_F_NO_PHYS_CONTIG MEMPOOL_F_NO_IOVA_CONTIG /* deprecated */ ++#define MEMPOOL_F_DYNAMIC 0x0040 /**< Don't populate all elements at once */ ++#define MEMPOOL_F_DYNAMIC_NOW 0x0080 /**< Pool is being dynamically populated now */ + + /** + * @internal When debug is enabled, store some statistics. +@@ -839,15 +853,6 @@ int rte_mempool_register_ops(const struct rte_mempool_ops *ops); + rte_mempool_register_ops(&ops); \ + } + +-/** +- * An object callback function for mempool. +- * +- * Used by rte_mempool_create() and rte_mempool_obj_iter(). +- */ +-typedef void (rte_mempool_obj_cb_t)(struct rte_mempool *mp, +- void *opaque, void *obj, unsigned obj_idx); +-typedef rte_mempool_obj_cb_t rte_mempool_obj_ctor_t; /* compat */ +- + /** + * A memory callback function for mempool. 
+ * +@@ -989,6 +994,22 @@ struct rte_mempool * + rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size, + unsigned cache_size, unsigned private_data_size, + int socket_id, unsigned flags); ++ ++static inline void ++rte_mempool_set_dynamic_size(struct rte_mempool *mp, int dynamic_size) ++{ ++ mp->flags |= MEMPOOL_F_DYNAMIC; ++ mp->dynamic_size = dynamic_size; ++} ++ ++static inline void ++rte_mempool_set_dynamic_cb(struct rte_mempool *mp, ++ rte_mempool_obj_cb_t *dyn_obj_cb) ++{ ++ mp->flags |= MEMPOOL_F_DYNAMIC; ++ mp->dyn_obj_cb = dyn_obj_cb; ++} ++ + /** + * Free a mempool + * +@@ -1390,9 +1411,28 @@ __mempool_generic_get(struct rte_mempool *mp, void **obj_table, + /* get remaining objects from ring */ + ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, n); + +- if (ret < 0) ++ if (ret < 0) { ++ if (mp->flags & MEMPOOL_F_DYNAMIC && ++ mp->populated_size < mp->size) { ++ int work; ++ ++ work = rte_atomic32_cmpset(&mp->flags, ++ mp->flags & ~MEMPOOL_F_DYNAMIC_NOW, ++ mp->flags | MEMPOOL_F_DYNAMIC_NOW); ++ if (work) { ++ int more; ++ ++ more = rte_mempool_populate_default(mp); ++ mp->flags &= ~MEMPOOL_F_DYNAMIC_NOW; ++ if (more > 0) ++ goto ring_dequeue; ++ } else { ++ /* mempool is populating, try again */ ++ goto ring_dequeue; ++ } ++ } + __MEMPOOL_STAT_ADD(mp, get_fail, n); +- else ++ } else + __MEMPOOL_STAT_ADD(mp, get_success, n); + + return ret; +-- +2.17.1 + diff --git a/dpdk/dpdk-v18.11_patches/0007-mbuf-add-dynamic-mbuf-mempool-support.patch b/dpdk/dpdk-v18.11_patches/0007-mbuf-add-dynamic-mbuf-mempool-support.patch new file mode 100644 index 0000000..8618928 --- /dev/null +++ b/dpdk/dpdk-v18.11_patches/0007-mbuf-add-dynamic-mbuf-mempool-support.patch @@ -0,0 +1,305 @@ +From c2a2b8eec349156b31f2faab61cc6063ef3f0c61 Mon Sep 17 00:00:00 2001 +From: Jianfeng Tan +Date: Wed, 26 Dec 2018 14:40:07 +0000 +Subject: [PATCH 2/2] mbuf: add dynamic mbuf mempool support + +Signed-off-by: Jianfeng Tan +--- + examples/Makefile | 1 + + 
examples/dynamic_mbuf_pool/Makefile | 56 ++++++++++++++++ + examples/dynamic_mbuf_pool/main.c | 92 ++++++++++++++++++++++++++ + examples/dynamic_mbuf_pool/meson.build | 11 +++ + lib/librte_mbuf/rte_mbuf.c | 51 ++++++++++++++ + lib/librte_mbuf/rte_mbuf.h | 5 ++ + lib/librte_mbuf/rte_mbuf_version.map | 8 ++- + 7 files changed, 223 insertions(+), 1 deletion(-) + create mode 100644 examples/dynamic_mbuf_pool/Makefile + create mode 100644 examples/dynamic_mbuf_pool/main.c + create mode 100644 examples/dynamic_mbuf_pool/meson.build + +diff --git a/examples/Makefile b/examples/Makefile +index 33fe0e586..3df9cb7ad 100644 +--- a/examples/Makefile ++++ b/examples/Makefile +@@ -21,6 +21,7 @@ DIRS-$(CONFIG_RTE_LIBRTE_CRYPTODEV) += fips_validation + DIRS-$(CONFIG_RTE_LIBRTE_FLOW_CLASSIFY) += flow_classify + DIRS-y += flow_filtering + DIRS-y += helloworld ++DIRS-y += dynamic_mbuf_pool + DIRS-$(CONFIG_RTE_LIBRTE_PIPELINE) += ip_pipeline + ifeq ($(CONFIG_RTE_LIBRTE_LPM),y) + DIRS-$(CONFIG_RTE_IP_FRAG) += ip_reassembly +diff --git a/examples/dynamic_mbuf_pool/Makefile b/examples/dynamic_mbuf_pool/Makefile +new file mode 100644 +index 000000000..f2761f661 +--- /dev/null ++++ b/examples/dynamic_mbuf_pool/Makefile +@@ -0,0 +1,56 @@ ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright(c) 2010-2014 Intel Corporation ++ ++# binary name ++APP = dynamic_mbuf_pool ++ ++# all source are stored in SRCS-y ++SRCS-y := main.c ++ ++# Build using pkg-config variables if possible ++$(shell pkg-config --exists libdpdk) ++ifeq ($(.SHELLSTATUS),0) ++ ++all: shared ++.PHONY: shared static ++shared: build/$(APP)-shared ++ ln -sf $(APP)-shared build/$(APP) ++static: build/$(APP)-static ++ ln -sf $(APP)-static build/$(APP) ++ ++PC_FILE := $(shell pkg-config --path libdpdk) ++CFLAGS += -O3 $(shell pkg-config --cflags libdpdk) ++LDFLAGS_SHARED = $(shell pkg-config --libs libdpdk) ++LDFLAGS_STATIC = -Wl,-Bstatic $(shell pkg-config --static --libs libdpdk) ++ ++build/$(APP)-shared: $(SRCS-y) Makefile 
$(PC_FILE) | build
++	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)
++
++build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build
++	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)
++
++build:
++	@mkdir -p $@
++
++.PHONY: clean
++clean:
++	rm -f build/$(APP) build/$(APP)-static build/$(APP)-shared
++	rmdir --ignore-fail-on-non-empty build
++
++else
++
++ifeq ($(RTE_SDK),)
++$(error "Please define RTE_SDK environment variable")
++endif
++
++# Default target, can be overridden by command line or environment
++RTE_TARGET ?= x86_64-native-linuxapp-gcc
++
++include $(RTE_SDK)/mk/rte.vars.mk
++
++CFLAGS += -O3
++CFLAGS += $(WERROR_FLAGS)
++
++include $(RTE_SDK)/mk/rte.extapp.mk
++
++endif
+diff --git a/examples/dynamic_mbuf_pool/main.c b/examples/dynamic_mbuf_pool/main.c
+new file mode 100644
+index 000000000..a568d7cec
+--- /dev/null
++++ b/examples/dynamic_mbuf_pool/main.c
+@@ -0,0 +1,104 @@
++/* SPDX-License-Identifier: BSD-3-Clause
++ * Copyright(c) 2010-2014 Intel Corporation
++ */
++
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++
++#define HUGE_2M "/sys/kernel/mm/hugepages/hugepages-2048kB/free_hugepages"
++#define HUGE_1G "/sys/kernel/mm/hugepages/hugepages-1048576kB/free_hugepages"
++
++/*
++ * Read the decimal number stored in the file at path.
++ * Returns the parsed value, or -1 when the file cannot be opened or read.
++ */
++static long int
++get_value(const char *path)
++{
++	int fd, len;
++	long int value;
++	char buf[1024];
++
++	fd = open(path, O_RDONLY);
++	if (fd < 0)
++		return -1;
++
++	/* keep one byte free for the terminator strtol() relies on */
++	len = read(fd, buf, sizeof(buf) - 1);
++
++	close(fd);
++
++	if (len <= 0) {
++		return -1;
++	}
++	buf[len] = '\0';
++
++	value = strtol(buf, NULL, 10);
++	return value;
++}
++
++/* Print the values read from HUGE_2M and HUGE_1G on one line. */
++static void
++print_free_hugepages(void)
++{
++	printf("2M: %ld\t\t1G: %ld\n", get_value(HUGE_2M), get_value(HUGE_1G));
++}
++
++/*
++ * Create a dynamic mbuf pool, allocate mbufs from it until allocation fails
++ * or n have been taken, and print the free hugepage counters (followed by a
++ * 100 ms pause) once per dynamic_size allocations.
++ */
++int
++main(int argc, char **argv)
++{
++	int i;
++	int ret;
++	int n = 512 * 1024;
++	int dynamic_size = 8 * 1024;
++	struct rte_mbuf *m;
++	struct rte_mempool *mp;
++
++	ret = rte_eal_init(argc, argv);
++	if (ret < 0)
++		rte_panic("Cannot init EAL\n");
++
++	mp = rte_pktmbuf_dynamic_pool_create("mbuf_pool", n,
++					     64, 0, RTE_MBUF_DEFAULT_BUF_SIZE,
++					     0, dynamic_size);
++	if (mp == NULL)
++		rte_panic("Failed to create mbuf mempool");
++
++	for (i = 0; i < n; i++) {
++		m = rte_pktmbuf_alloc(mp);
++		if (m == NULL)
++			break;
++
++		if ((i % dynamic_size) == 1) {
++			print_free_hugepages();
++			usleep(100 * 1000);
++		}
++	}
++
++	printf("have allocated %d mbufs", i);
++	rte_memzone_dump(stdout);
++
++	return 0;
++}
+diff --git a/examples/dynamic_mbuf_pool/meson.build b/examples/dynamic_mbuf_pool/meson.build
+new file mode 100644
+index 000000000..c34e11e36
+--- /dev/null
++++ b/examples/dynamic_mbuf_pool/meson.build
+@@ -0,0 +1,11 @@
++# SPDX-License-Identifier: BSD-3-Clause
++# Copyright(c) 2017 Intel Corporation
++
++# meson file, for building this example as part of a main DPDK build.
++#
++# To build this example as a standalone application with an already-installed
++# DPDK instance, use 'make'
++
++sources = files(
++	'main.c'
++)
+diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c
+index 9790b4fb1..b70abd88c 100644
+--- a/lib/librte_mbuf/rte_mbuf.c
++++ b/lib/librte_mbuf/rte_mbuf.c
+@@ -167,6 +167,57 @@ rte_pktmbuf_pool_create(const char *name, unsigned int n,
+ 		data_room_size, socket_id, NULL);
+ }
+ 
++struct rte_mempool *
++rte_pktmbuf_dynamic_pool_create(const char *name, unsigned int n,
++	unsigned int cache_size, uint16_t priv_size,
++	uint16_t data_room_size, int socket_id, int dynamic_size)
++{
++	struct rte_mempool *mp;
++	struct rte_pktmbuf_pool_private mbp_priv;
++	const char *mp_ops_name;
++	unsigned elt_size;
++	int ret;
++
++	if (RTE_ALIGN(priv_size, RTE_MBUF_PRIV_ALIGN) != priv_size) {
++		RTE_LOG(ERR, MBUF, "mbuf priv_size=%u is not aligned\n",
++			priv_size);
++		rte_errno = EINVAL;
++		return NULL;
++	}
++	elt_size = sizeof(struct rte_mbuf) + (unsigned)priv_size +
++
(unsigned)data_room_size; ++ mbp_priv.mbuf_data_room_size = data_room_size; ++ mbp_priv.mbuf_priv_size = priv_size; ++ ++ mp = rte_mempool_create_empty(name, n, elt_size, cache_size, ++ sizeof(struct rte_pktmbuf_pool_private), ++ socket_id, MEMPOOL_F_DYNAMIC); ++ if (mp == NULL) ++ return NULL; ++ ++ mp_ops_name = rte_mbuf_best_mempool_ops(); ++ ret = rte_mempool_set_ops_byname(mp, mp_ops_name, NULL); ++ if (ret != 0) { ++ RTE_LOG(ERR, MBUF, "error setting mempool handler\n"); ++ rte_mempool_free(mp); ++ rte_errno = -ret; ++ return NULL; ++ } ++ rte_pktmbuf_pool_init(mp, &mbp_priv); ++ ++ rte_mempool_set_dynamic_size(mp, dynamic_size); ++ rte_mempool_set_dynamic_cb(mp, rte_pktmbuf_init); ++ ++ ret = rte_mempool_populate_default(mp); ++ if (ret < 0) { ++ rte_mempool_free(mp); ++ rte_errno = -ret; ++ return NULL; ++ } ++ ++ return mp; ++} ++ + /* do some sanity checks on a mbuf: panic if it fails */ + void + rte_mbuf_sanity_check(const struct rte_mbuf *m, int is_header) +diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h +index 3dbc6695e..5a2d81605 100644 +--- a/lib/librte_mbuf/rte_mbuf.h ++++ b/lib/librte_mbuf/rte_mbuf.h +@@ -1183,6 +1183,11 @@ rte_pktmbuf_pool_create(const char *name, unsigned n, + unsigned cache_size, uint16_t priv_size, uint16_t data_room_size, + int socket_id); + ++struct rte_mempool * ++rte_pktmbuf_dynamic_pool_create(const char *name, unsigned int n, ++ unsigned int cache_size, uint16_t priv_size, ++ uint16_t data_room_size, int socket_id, int dynamic_size); ++ + /** + * Create a mbuf pool with a given mempool ops name + * +diff --git a/lib/librte_mbuf/rte_mbuf_version.map b/lib/librte_mbuf/rte_mbuf_version.map +index cae68db8d..d6d25af95 100644 +--- a/lib/librte_mbuf/rte_mbuf_version.map ++++ b/lib/librte_mbuf/rte_mbuf_version.map +@@ -44,4 +44,10 @@ DPDK_18.08 { + rte_mbuf_set_user_mempool_ops; + rte_mbuf_user_mempool_ops; + rte_pktmbuf_pool_create_by_ops; +-} DPDK_16.11; ++} DPDK_18.11; ++ ++DPDK_18.11 { ++ global: ++ ++ 
rte_pktmbuf_dynamic_pool_create; ++} DPDK_18.12; +-- +2.17.1 + diff --git a/dpdk/dpdk-v18.11_patches/0008-mempool-prioritize-constructor.patch b/dpdk/dpdk-v18.11_patches/0008-mempool-prioritize-constructor.patch new file mode 100644 index 0000000..c941443 --- /dev/null +++ b/dpdk/dpdk-v18.11_patches/0008-mempool-prioritize-constructor.patch @@ -0,0 +1,30 @@ +From cd36895a4a7bfc342915b42e3856bd233452f0bd Mon Sep 17 00:00:00 2001 +From: Jianfeng Tan +Date: Fri, 13 Jul 2018 15:25:22 +0800 +Subject: [PATCH 1/9] mempool: prioritize constructor + +--- + lib/librte_mempool/rte_mempool.h | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h +index 7c9cd9a2f..bdc32d583 100644 +--- a/lib/librte_mempool/rte_mempool.h ++++ b/lib/librte_mempool/rte_mempool.h +@@ -833,10 +833,10 @@ int rte_mempool_register_ops(const struct rte_mempool_ops *ops); + * more than RTE_MEMPOOL_MAX_OPS_IDX is registered. + */ + #define MEMPOOL_REGISTER_OPS(ops) \ +- void mp_hdlr_init_##ops(void); \ +- void __attribute__((constructor, used)) mp_hdlr_init_##ops(void)\ ++ static void __attribute__((constructor(101), used)) \ ++ mp_hdlr_init_##ops(void) \ + { \ +- rte_mempool_register_ops(&ops); \ ++ rte_mempool_register_ops(&ops); \ + } + + /** +-- +2.17.1 + diff --git a/dpdk/dpdk-v18.11_patches/0009-net-virtio-fill-desc-limit.patch b/dpdk/dpdk-v18.11_patches/0009-net-virtio-fill-desc-limit.patch new file mode 100644 index 0000000..146ea88 --- /dev/null +++ b/dpdk/dpdk-v18.11_patches/0009-net-virtio-fill-desc-limit.patch @@ -0,0 +1,42 @@ +commit 470acd1b108f20ae12b1216c9f6157b78655bcc7 +Author: Jianfeng Tan +Date: Wed Dec 12 02:14:03 2018 +0000 + + net/virtio: fill desc limit + + We shall fill desc limit accordingly, or APIs, such as + rte_eth_dev_adjust_nb_rx_tx_desc, will not give correct desc + information. 
+ + Signed-off-by: Jianfeng Tan + +diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c +index dbfa6865c..d369d5ce8 100644 +--- a/drivers/net/virtio/virtio_ethdev.c ++++ b/drivers/net/virtio/virtio_ethdev.c +@@ -2172,6 +2172,7 @@ virtio_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info) + { + uint64_t tso_mask, host_features; + struct virtio_hw *hw = dev->data->dev_private; ++ struct virtqueue *vq; + + dev_info->speed_capa = ETH_LINK_SPEED_10G; /* fake value */ + +@@ -2209,6 +2210,17 @@ virtio_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info) + (1ULL << VIRTIO_NET_F_HOST_TSO6); + if ((host_features & tso_mask) == tso_mask) + dev_info->tx_offload_capa |= DEV_TX_OFFLOAD_TCP_TSO; ++ ++ ++ if (hw->vqs) { ++ vq = hw->vqs[VTNET_SQ_RQ_QUEUE_IDX]; ++ dev_info->rx_desc_lim.nb_max = vq->vq_nentries; ++ dev_info->rx_desc_lim.nb_min = 256; ++ ++ vq = hw->vqs[VTNET_SQ_TQ_QUEUE_IDX]; ++ dev_info->tx_desc_lim.nb_max = vq->vq_nentries; ++ dev_info->tx_desc_lim.nb_min = 256; ++ } + } + + /* diff --git a/examples/Makefile b/examples/Makefile index cf13574..9ef8d85 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -21,6 +21,6 @@ endif include $(RTE_SDK)/mk/rte.vars.mk -DIRS-y += l4fwd +#DIRS-y += l4fwd include $(TLDK_ROOT)/mk/tle.subdir.mk diff --git a/examples/l4fwd/main.c b/examples/l4fwd/main.c index 9396403..2e16479 100644 --- a/examples/l4fwd/main.c +++ b/examples/l4fwd/main.c @@ -68,7 +68,6 @@ static char proto_name[3][10] = {"udp", "tcp", ""}; static const struct rte_eth_conf port_conf_default = { .rxmode = { - .offloads = DEV_RX_OFFLOAD_VLAN_STRIP, }, }; diff --git a/examples/l4fwd/port.h b/examples/l4fwd/port.h index a154844..67ca19a 100644 --- a/examples/l4fwd/port.h +++ b/examples/l4fwd/port.h @@ -177,21 +177,10 @@ port_init(struct netbe_port *uprt, uint32_t proto) } port_conf = port_conf_default; - if ((uprt->rx_offload & RX_CSUM_OFFLOAD) != 0) { - RTE_LOG(ERR, USER1, "%s(%u): enabling RX 
csum offload;\n", - __func__, uprt->id); - port_conf.rxmode.offloads |= uprt->rx_offload & RX_CSUM_OFFLOAD; - } - port_conf.rxmode.max_rx_pkt_len = uprt->mtu + ETHER_CRC_LEN; - if (port_conf.rxmode.max_rx_pkt_len > ETHER_MAX_LEN) - port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_JUMBO_FRAME; - rc = update_rss_conf(uprt, &dev_info, &port_conf, proto); if (rc != 0) return rc; - port_conf.txmode.offloads = uprt->tx_offload; - rc = rte_eth_dev_configure(uprt->id, uprt->nb_lcore, uprt->nb_lcore, &port_conf); RTE_LOG(NOTICE, USER1, diff --git a/lib/Makefile b/lib/Makefile index 6317af9..9bbe159 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -25,5 +25,6 @@ DIRS-y += libtle_misc DIRS-y += libtle_dring DIRS-y += libtle_timer DIRS-y += libtle_l4p +DIRS-y += libtle_glue include $(TLDK_ROOT)/mk/tle.subdir.mk diff --git a/lib/libtle_glue/Makefile b/lib/libtle_glue/Makefile new file mode 100644 index 0000000..13ceb82 --- /dev/null +++ b/lib/libtle_glue/Makefile @@ -0,0 +1,62 @@ +# Copyright (c) 2018 Ant Financial Services Group. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +ifeq ($(RTE_SDK),) +$(error "Please define RTE_SDK environment variable") +endif + +# Default target, can be overwritten by command line or environment +RTE_TARGET ?= x86_64-native-linuxapp-gcc + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = libtle_glue.a + +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) + +EXPORT_MAP := tle_glue_version.map + +LIBABIVER := 1 + +# source files +SRCS-y += fd.c +SRCS-y += ctx.c +SRCS-y += arp.c +SRCS-y += icmp.c +SRCS-y += rxcb.c +SRCS-y += port.c +SRCS-y += sym.c +SRCS-y += init.c +SRCS-y += be.c +SRCS-y += epoll.c +SRCS-y += socket.c +SRCS-y += rxtx.c +SRCS-y += poll.c +SRCS-y += util.c +SRCS-y += tcp.c +SRCS-y += udp.c +SRCS-y += select.c + +ifeq ($(PACKETDRILL),y) +SRCS-y += packetdrill.c +endif + +# install this header file +SYMLINK-y-include += tle_glue.h + +# this lib dependencies +DEPDIRS-y += lib/libtle_l4p + +include $(TLDK_ROOT)/mk/tle.lib.mk diff --git a/lib/libtle_glue/arp.c b/lib/libtle_glue/arp.c new file mode 100644 index 0000000..9b13d9e --- /dev/null +++ b/lib/libtle_glue/arp.c @@ -0,0 +1,935 @@ +/* + * Copyright (c) 2019 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include "log.h"
+#include "ctx.h"
+#include "internal.h"
+#include "tle_timer.h"
+#include "util.h"
+#include "ndp.h"
+#include "gateway.h"
+
+/* Number of leading bytes of ipv6_multi_mask copied by set_multicast_ipv6(). */
+#define IPV6_MULTI_MASK_LEN 13
+
+const struct in6_addr ipv6_all_multi = {{{
+	0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01
+}}};
+
+const struct in6_addr ipv6_multi_mask = {{{
+	0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+}}};
+
+/*
+ * Fill addr with the Ethernet address 33:33:xx:xx:xx:xx, where the last four
+ * bytes are copied from the last four bytes of ip6_addr.
+ */
+static inline void
+set_multicast_mac_v6(struct ether_addr *addr, const struct in6_addr *ip6_addr)
+{
+	unaligned_uint16_t *ea_words = (unaligned_uint16_t *)addr;
+
+	ea_words[0] = 0x3333;
+	ea_words[1] = ip6_addr->__in6_u.__u6_addr16[6];
+	ea_words[2] = ip6_addr->__in6_u.__u6_addr16[7];
+}
+
+/*
+ * Overwrite the first IPV6_MULTI_MASK_LEN bytes of ipv6 with the matching
+ * bytes of ipv6_multi_mask; the remaining bytes are left as they are.
+ */
+static inline void
+set_multicast_ipv6(uint8_t ipv6[16])
+{
+	rte_memcpy(ipv6, &ipv6_multi_mask, IPV6_MULTI_MASK_LEN);
+}
+
+/* Fill addr with the Ethernet broadcast address ff:ff:ff:ff:ff:ff. */
+static inline void
+set_broadcast_addr(struct ether_addr *addr)
+{
+	unaligned_uint16_t *ea_words = (unaligned_uint16_t *)addr;
+
+	ea_words[0] = 0xFFFF;
+	ea_words[1] = 0xFFFF;
+	ea_words[2] = 0xFFFF;
+}
+
+/*
+ * Return true when pkt carries an IPv4 header at offset pkt->l2_len and the
+ * gateway returned by ipv4_gateway_lookup() for its destination equals addr.
+ */
+static inline bool
+match_addr(struct glue_ctx *ctx, struct rte_mbuf *pkt, const struct in_addr *addr)
+{
+	struct ipv4_hdr *ip4h;
+	const struct in_addr *gw;
+
+	ip4h = rte_pktmbuf_mtod_offset(pkt, struct ipv4_hdr *, pkt->l2_len);
+	if ((ip4h->version_ihl >> 4) != 4)
+		return false;
+
+	gw = ipv4_gateway_lookup(ctx, (struct in_addr *)&ip4h->dst_addr);
+	if (gw->s_addr != addr->s_addr)
+		return false;
+
+	return true;
+}
+
+/*
+ * Return true when pkt carries an IPv6 header at offset pkt->l2_len and the
+ * gateway returned by ipv6_gateway_lookup() for its destination equals addr.
+ */
+static inline bool
+match_addr6(struct glue_ctx *ctx, struct rte_mbuf *pkt,
+	    const struct in6_addr *addr)
+{
+	struct ipv6_hdr *ip6h;
+	const struct in6_addr *gw;
+
+	ip6h = rte_pktmbuf_mtod_offset(pkt, struct ipv6_hdr *, pkt->l2_len);
+	/*
+	 * vtc_flow is stored in network byte order and the version is its
+	 * top nibble.  The previous mask-and-shift could never yield 6, so
+	 * every packet was rejected here.
+	 */
+	if ((rte_be_to_cpu_32(ip6h->vtc_flow) >> 28) != 6)
+		return false;
+
+	gw = ipv6_gateway_lookup(ctx, (struct in6_addr *)&ip6h->dst_addr);
+	if (memcmp(gw, addr, sizeof(struct in6_addr)) != 0)
+		return false;
+
+	return true;
+}
+
+/*
+ * Transmit nb packets on ctx->port_id/ctx->queue_id and free the ones that
+ * rte_eth_tx_burst() did not accept.  prefix is only used in the trace line.
+ */
+static inline void
+send_pkts(struct glue_ctx *ctx, struct rte_mbuf **pkts, uint16_t nb,
+	  const char *prefix)
+{
+	uint16_t i, sent;
+
+	sent = rte_eth_tx_burst(ctx->port_id, ctx->queue_id, pkts, nb);
+	for (i = sent; i < nb; i++)
+		rte_pktmbuf_free(pkts[i]);
+
+	RTE_SET_USED(prefix);
+	TRACE("%s, send %u/%u pkts", prefix, sent, nb);
+}
+
+/*
+ * Send the packets parked on ctx->arp_wait whose gateway, as checked by
+ * match_addr()/match_addr6() for family af, is addr.  Each such packet is
+ * unlinked from the list, gets e_addr written as its destination MAC and is
+ * transmitted in bursts of at most MAX_PKTS_BURST.  Other packets stay queued.
+ */
+static void
+flush_arp_wait(int af, struct glue_ctx *ctx, const void *addr,
+	       struct ether_addr *e_addr)
+{
+	struct rte_mbuf *pkt, *next, *pre, *pkts[MAX_PKTS_BURST];
+	struct ether_hdr *eth;
+	uint32_t nb_pkts;
+
+	pre = NULL;
+	nb_pkts = 0;
+	for (pkt = ctx->arp_wait; pkt != NULL; pkt = next) {
+		/*
+		 * Fetch the link first: once pkt has gone through
+		 * send_pkts() it is transmitted or freed and must not be
+		 * read again.
+		 */
+		next = pkt->next_pkt;
+
+		if ((af == AF_INET &&
+		     !match_addr(ctx, pkt, (const struct in_addr *)addr)) ||
+		    (af == AF_INET6 &&
+		     !match_addr6(ctx, pkt, (const struct in6_addr *)addr))) {
+			pre = pkt;
+			continue;
+		}
+
+		/* unlink pkt from the wait list */
+		if (pre == NULL)
+			ctx->arp_wait = next;
+		else
+			pre->next_pkt = next;
+		eth = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+		ether_addr_copy(e_addr, &eth->d_addr);
+		pkts[nb_pkts++] = pkt;
+		if (nb_pkts == MAX_PKTS_BURST) {
+			send_pkts(ctx, pkts, nb_pkts, "ARP learned");
+			nb_pkts = 0;
+		}
+	}
+	if (nb_pkts)
+		send_pkts(ctx, pkts, nb_pkts, "ARP learned");
+}
+
+/*
+ * Initialise dst as the Ethernet + IPv4 header template for traffic to addr.
+ * The destination MAC is e_addr, or the broadcast address when e_addr is NULL.
+ */
+static inline void
+ipv4_dst_set(struct glue_ctx *ctx, struct tle_dest *dst,
+	     const struct in_addr *addr, struct ether_addr *e_addr)
+{
+	struct ether_hdr *eth;
+	struct ipv4_hdr *ip4h;
+
+	if (is_ipv4_loopback_addr(addr->s_addr, ctx))
+		dst->mtu = MTU_LOOPBACK;
+	else
+		dst->mtu = MTU_NORMAL;
+	dst->l2_len = sizeof(*eth);
+	dst->head_mp = get_mempool_by_socket(0); /* fix me */
+
+	eth = (struct ether_hdr *)dst->hdr;
+	ether_addr_copy(&ctx->mac, &eth->s_addr);
+	if (e_addr == NULL)
+		set_broadcast_addr(&eth->d_addr);
+	else
+		ether_addr_copy(e_addr, &eth->d_addr);
+	eth->ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4);
+
+	dst->l3_len = sizeof(*ip4h);
+	ip4h = (struct ipv4_hdr *)(eth + 1);
+	ip4h->dst_addr = addr->s_addr;
+	ip4h->version_ihl = 4 << 4 | sizeof(*ip4h) / IPV4_IHL_MULTIPLIER;
+	ip4h->time_to_live = 64;
+	ip4h->next_proto_id = IPPROTO_TCP;
+}
+
+/*
+ * Initialise dst as the Ethernet + IPv6 header template for traffic to addr.
+ * The destination MAC is e_addr, or the broadcast address when e_addr is NULL.
+ */
+static inline void
+ipv6_dst_set(struct glue_ctx *ctx, struct tle_dest *dst,
+	     const struct in6_addr *addr, struct ether_addr *e_addr)
+{
+	struct ether_hdr *eth;
+	struct ipv6_hdr *ip6h;
+
+	if (is_ipv6_loopback_addr(addr, ctx))
+		dst->mtu = MTU_LOOPBACK;
+	else
+		dst->mtu = MTU_NORMAL;
+	dst->l2_len = sizeof(*eth);
+	dst->head_mp = get_mempool_by_socket(0); /* fix me */
+
+	eth = (struct ether_hdr *)dst->hdr;
+	ether_addr_copy(&ctx->mac, &eth->s_addr);
+	if (e_addr == NULL)
+		set_broadcast_addr(&eth->d_addr);
+	else
+		ether_addr_copy(e_addr, &eth->d_addr);
+	eth->ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv6);
+
+	dst->l3_len = sizeof(*ip6h);
+	ip6h = (struct ipv6_hdr *)(eth + 1);
+	rte_memcpy(ip6h->dst_addr, addr, sizeof(struct in6_addr));
+	/* version 6 in the top nibble; same bytes as "6 << 4" on little-endian */
+	ip6h->vtc_flow = rte_cpu_to_be_32(6u << 28);
+	ip6h->hop_limits = 255;
+	ip6h->proto = IPPROTO_TCP;
+}
+
+/* Start a timer of the given interval for entry on the context's ARP wheel. */
+#define arp_timer(ctx, entry, interval) \
+	tle_timer_start(ctx->arp_tmw, entry, interval)
+
+/*
+ * Record the MAC address e_addr for the IPv4 address addr.
+ *
+ * When addr is already a key of ctx->arp_hash its entry is refreshed: the MAC
+ * is overwritten, the timer is restarted with ARP_ENTRY_EXPIRE and, if the
+ * entry still held the broadcast MAC, the packets waiting for it are flushed.
+ * Otherwise a new entry is written at index ctx->arp4_num; e_addr may be NULL
+ * on this path, in which case the entry gets the broadcast MAC and the
+ * ARP_REQUEST_EXPIRE timer.
+ *
+ * NOTE(review): the refresh path passes e_addr to ether_addr_copy() without a
+ * NULL check, and the append path does not compare arp4_num with the size of
+ * ctx->arp4 - confirm callers guarantee both.
+ */
+void
+ipv4_dst_add(struct glue_ctx *ctx, const struct in_addr *addr,
+	     struct ether_addr *e_addr)
+{
+	struct arp_entry *entry;
+	struct tle_dest *dst;
+	struct ether_hdr *eth;
+	uint64_t idx;
+	bool check_wait;
+	int rc;
+
+	rc = rte_hash_lookup_data(ctx->arp_hash, addr, (void**)&idx);
+	if (rc >= 0) {
+		entry = &ctx->arp4[idx];
+		dst = &entry->dst;
+		eth = (struct ether_hdr *)dst->hdr;
+		check_wait = is_broadcast_ether_addr(&eth->d_addr);
+
+		/* update arp entry, reset timer */
+		ether_addr_copy(e_addr, &eth->d_addr);
+		print_arp(AF_INET, addr, &eth->d_addr, "UPDATE");
+		if(entry->timer != NULL)
+			tle_timer_stop(ctx->arp_tmw, entry->timer);
+		entry->timer = arp_timer(ctx, entry, ARP_ENTRY_EXPIRE);
+		entry->inuse = 0;
+		entry->req_time = 0;
+
+		if(check_wait)
+			flush_arp_wait(AF_INET, ctx, addr, e_addr);
+
+		return;
+	}
+
+	idx = ctx->arp4_num;
+	entry = &ctx->arp4[idx];
+	dst = &entry->dst;
+
+	ipv4_dst_set(ctx, dst, addr, e_addr);
+	if (e_addr == NULL) {
+		entry->timer = arp_timer(ctx, entry, ARP_REQUEST_EXPIRE);
+		entry->req_time = 1;
+	} else {
+		entry->timer = arp_timer(ctx, entry, ARP_ENTRY_EXPIRE);
+		entry->inuse = 0;
+	}
+
+	rc = rte_hash_add_key_data(ctx->arp_hash, addr, (void *)idx);
+	if (rc < 0)
+		rte_panic("Failed to add ARP entry");
+
+	ctx->arp4_num++;
+	eth = (struct ether_hdr *)dst->hdr;
+	print_arp(AF_INET, addr, &eth->d_addr, "ADD");
+}
+
+/*
+ * IPv6 counterpart of ipv4_dst_add(): same refresh-or-append logic, using
+ * ctx->arp6_hash, ctx->arp6 and ctx->arp6_num.
+ *
+ * NOTE(review): as in the IPv4 variant, the refresh path does not tolerate a
+ * NULL e_addr and arp6_num is not bounds-checked - confirm against callers.
+ */
+void
+ipv6_dst_add(struct glue_ctx *ctx, const struct in6_addr *addr,
+	     struct ether_addr *e_addr)
+{
+	struct arp_entry* entry;
+	struct tle_dest *dst;
+	struct ether_hdr *eth;
+	uint64_t idx;
+	bool check_wait;
+	int rc;
+
+	rc = rte_hash_lookup_data(ctx->arp6_hash, addr, (void**)&idx);
+	if (rc >= 0) {
+		entry = &ctx->arp6[idx];
+		dst = &entry->dst;
+		eth = (struct ether_hdr *)dst->hdr;
+		check_wait = is_broadcast_ether_addr(&eth->d_addr);
+
+		/* update arp entry, reset timer */
+		ether_addr_copy(e_addr, &eth->d_addr);
+		print_arp(AF_INET6, addr, &eth->d_addr, "UPDATE");
+		if(entry->timer != NULL)
+			tle_timer_stop(ctx->arp_tmw, entry->timer);
+		entry->timer = arp_timer(ctx, entry, ARP_ENTRY_EXPIRE);
+		entry->inuse = 0;
+		entry->req_time = 0;
+
+		if(check_wait)
+			flush_arp_wait(AF_INET6, ctx, addr, e_addr);
+
+		return;
+	}
+
+	idx = ctx->arp6_num;
+	entry = &ctx->arp6[idx];
+	dst = &entry->dst;
+
+	ipv6_dst_set(ctx, dst, addr, e_addr);
+	if (e_addr == NULL) {
+		entry->timer = arp_timer(ctx, entry, ARP_REQUEST_EXPIRE);
+		entry->req_time = 1;
+	} else {
+		entry->timer = arp_timer(ctx, entry, ARP_ENTRY_EXPIRE);
+		entry->inuse = 0;
+	}
+
+	rc = rte_hash_add_key_data(ctx->arp6_hash, addr, (void *)idx);
+	if (rc < 0)
+		rte_panic("Failed to add ARP6 entry");
+
+	eth = (struct ether_hdr *)dst->hdr;
+	print_arp(AF_INET6, addr, &eth->d_addr, "ADD");
+	ctx->arp6_num++;
+}
+
+/* Return non-zero when ip is a key of hash table h. */
+static inline int
+arp_ip_exist(const struct rte_hash *h, const void *ip)
+{
+	return rte_hash_lookup(h, ip) >= 0;
+}
+
+struct rte_mbuf *
+ndp_recv(struct glue_ctx *ctx, struct rte_mbuf *m, + uint32_t l2len, uint32_t l3len) +{ + struct ether_hdr *eth_h; + struct ipv6_hdr *ipv6_h; + struct nd_neighbor_solicit *ns_h; + struct nd_opt_hdr *opth; + + eth_h = rte_pktmbuf_mtod(m, struct ether_hdr *); + ipv6_h = rte_pktmbuf_mtod_offset(m, struct ipv6_hdr *, l2len); + ns_h = rte_pktmbuf_mtod_offset(m, struct nd_neighbor_solicit *, + l2len + l3len); + + if (ipv6_h->payload_len < sizeof(struct nd_neighbor_solicit)) + goto drop; + + /* We only learn mac when: + * 1. Normal NS for my ip, whose TargetAddr is me + * 2. Normal NA to my ip, whose DstIpv6 is me + * 3. Unsolicited NA, and we already have an entry for that IP + */ + + /* NS message */ + if (ns_h->nd_ns_hdr.icmp6_type == ND_NEIGHBOR_SOLICIT) { + /* not support Duplicate Address Detect NS yet */ + if (IN6_IS_ADDR_UNSPECIFIED(ipv6_h->src_addr)) + goto drop; + + if (memcmp(&ns_h->nd_ns_target, &ctx->ipv6, sizeof(ctx->ipv6))) + goto drop; + + /* NS message, target is my ipv6 addr */ + opth = (struct nd_opt_hdr*)(ns_h + 1); + ipv6_dst_add(ctx, (struct in6_addr *)ipv6_h->src_addr, + (struct ether_addr *)(opth + 1)); + + /* response NA message */ + ether_addr_copy(&ctx->mac, ð_h->s_addr); + ether_addr_copy((struct ether_addr*)(opth + 1), + ð_h->d_addr); + + rte_memcpy(ipv6_h->dst_addr, ipv6_h->src_addr, + sizeof(struct in6_addr)); + rte_memcpy(ipv6_h->src_addr, &ctx->ipv6, + sizeof(struct in6_addr)); + + ns_h->nd_ns_hdr.icmp6_type = ND_NEIGHBOR_ADVERT; + ns_h->nd_ns_hdr.icmp6_dataun.icmp6_un_data8[0] = 0x60; + ns_h->nd_ns_hdr.icmp6_cksum = 0; + + opth->nd_opt_type = ND_OPT_TARGET_LINKLAYER_ADDR; + ether_addr_copy(&ctx->mac, (struct ether_addr*)(opth + 1)); + + ns_h->nd_ns_hdr.icmp6_cksum = rte_ipv6_udptcp_cksum(ipv6_h, ns_h); + + if (m->pkt_len < ETHER_MIN_LEN) + rte_pktmbuf_append(m, ETHER_MIN_LEN - m->pkt_len); + + send_pkts(ctx, &m, 1, "NDP NA reply"); + return NULL; + } + + /* NA message */ + if (memcmp(ipv6_h->dst_addr, &ctx->ipv6, sizeof(ctx->ipv6)) == 0 
|| + (memcmp(ipv6_h->dst_addr, &ipv6_all_multi, sizeof(ctx->ipv6)) == 0 && + arp_ip_exist(ctx->arp6_hash, &ns_h->nd_ns_target))) { + opth = (struct nd_opt_hdr *)(ns_h + 1); + ipv6_dst_add(ctx, &ns_h->nd_ns_target, + (struct ether_addr *)(opth + 1)); + } + +drop: + rte_pktmbuf_free(m); + return NULL; +} + +struct rte_mbuf * +arp_recv(struct glue_ctx *ctx, struct rte_mbuf *m, uint32_t l2len) +{ + struct ether_hdr *eth; + struct arp_hdr *ahdr; + struct arp_ipv4 *adata; + uint32_t tip; + + eth = rte_pktmbuf_mtod(m, struct ether_hdr *); + ahdr = rte_pktmbuf_mtod_offset(m, struct arp_hdr *, l2len); + + if (ahdr->arp_hrd != rte_be_to_cpu_16(ARP_HRD_ETHER) || + ahdr->arp_pro != rte_be_to_cpu_16(ETHER_TYPE_IPv4)) + goto drop; + + adata = &ahdr->arp_data; + tip = adata->arp_tip; + + /* We only learn mac when: + * 1. tip is me, or + * 2. this is a RARP, and we already have an entry for that IP + */ + if (tip == ctx->ipv4 || + (tip == INADDR_ANY && arp_ip_exist(ctx->arp_hash, &adata->arp_sip))) + ipv4_dst_add(ctx, (struct in_addr *)&adata->arp_sip, + &adata->arp_sha); + + /* We only do ARP reply when: + * 1. tip is me. 
+	 */
+	if (ahdr->arp_op == rte_cpu_to_be_16(ARP_OP_REQUEST) &&
+			tip == ctx->ipv4) {
+		/* turn the request into a reply in place and send it back */
+		eth->d_addr = eth->s_addr;
+		eth->s_addr = ctx->mac;
+		ahdr->arp_op = rte_cpu_to_be_16(ARP_OP_REPLY);
+
+		adata->arp_tip = adata->arp_sip;
+		adata->arp_sip = tip;
+
+		adata->arp_tha = adata->arp_sha;
+		adata->arp_sha = ctx->mac;
+		if (m->pkt_len < ETHER_MIN_LEN)
+			rte_pktmbuf_append(m, ETHER_MIN_LEN - m->pkt_len);
+		send_pkts(ctx, &m, 1, "ARP reply");
+		return NULL;
+	}
+drop:
+	rte_pktmbuf_free(m);
+	return NULL;
+}
+
+/*
+ * Build and transmit an ICMPv6 Neighbor Solicitation for addr, carrying
+ * ctx->mac as the source link-layer address option.
+ * Panics when no mbuf can be allocated.
+ */
+static void
+arp6_send_request(struct glue_ctx *ctx, const struct in6_addr *addr)
+{
+	struct rte_mempool *mp = get_mempool_by_socket(0); /* fix me */
+	struct ether_hdr *eth;
+	struct ipv6_hdr *ip6h;
+	struct nd_neighbor_solicit *nsh;
+	struct nd_opt_hdr *opth;
+	struct ether_addr *sll_addr;
+	struct rte_mbuf *m;
+#ifdef ENABLE_TRACE
+	char str_ip[64];
+#endif
+
+	m = rte_pktmbuf_alloc(mp);
+	if (m == NULL)
+		rte_panic("Failed to alloc mbuf for ndp ns request");
+
+	eth = (struct ether_hdr *)rte_pktmbuf_append(m, sizeof(*eth));
+	ether_addr_copy(&ctx->mac, &eth->s_addr);
+	set_multicast_mac_v6(&eth->d_addr, addr);
+	eth->ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv6);
+
+	ip6h = (struct ipv6_hdr*)rte_pktmbuf_append(m, sizeof(struct ipv6_hdr));
+	ip6h->vtc_flow = 6 << 4;
+	/*
+	 * payload_len is a big-endian wire field; rte_ipv6_udptcp_cksum()
+	 * below derives the number of bytes to checksum from it, so a
+	 * host-order value makes it read far past the packet.
+	 */
+	ip6h->payload_len = rte_cpu_to_be_16(
+		sizeof(struct nd_neighbor_solicit) +
+		sizeof(struct nd_opt_hdr) +
+		sizeof(struct ether_addr));
+	ip6h->proto = IPPROTO_ICMPV6;
+	ip6h->hop_limits = 255;
+	rte_memcpy(ip6h->src_addr, &ctx->ipv6, sizeof(struct in6_addr));
+	rte_memcpy(ip6h->dst_addr, addr, sizeof(struct in6_addr));
+	set_multicast_ipv6(ip6h->dst_addr);
+
+	nsh = (struct nd_neighbor_solicit *)rte_pktmbuf_append(m, sizeof(*nsh));
+	nsh->nd_ns_hdr.icmp6_type = ND_NEIGHBOR_SOLICIT;
+	nsh->nd_ns_hdr.icmp6_code = 0;
+	nsh->nd_ns_hdr.icmp6_cksum = 0;
+	nsh->nd_ns_hdr.icmp6_dataun.icmp6_un_data32[0] = 0;
+	rte_memcpy(&nsh->nd_ns_target, addr, sizeof(struct in6_addr));
+
+	opth = (struct nd_opt_hdr *)rte_pktmbuf_append(m, 
sizeof(*opth)); + opth->nd_opt_type = ND_OPT_SOURCE_LINKLAYER_ADDR; + opth->nd_opt_len = 1; + + sll_addr = (struct ether_addr *)rte_pktmbuf_append(m, sizeof(*sll_addr)); + ether_addr_copy(&ctx->mac, sll_addr); + + nsh->nd_ns_hdr.icmp6_cksum = rte_ipv6_udptcp_cksum(ip6h, nsh); + + send_pkts(ctx, &m, 1, "ARP6 request"); +} + +static void +arp_send_request(struct glue_ctx *ctx, const struct in_addr *addr) +{ + struct rte_mempool *mp = get_mempool_by_socket(0); /* fix me */ + struct ether_hdr *eth; + struct arp_hdr *ahdr; + struct arp_ipv4 *adata; + struct rte_mbuf *m; + uint16_t pad_len, i; + char *pad; + + m = rte_pktmbuf_alloc(mp); + if (m == NULL) + rte_panic("Failed to alloc mbuf for arp request"); + + eth = (struct ether_hdr *)rte_pktmbuf_append(m, sizeof(*eth)); + ether_addr_copy(&ctx->mac, ð->s_addr); + set_broadcast_addr(ð->d_addr); + eth->ether_type = rte_cpu_to_be_16(ETHER_TYPE_ARP); + + ahdr = (struct arp_hdr *)rte_pktmbuf_append(m, sizeof(*ahdr)); + ahdr->arp_hrd = rte_be_to_cpu_16(ARP_HRD_ETHER); + ahdr->arp_pro = rte_be_to_cpu_16(ETHER_TYPE_IPv4); + ahdr->arp_hln = sizeof(struct ether_addr); + ahdr->arp_pln = sizeof(*addr); + ahdr->arp_op = rte_be_to_cpu_16(ARP_OP_REQUEST); + adata = &ahdr->arp_data; + ether_addr_copy(&ctx->mac, &adata->arp_sha); + adata->arp_sip = ctx->ipv4; + set_broadcast_addr(&adata->arp_tha); + adata->arp_tip = addr->s_addr; + + pad_len = ETHER_MIN_LEN - sizeof(*eth) - sizeof(*ahdr); + pad = rte_pktmbuf_append(m, pad_len); + for (i = 0; i < pad_len; ++i) + pad[i] = 0; + + send_pkts(ctx, &m, 1, "ARP request"); +} + +#define addr2ipv4(addr) (&((const struct sockaddr_in *)addr)->sin_addr) +#define addr2ipv6(addr) (&((const struct sockaddr_in6 *)addr)->sin6_addr) +void +mac_check(struct glue_ctx *ctx, const struct sockaddr *addr) +{ + int rc; + const struct in_addr *addr4 = NULL; + const struct in6_addr *addr6 = NULL; + + if(addr->sa_family == AF_INET) { + addr4 = ipv4_gateway_lookup(ctx, addr2ipv4(addr)); + rc = 
rte_hash_lookup(ctx->arp_hash, addr4);
+	} else {
+		addr6 = ipv6_gateway_lookup(ctx, addr2ipv6(addr));
+		rc = rte_hash_lookup(ctx->arp6_hash, addr6);
+	}
+	/* an entry already exists, nothing to request */
+	if (rc >= 0)
+		return;
+
+	if(addr->sa_family == AF_INET)
+		arp_send_request(ctx, addr4);
+	else
+		arp6_send_request(ctx, addr6);
+}
+
+/*
+ * Look for addr in the IPv4 ARP table of every other context and, on the
+ * first hit, add that entry's MAC to ctx via ipv4_dst_add().
+ * Returns 0 when an entry was inherited, -1 otherwise.
+ */
+static int
+arp_inherit(struct glue_ctx *ctx, const struct in_addr *addr)
+{
+	struct glue_ctx *next;
+	struct tle_dest *dst;
+	struct ether_hdr *eth;
+	uint64_t idx;
+	uint16_t i;
+	int rc;
+
+	for (i = 0; i < nb_ctx; i++) {
+		/*
+		 * The index is advanced by the for statement only; the
+		 * former ctx_array[i++] skipped every other context.
+		 */
+		next = &ctx_array[i];
+		if (next == ctx)
+			continue;
+
+		rc = rte_hash_lookup_data(next->arp_hash, addr, (void **)&idx);
+		if (rc < 0)
+			continue;
+
+		dst = &next->arp4[idx].dst;
+		eth = (struct ether_hdr *)dst->hdr;
+		ipv4_dst_add(ctx, addr, &eth->d_addr);
+		return 0;
+	}
+
+	return -1;
+}
+
+/*
+ * IPv6 counterpart of arp_inherit(): copy the neighbour entry for addr
+ * from the first other context that has one.
+ * Returns 0 when an entry was inherited, -1 otherwise.
+ */
+static int
+arp6_inherit(struct glue_ctx *ctx, const struct in6_addr *addr)
+{
+	struct glue_ctx *next;
+	struct ether_hdr *eth;
+	struct tle_dest *dst;
+	uint64_t idx;
+	uint16_t i;
+	int rc;
+
+	for (i = 0; i < nb_ctx; i++) {
+		/* see arp_inherit(): visit every context, not every other one */
+		next = &ctx_array[i];
+		if (next == ctx)
+			continue;
+
+		rc = rte_hash_lookup_data(next->arp6_hash, addr, (void **)&idx);
+		if (rc < 0)
+			continue;
+
+		dst = &next->arp6[idx].dst;
+		eth = (struct ether_hdr *)dst->hdr;
+		ipv6_dst_add(ctx, addr, &eth->d_addr);
+		return 0;
+	}
+
+	return -1;
+}
+
+/* bytes of a tle_dest that are in use: fixed part plus L2 and L3 headers */
+#define len_dest(dst) \
+	(offsetof(struct tle_dest, hdr) + (dst)->l2_len + (dst)->l3_len)
+
+/*
+ * Destination lookup callback for IPv6: fill res for addr.
+ * Loopback addresses use the context's loopback template and device. A hit
+ * in the neighbour table copies the stored entry and marks it in use. A
+ * miss builds a template with a broadcast MAC. In all cases the L3
+ * protocol field of the header in res is set to proto.
+ * data is the struct glue_ctx of the caller.
+ */
+int
+arp_ipv6_dst_lookup(void *data, const struct in6_addr *addr,
+		    struct tle_dest *res, int proto)
+{
+	int32_t rc;
+	uint64_t idx;
+	struct tle_dest *dst;
+	struct ipv6_hdr *ip6h;
+	struct glue_ctx *ctx = data;
+
+	if (is_ipv6_loopback_addr(addr, ctx)) {
+		dst = &ctx->lb_dst_v6;
+		rte_memcpy(res, dst, len_dest(dst));
+		if (proto == IPPROTO_TCP)
+			res->dev = ctx->lb_tcp_dev;
+		else
+			res->dev = ctx->lb_udp_dev;
+		rc = 0;
+		goto set_proto;
+	}
+
+	rc = rte_hash_lookup_data(ctx->arp6_hash, addr, (void **)&idx);
+	if (rc >= 0) {
+		if 
(!ctx->arp6[idx].inuse) + ctx->arp6[idx].inuse = 1; + dst = &ctx->arp6[idx].dst; + rte_memcpy(res, dst, len_dest(dst)); + } else { + memset(res, 0, sizeof(*res)); + ipv6_dst_set(ctx, res, addr, NULL); + rc = 0; + } + + if (proto == IPPROTO_TCP) + res->dev = ctx->tcp_dev; + else + res->dev = ctx->udp_dev; + +set_proto: + ip6h = (struct ipv6_hdr *)&res->hdr[res->l2_len]; + ip6h->proto = proto; + return rc; +} + +int +arp_ipv4_dst_lookup(void *data, const struct in_addr *addr, + struct tle_dest *res, int proto) +{ + int32_t rc; + uint64_t idx; + struct tle_dest *dst; + struct ipv4_hdr *ip4h; + struct glue_ctx *ctx = data; + + if (is_ipv4_loopback_addr(addr->s_addr, ctx)) { + dst = &ctx->lb_dst; + rte_memcpy(res, dst, len_dest(dst)); + if (proto == IPPROTO_TCP) + res->dev = ctx->lb_tcp_dev; + else + res->dev = ctx->lb_udp_dev; + rc = 0; + goto set_proto; + } + + rc = rte_hash_lookup_data(ctx->arp_hash, addr, (void **)&idx); + if (rc >= 0) { + if (!ctx->arp4[idx].inuse) + ctx->arp4[idx].inuse = 1; + dst = &ctx->arp4[idx].dst; + rte_memcpy(res, dst, len_dest(dst)); + } else { + memset(res, 0, sizeof(*res)); + ipv4_dst_set(ctx, res, addr, NULL); + rc = 0; + } + + if (proto == IPPROTO_TCP) + res->dev = ctx->tcp_dev; + else + res->dev = ctx->udp_dev; + +set_proto: + ip4h = (struct ipv4_hdr *)&res->hdr[res->l2_len]; + ip4h->next_proto_id = proto; + return rc; +} + +int +mac_fill(struct glue_ctx *ctx, struct rte_mbuf *m) +{ + int32_t rc; + uint64_t idx; + uint8_t ipver; + struct arp_entry* entry; + struct ether_addr *dst, *dst1; + struct ipv4_hdr *ipv4_hdr; + struct ipv6_hdr *ipv6_hdr; + const struct in_addr *addr4 = NULL; + const struct in6_addr *addr6 = NULL; + + dst = rte_pktmbuf_mtod(m, struct ether_addr *); + if (!is_broadcast_ether_addr(dst)) + return 0; + + ipv4_hdr = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *, m->l2_len); + ipv6_hdr = (struct ipv6_hdr*)ipv4_hdr; + ipver = ipv4_hdr->version_ihl >> 4; + +retry: + if (ipver == 4) { + addr4 = (struct in_addr 
*)&ipv4_hdr->dst_addr;
+		addr4 = ipv4_gateway_lookup(ctx, addr4);
+		rc = rte_hash_lookup_data(ctx->arp_hash, addr4, (void **)&idx);
+		if (rc >= 0)
+			entry = &ctx->arp4[idx];
+	} else {
+		addr6 = (struct in6_addr *)ipv6_hdr->dst_addr;
+		addr6 = ipv6_gateway_lookup(ctx, addr6);
+		rc = rte_hash_lookup_data(ctx->arp6_hash, addr6, (void **)&idx);
+		if (rc >= 0)
+			entry = &ctx->arp6[idx];
+	}
+
+	if (rc >= 0) {
+		dst1 = (struct ether_addr *)entry->dst.hdr;
+		/* resolved entry: patch the destination MAC and we are done */
+		if (!is_broadcast_ether_addr(dst1)) {
+			ether_addr_copy(dst1, dst);
+			return 0;
+		}
+
+		/* still unresolved: ask again and restart the request timer */
+		if (ipver == 4)
+			arp_send_request(ctx, addr4);
+		else
+			arp6_send_request(ctx, addr6);
+		entry->req_time++;
+		if (entry->timer != NULL)
+			tle_timer_stop(ctx->arp_tmw, entry->timer);
+		entry->timer = arp_timer(ctx, entry, ARP_REQUEST_EXPIRE);
+	} else {
+		/*
+		 * No entry yet: first try to copy one from another context,
+		 * otherwise create an unresolved entry and send a request.
+		 */
+		if (ipver == 4) {
+			if (arp_inherit(ctx, addr4) == 0)
+				goto retry;
+			ipv4_dst_add(ctx, addr4, NULL);
+			arp_send_request(ctx, addr4);
+		} else {
+			if (arp6_inherit(ctx, addr6) == 0)
+				goto retry;
+			ipv6_dst_add(ctx, addr6, NULL);
+			arp6_send_request(ctx, addr6);
+		}
+	}
+
+	return -1;
+}
+
+/* Return a pointer to the IPv4 destination address stored in e's header. */
+static inline const struct in_addr *
+get_addr_from_entry(struct arp_entry *e)
+{
+	const struct ipv4_hdr *ipv4;
+	const struct in_addr *addr;
+
+	ipv4 = (struct ipv4_hdr *)(e->dst.hdr + e->dst.l2_len);
+	addr = (const struct in_addr *)&ipv4->dst_addr;
+	return addr;
+}
+
+/* Return a pointer to the IPv6 destination address stored in e's header. */
+static inline const struct in6_addr *
+get_addr6_from_entry(struct arp_entry *e)
+{
+	const struct ipv6_hdr *ipv6;
+	const struct in6_addr *addr;
+
+	ipv6 = (struct ipv6_hdr *)(e->dst.hdr + e->dst.l2_len);
+	addr = (const struct in6_addr *)ipv6->dst_addr;
+	return addr;
+}
+
+/*
+ * Unlink and free every packet on ctx->arp_wait whose next hop is addr
+ * (address family af).
+ */
+static void
+drop_arp_wait(int af, struct glue_ctx *ctx, const void *addr)
+{
+	struct rte_mbuf *pkt, *next, *pre;
+
+	for (pre = NULL, pkt = ctx->arp_wait; pkt != NULL; pkt = next) {
+		/* fetch the link before pkt can be freed below */
+		next = pkt->next_pkt;
+
+		if ((af == AF_INET &&
+		     !match_addr(ctx, pkt, (const struct in_addr *)addr)) ||
+		    (af == AF_INET6 &&
+		     !match_addr6(ctx, pkt, (const struct in6_addr *)addr))) {
+			pre = pkt;
+			continue;
+		}
+
+		if (pre == NULL)
+			ctx->arp_wait = next;
+		else
+			pre->next_pkt = next;
+
+		rte_pktmbuf_free(pkt);
+	}
+}
+
+/*
+ * Remove entry e from the IPv4 (af == AF_INET) or IPv6 neighbour table.
+ * The table is kept contiguous: unless e is the last entry, the last entry
+ * is moved into e's slot, re-keyed in the hash and given a fresh timer.
+ */
+static void
+arp_entry_del(struct glue_ctx *ctx, int af, struct arp_entry *e)
+{
+	const void *addr;
+	struct arp_entry *t;
+	uint32_t idx, last_idx;
+	const struct rte_hash *h;
+
+	if (af == AF_INET) {
+		addr = get_addr_from_entry(e);
+		t = ctx->arp4;
+		h = ctx->arp_hash;
+		last_idx = ctx->arp4_num - 1;
+	} else {
+		addr = get_addr6_from_entry(e);
+		t = ctx->arp6;
+		h = ctx->arp6_hash;
+		last_idx = ctx->arp6_num - 1;
+	}
+
+	idx = e - t;
+	if (idx > last_idx) /* entry has been moved */
+		return;
+
+	print_arp(af, addr, (struct ether_addr *)e->dst.hdr, "DELETE");
+
+	if (e->req_time > ARP_MAX_REQ_TIMES)
+		drop_arp_wait(af, ctx, addr);
+
+	rte_hash_del_key(h, addr);
+
+	if (idx < last_idx) {
+		/*
+		 * Replace current entry with last entry. addr points into
+		 * e, so after the copy it refers to the moved entry's
+		 * address and re-keys that entry to slot idx.
+		 */
+		rte_memcpy(e, t + last_idx, sizeof(*e));
+		rte_hash_add_key_data(h, addr, (void *)(uintptr_t)idx);
+		/* same NULL guard as the other tle_timer_stop() callers */
+		if (t[last_idx].timer != NULL)
+			tle_timer_stop(ctx->arp_tmw, t[last_idx].timer);
+		if (e->req_time > 0)
+			e->timer = arp_timer(ctx, e, ARP_REQUEST_EXPIRE);
+		else {
+			e->timer = arp_timer(ctx, e, ARP_ENTRY_EXPIRE);
+			e->inuse = 0;
+		}
+	}
+
+	/* we always delete the last entry to keep it contiguous */
+	t[last_idx].timer = NULL;
+	t[last_idx].inuse = 0;
+	t[last_idx].req_time = 0;
+	if (af == AF_INET)
+		ctx->arp4_num--;
+	else
+		ctx->arp6_num--;
+}
+
+/*
+ * Advance the ARP timer wheel and handle up to ARP_PROCESS_MAX expired
+ * entries: entries that are in use or still within their request budget are
+ * re-requested, all others are deleted.
+ */
+void
+mac_timeout(struct glue_ctx *ctx)
+{
+#define ARP_PROCESS_MAX 32
+	struct arp_entry *entry[ARP_PROCESS_MAX], *e;
+	struct tle_timer_wheel *tw;
+	const struct in_addr *addr4;
+	const struct in6_addr *addr6;
+	uint32_t i, cnt;
+	uint8_t *l3h;
+
+	tw = ctx->arp_tmw;
+	tle_timer_expire(tw, rte_get_tsc_cycles() >> ctx->cycles_ms_shift);
+	cnt = tle_timer_get_expired_bulk(tw, (void**)entry, ARP_PROCESS_MAX);
+	if (cnt == 0)
+		return;
+
+	for(i = 0; i < cnt; i++) {
+		e = entry[i];
+		e->timer = NULL;
+		l3h = e->dst.hdr + e->dst.l2_len;
+		if (e->inuse ||
+		    (e->req_time > 0 && 
e->req_time <= ARP_MAX_REQ_TIMES)) { + if (((struct ipv4_hdr *)l3h)->version_ihl >> 4 == 4) { + addr4 = get_addr_from_entry(e); + arp_send_request(ctx, addr4); + } else { + addr6 = get_addr6_from_entry(e); + arp6_send_request(ctx, addr6); + } + + e->timer = arp_timer(ctx, e, ARP_REQUEST_EXPIRE); + e->inuse = 0; + e->req_time++; + } else { + if (((struct ipv4_hdr *)l3h)->version_ihl >> 4 == 4) + arp_entry_del(ctx, AF_INET, e); + else + arp_entry_del(ctx, AF_INET6, e); + } + } +} diff --git a/lib/libtle_glue/be.c b/lib/libtle_glue/be.c new file mode 100644 index 0000000..7e2227e --- /dev/null +++ b/lib/libtle_glue/be.c @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include "config.h" +#include "log.h" +#include "util.h" +#include "internal.h" + +static inline void +rte_pktmbuf_copy_seg(struct rte_mbuf *dst, struct rte_mbuf* src) +{ + size_t offset = offsetof(struct rte_mbuf, data_off); + rte_memcpy((char*)dst + offset, (char*)src + offset, + sizeof(struct rte_mbuf) - offset); + rte_mbuf_refcnt_set(dst, 1); + dst->ol_flags &= ~IND_ATTACHED_MBUF; + rte_memcpy(rte_pktmbuf_mtod(dst, void*), rte_pktmbuf_mtod(src, void*), + src->data_len); +} + +static inline struct rte_mbuf* +rte_pktmbuf_copy(struct rte_mbuf *md, struct rte_mempool* mp) +{ + struct rte_mbuf *mc, *mi, **prev; + uint32_t pktlen; + uint16_t nseg; + + if (unlikely ((mc = rte_pktmbuf_alloc(mp)) == NULL)) + return NULL; + + mi = mc; + prev = &mi->next; + pktlen = md->pkt_len; + nseg = 0; + + do { + nseg++; + rte_pktmbuf_copy_seg(mi, md); + *prev = mi; + prev = &mi->next; + } while ((md = md->next) != NULL && + (mi = rte_pktmbuf_alloc(mp)) != NULL); + + *prev = NULL; + mc->nb_segs = nseg; + mc->pkt_len = pktlen; + + /* Allocation of new indirect segment failed */ + if (unlikely(mi == NULL)) { + rte_pktmbuf_free(mc); + return NULL; + } + + __rte_mbuf_sanity_check(mc, 1); + return mc; +} + +static inline int +process_rx_pkts(struct glue_ctx *ctx, struct rte_mbuf *pkts[], + uint32_t n, uint8_t from_loopback) +{ + uint32_t i, j, k, jt, ju, jd; + struct rte_mbuf *tcp[MAX_PKTS_BURST]; + struct rte_mbuf *udp[MAX_PKTS_BURST]; + struct rte_mbuf *drop[MAX_PKTS_BURST]; + int32_t rc[MAX_PKTS_BURST]; + struct tle_dev *tcp_dev, *udp_dev; + struct rte_mempool *mp; + struct rte_mbuf *tmp; + uint64_t ts; + + if (n == 0) + return 0; + + if (unlikely(from_loopback)) { + tcp_dev = ctx->lb_tcp_dev; + udp_dev = ctx->lb_udp_dev; + mp = pkts[0]->pool; + for (i = 0; i < n; i++) { + tmp = rte_pktmbuf_copy(pkts[i], mp); + if (tmp != NULL) { + rte_pktmbuf_free(pkts[i]); + pkts[i] = tmp; + pkts[i]->ol_flags |= PKT_RX_IP_CKSUM_GOOD; + 
pkts[i]->ol_flags |= PKT_RX_L4_CKSUM_GOOD; + } else { + k = i; + for (; i < n; i++) { + rte_pktmbuf_free(pkts[i]); + } + n = k; + } + } + } else { + tcp_dev = ctx->tcp_dev; + udp_dev = ctx->udp_dev; + } + + ts = rte_get_tsc_cycles() >> (ctx->cycles_ms_shift - 10); + + for (j = 0, jt = 0, ju = 0, jd = 0; j < n; j++) { + pkts[j]->timestamp = ts; + switch (pkts[j]->packet_type & RTE_PTYPE_L4_MASK) { + case RTE_PTYPE_L4_TCP: + tcp[jt++] = pkts[j]; + break; + case RTE_PTYPE_L4_UDP: + udp[ju++] = pkts[j]; + break; + case RTE_PTYPE_L4_ICMP: + /* TODO */ + case RTE_PTYPE_L4_FRAG: + /* TODO */ + default: + drop[jd++] = pkts[j]; + } + } + + if (jt > 0) { + k = tle_tcp_rx_bulk(tcp_dev, tcp, drop + jd, rc, jt); + jd += jt - k; + + TRACE("(port=%u, queue=%u), %u/%u (TCP) pkts are received", + port_id, queue_id, k, n); + } + + if (ju > 0) { + k = tle_udp_rx_bulk(udp_dev, udp, drop + jd, rc, ju); + jd += ju - k; + + TRACE("(port=%u, queue=%u), %u/%u (UDP) pkts are received", + port_id, queue_id, k, n); + } + + for (j = 0; j < jd; j++) + rte_pktmbuf_free(drop[j]); + + return jt + ju - jd; +} + +static inline int +be_rx(struct glue_ctx *ctx) +{ + int ret; + uint32_t n; + struct rte_mbuf *pkts[MAX_PKTS_BURST]; + uint16_t port_id = ctx->port_id; + uint16_t queue_id = ctx->queue_id; + + n = rte_eth_rx_burst(port_id, queue_id, pkts, RTE_DIM(pkts)); + ret = process_rx_pkts(ctx, pkts, n, 0); + + return ret; +} + +int +be_tx(struct glue_ctx *ctx) +{ + uint32_t n, j, k, s, ret; + const uint16_t max_pkts = MAX_PKTS_BURST; + struct rte_mbuf *pkts[max_pkts]; + struct rte_mbuf *_pkts[max_pkts]; + uint16_t port_id = ctx->port_id; + uint16_t queue_id = ctx->queue_id; + + ret = 0; + tle_tcp_process(ctx->tcp_ctx, TCP_MAX_PROCESS); + + n = tle_tcp_tx_bulk(ctx->lb_tcp_dev, pkts, max_pkts); + n += tle_udp_tx_bulk(ctx->lb_udp_dev, pkts + n, max_pkts - n); + if (n > 0) { + ret += n; + rte_eth_tx_burst(ctx->lb_port_id, 0, pkts, n); + /* loopback device could receive after transmit immediately */ + n = 
rte_eth_rx_burst(ctx->lb_port_id, 0, pkts, RTE_DIM(pkts));
+		process_rx_pkts(ctx, pkts, n, 1);
+
+		/* wake up look-aside backend */
+		wake_lookaside_backend(ctx);
+	}
+
+	n = tle_tcp_tx_bulk(ctx->tcp_dev, pkts, max_pkts);
+	n += tle_udp_tx_bulk(ctx->udp_dev, pkts + n, max_pkts - n);
+	/* nothing for the wire; still report the loopback packets counted above */
+	if (n == 0)
+		return ret;
+
+	ret += n;
+	s = 0;
+	for (j = 0; j != n; j++) {
+		if (mac_fill(ctx, pkts[j]) == 0) {
+			PKT_DUMP(pkts[j]);
+			_pkts[s++] = pkts[j];
+			continue;
+		}
+
+		/* destination MAC unknown: park the packet on the wait list */
+		pkts[j]->next_pkt = ctx->arp_wait;
+		ctx->arp_wait = pkts[j];
+	}
+
+	/* For virtio-user/vhost-kernel test case, it's normal that vhost
+	 * kthread cannot catch up with packets generation speed in stack.
+	 * Shall we drop those packets immediately or retry some times to
+	 * keep those packets? We find dropping packets here is not a good
+	 * idea, which leads to lots of retrans and inefficiency of vhost
+	 * kthread. Even below code does not work well:
+	 *
+	 * for (k = 0, retry = 0; k < s && retry < 10000; retry++)
+	 *	k += rte_eth_tx_burst(port_id, queue_id, _pkts + k, s - k);
+	 *
+	 * So we choose to blockingly send out packets.
+	 */
+	k = 0;
+	while (k < s)
+		k += rte_eth_tx_burst(port_id, queue_id, _pkts + k, s - k);
+
+	/*
+	 * The loop above only ends once k == s, so every packet has been
+	 * handed to the driver and there is nothing left to free here.
+	 */
+	TRACE("(port=%u, queue=%u), %u/%u pkts are sent",
+	      port_id, queue_id, k, s);
+
+	return ret;
+}
+
+/*
+ * Run one backend iteration for ctx: receive, expire ARP timers, transmit.
+ * Returns the sum of the be_rx() and be_tx() results, or 0 when the global
+ * stopped flag is set.
+ */
+int
+be_process(struct glue_ctx *ctx)
+{
+	int ret;
+
+	if (unlikely(stopped))
+		return 0;
+
+	ret = be_rx(ctx);
+	mac_timeout(ctx);
+	ret += be_tx(ctx);
+
+	return ret;
+}
diff --git a/lib/libtle_glue/config.h b/lib/libtle_glue/config.h
new file mode 100644
index 0000000..976495e
--- /dev/null
+++ b/lib/libtle_glue/config.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _TLE_GLUE_CONFIG_H_
+#define _TLE_GLUE_CONFIG_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_STREAMS_PER_CORE		(64 * 1024)
+#define MIN_STREAMS_PER_CORE		16
+#define DELTA_STREAMS			64
+#define FRAG_BUCKET			8
+#define FRAG_ENTRIES_PER_BUCKET		8
+#define MAX_ARP_ENTRY			(1 << 10)
+
+/* RCV buffer & SND buffer
+ * This is not a real rcv/snd buffer implementation. The numbers below are
+ * the slots to store mbufs of sent or received data. Each slot could
+ * contain a single mbuf with size of (1500B or 2048B) or a chained
+ * mbuf with size <= 64KB.
+ *
+ * TODO: add real snd/rcv buffer
+ */
+#define MAX_RECV_BUFS_PER_STREAM	256
+#define MAX_SEND_BUFS_PER_STREAM	256
+
+#ifdef LOOK_ASIDE_BACKEND
+#define MAX_NB_CTX			1
+#else
+#define MAX_NB_CTX			16
+#endif
+
+#define MAX_MBUFS			0x80000
+/* should be calculated by:
+ * MAX_NB_CTX * MAX_STREAMS_PER_CORE * (MAX_RECV_BUFS_PER_STREAM + MAX_SEND_BUFS_PER_STREAM)
+ */
+
+#define MBUF_DYNAMIC_SIZE		0x800
+
+#define MBUF_PERCORE_CACHE		32
+
+#define MAX_PKTS_BURST			0x20
+
+#define TCP_MAX_PROCESS			32
+
+#define ARP_ENTRY_EXPIRE		60000U /* ms */
+#define ARP_REQUEST_EXPIRE		1000U /* ms */
+#define ARP_MAX_REQ_TIMES		5
+
+#define MTU_NORMAL			1500
+#define MTU_LOOPBACK			65535
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*_TLE_GLUE_CONFIG_H_ */
diff --git a/lib/libtle_glue/ctx.c b/lib/libtle_glue/ctx.c
new file mode 100644
index 0000000..dc78f39
--- /dev/null
+++ b/lib/libtle_glue/ctx.c
@@ -0,0 +1,535 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "config.h" +#include "ctx.h" +#include "log.h" +#include "util.h" +#include "internal.h" +#include "gateway.h" +#include "tle_timer.h" + +RTE_DEFINE_PER_LCORE(struct glue_ctx *, glue_ctx); + +int nb_ctx; +struct glue_ctx ctx_array[MAX_NB_CTX]; +struct glue_ctx *default_ctx = &ctx_array[0]; + +static int +ipv4_dst_lookup_tcp(void *data, const struct in_addr *addr, + struct tle_dest *res) +{ + addr = ipv4_gateway_lookup(data, addr); + return arp_ipv4_dst_lookup(data, addr, res, IPPROTO_TCP); +} + +static int +ipv4_dst_lookup_udp(void *data, const struct in_addr *addr, + struct tle_dest *res) +{ + addr = ipv4_gateway_lookup(data, addr); + return arp_ipv4_dst_lookup(data, addr, res, IPPROTO_UDP); +} + +static int +ipv6_dst_lookup_tcp(void *data, const struct in6_addr *addr, + struct tle_dest *res) +{ + addr = ipv6_gateway_lookup(data, addr); + return arp_ipv6_dst_lookup(data, addr, res, IPPROTO_TCP); +} + +static int +ipv6_dst_lookup_udp(void *data, const struct in6_addr *addr, + struct tle_dest *res) +{ + addr = ipv6_gateway_lookup(data, addr); + return arp_ipv6_dst_lookup(data, addr, res, IPPROTO_UDP); +} + +static struct tle_ctx * +proto_ctx_create(uint32_t socket_id, uint32_t proto, void *data) +{ + struct tle_ctx_param cprm; + + if (proto != TLE_PROTO_TCP && proto != TLE_PROTO_UDP) + 
rte_panic("Invalid proto [%u]\n", proto); + + cprm.socket_id = socket_id; + cprm.proto = proto; + cprm.max_streams = MAX_STREAMS_PER_CORE; + cprm.min_streams = MIN_STREAMS_PER_CORE; + cprm.delta_streams = DELTA_STREAMS; + cprm.max_stream_rbufs = MAX_RECV_BUFS_PER_STREAM; + cprm.max_stream_sbufs = MAX_SEND_BUFS_PER_STREAM; + if (proto == TLE_PROTO_TCP) { + cprm.lookup4 = ipv4_dst_lookup_tcp; + cprm.lookup6 = ipv6_dst_lookup_tcp; + } else { + cprm.lookup4 = ipv4_dst_lookup_udp; + cprm.lookup6 = ipv6_dst_lookup_udp; + } + cprm.lookup4_data = data; + cprm.lookup6_data = data; +#ifdef LOOK_ASIDE_BACKEND + cprm.flags = 0; +#else + cprm.flags = TLE_CTX_FLAG_ST; /* ctx will be used by single thread*/ +#endif + cprm.send_bulk_size = 0; /* 32 if 0 */ + cprm.hash_alg = TLE_SIPHASH; + cprm.secret_key.u64[0] = rte_rand(); + cprm.secret_key.u64[1] = rte_rand(); + cprm.icw = 0; /**< congestion window, default is 2*MSS if 0. */ + cprm.timewait = 1; /* TLE_TCP_TIMEWAIT_DEFAULT */ + + return tle_ctx_create(&cprm); +} +/* Create the error, RX and TX event queues of ctx; panics on failure, so it always returns 0. */ +static int +evq_init(struct glue_ctx *ctx, uint32_t socket_id) +{ + struct tle_evq_param eprm = { + .socket_id = socket_id, + .max_events = 0, /* We don't pre-allocate any event */ + }; + + ctx->ereq = tle_evq_create(&eprm); + if (ctx->ereq == NULL) + rte_panic("Cannot create ereq"); + + ctx->rxeq = tle_evq_create(&eprm); + if (ctx->rxeq == NULL) + rte_panic("Cannot create rxeq"); + + ctx->txeq = tle_evq_create(&eprm); + if (ctx->txeq == NULL) + rte_panic("Cannot create txeq"); + + return 0; +} +/* Create the TCP and UDP tle contexts of ctx, add its devices (plus loopback devices for default_ctx) and its event queues. */ +static void +tle_ctx_init(struct glue_ctx *ctx, uint32_t socket_id) +{ + struct tle_dev_param dprm; + struct rte_eth_dev_info dev_info; + uint16_t port_id = 0; /* currently only use one port */ + + ctx->tcp_ctx = proto_ctx_create(socket_id, TLE_PROTO_TCP, ctx); + if (ctx->tcp_ctx == NULL) + rte_panic("Cannot create tle_ctx for tcp"); + + ctx->udp_ctx = proto_ctx_create(socket_id, TLE_PROTO_UDP, ctx); + if (ctx->udp_ctx == NULL) + rte_panic("Cannot create tle_ctx for udp");
+ + memset(&dprm, 0, sizeof(dprm)); + + /* enable only the offloads that are both requested and supported by the port */ + rte_eth_dev_info_get(port_id, &dev_info); + dprm.rx_offload = dev_info.rx_offload_capa & rx_offload; + dprm.tx_offload = dev_info.tx_offload_capa & tx_offload; + + dprm.local_addr4.s_addr = ctx->ipv4; + rte_memcpy(&dprm.local_addr6, &ctx->ipv6, sizeof(struct in6_addr)); + dprm.bl4.nb_port = 0; + dprm.bl4.port = NULL; + dprm.bl6.nb_port = 0; + dprm.bl6.port = NULL; + + ctx->tcp_dev = tle_add_dev(ctx->tcp_ctx, &dprm); + if (ctx->tcp_dev == NULL) + rte_panic("add tle_dev for tcp failed: %u", rte_errno); + + ctx->udp_dev = tle_add_dev(ctx->udp_ctx, &dprm); + if (ctx->udp_dev == NULL) + rte_panic("add tle_dev for udp failed: %u", rte_errno); + + if (ctx == default_ctx) { /* only the default ctx gets the loopback devices */ + dprm.rx_offload = rx_offload; + dprm.tx_offload = tx_offload; + dprm.local_addr4.s_addr = htonl(INADDR_LOOPBACK); + rte_memcpy(&dprm.local_addr6, &in6addr_loopback, + sizeof(struct in6_addr)); + + ctx->lb_tcp_dev = tle_add_dev(ctx->tcp_ctx, &dprm); + if (ctx->lb_tcp_dev == NULL) + rte_panic("failed to add loopback tcp dev: %u\n", + rte_errno); + + ctx->lb_udp_dev = tle_add_dev(ctx->udp_ctx, &dprm); + if (ctx->lb_udp_dev == NULL) + rte_panic("failed to add loopback udp dev: %u\n", + rte_errno); + } + + evq_init(ctx, socket_id); +} +/* Return the IPv4 address from env DPDK_IP (default DPDK_IP_DEF) as filled in by inet_aton(); panics if it is malformed. */ +static uint32_t +get_ip(void) +{ + struct in_addr addr; + const char *ip_str = getenv(DPDK_IP); + + if (ip_str == NULL) { + ip_str = DPDK_IP_DEF; + GLUE_LOG(INFO, "will use the default IP %s", DPDK_IP_DEF); + } else + GLUE_LOG(INFO, "will use the IP %s", ip_str); + + if (inet_aton(ip_str, &addr) == 0) + rte_panic("Invalid addr from env DPDK_IP: %s", ip_str); + + return addr.s_addr; +} +/* Return the IPv4 prefix length from env DPDK_IP_MASK (default DPDK_IP_MASK_DEF); parsed with atoi(), not range-checked. */ +static uint8_t +get_ip_mask(void) +{ + const char *mask_str = getenv(DPDK_IP_MASK); + + if (mask_str == NULL) { + mask_str = DPDK_IP_MASK_DEF; + GLUE_LOG(INFO, "will use the default IP Mask %s", DPDK_IP_MASK_DEF); + } else + GLUE_LOG(INFO, "will use the IP Mask %s", mask_str); + + return (uint8_t)atoi(mask_str); +} +
+static uint32_t +get_ip_gate(void) +{ + struct in_addr addr; + const char *ip_str = getenv(DPDK_IP_GATEWAY); + + if (ip_str == NULL) { + ip_str = DPDK_IP_GATEWAY_DEF; + GLUE_LOG(INFO, "will use the default IP gateway %s", + DPDK_IP_GATEWAY_DEF); + } else + GLUE_LOG(INFO, "will use the IP gateway %s", ip_str); + + if (inet_aton(ip_str, &addr) == 0) + rte_panic("Invalid addr from env DPDK_IP_GATEWAY: %s", ip_str); + + return addr.s_addr; +} +/* Parse env DPDK_IPV6 (default DPDK_IPV6_DEF) into a function-static in6_addr and return it; panics on a bad address. */ +static struct in6_addr* +get_ipv6(void) +{ + static struct in6_addr addr; + const char *ip_str = getenv(DPDK_IPV6); + + if (ip_str == NULL) { + ip_str = DPDK_IPV6_DEF; + GLUE_LOG(INFO, "will use the default IP(V6) %s", DPDK_IPV6_DEF); + } else + GLUE_LOG(INFO, "will use the IP(V6) %s", ip_str); + + if (inet_pton(AF_INET6, ip_str, &addr) != 1) /* 1 is the only success value; 0 and -1 are both failures */ + rte_panic("Invalid addr from env DPDK_IPV6: %s", ip_str); + + return &addr; +} +/* Return the IPv6 prefix length from env DPDK_IPV6_MASK (default DPDK_IPV6_MASK_DEF); parsed with atoi(), not range-checked. */ +static uint8_t +get_ipv6_mask(void) +{ + const char *mask_str = getenv(DPDK_IPV6_MASK); + + if (mask_str == NULL) { + mask_str = DPDK_IPV6_MASK_DEF; + GLUE_LOG(INFO, "will use the default IPV6 Mask %s", + DPDK_IPV6_MASK_DEF); + } else + GLUE_LOG(INFO, "will use the IPV6 Mask %s", mask_str); + + return (uint8_t)atoi(mask_str); +} +/* Parse env DPDK_IPV6_GATEWAY (default DPDK_IPV6_GATEWAY_DEF) into a function-static in6_addr and return it; panics on a bad address. */ +static struct in6_addr* +get_ipv6_gate(void) +{ + static struct in6_addr addr; + const char *ip_str = getenv(DPDK_IPV6_GATEWAY); + + if (ip_str == NULL) { + ip_str = DPDK_IPV6_GATEWAY_DEF; + GLUE_LOG(INFO, "will use the default IP(V6) gateway %s", + DPDK_IPV6_GATEWAY_DEF); + } else + GLUE_LOG(INFO, "will use the IP(V6) gateway %s", ip_str); + + if (inet_pton(AF_INET6, ip_str, &addr) != 1) /* 1 is the only success value; 0 and -1 are both failures */ + rte_panic("Invalid addr from env DPDK_IPV6_GATEWAY: %s", ip_str); + + return &addr; +} +/* IPv4 loopback is on unless env DPDK_LO4_ENABLED is exactly "0". */ +static bool +lo4_enabled(void) +{ + const char *str = getenv("DPDK_LO4_ENABLED"); + if (str != NULL && strcmp(str, "0") == 0) + return false; + return true; +} +/* IPv6 loopback is off unless env DPDK_LO6_ENABLED is exactly "1". */ +static bool +lo6_enabled(void) +{ + const char *str = getenv("DPDK_LO6_ENABLED"); + if (str == NULL || strcmp(str, "1") != 0) + return false;
+ return true; +} +/* Fill ctx->lb_dst and ctx->lb_dst_v6 with the Ethernet + IPv4 / IPv6 header templates used for the loopback destination. */ +static void +loopback_dst_init(struct glue_ctx *ctx) +{ + struct tle_dest *dst; + struct ether_hdr *eth; + struct ipv4_hdr *ip4h; + struct ipv6_hdr *ip6h; + + /* init ipv4 dst */ + dst = &ctx->lb_dst; + dst->mtu = 65535; + + dst->l2_len = sizeof(*eth); + dst->head_mp = get_mempool_by_socket(0); /* FIXME: the mempool is always taken from socket 0 */ + eth = (struct ether_hdr *)dst->hdr; + memset(eth, 0, 2 * sizeof(eth->d_addr)); /* presumably clears d_addr and the s_addr right after it -- confirm layout */ + eth->ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4); + + dst->l3_len = sizeof(*ip4h); + ip4h = (struct ipv4_hdr *)(eth + 1); + ip4h->dst_addr = htonl(INADDR_LOOPBACK); + ip4h->version_ihl = 4 << 4 | sizeof(*ip4h) / IPV4_IHL_MULTIPLIER; + ip4h->time_to_live = 64; + ip4h->next_proto_id = IPPROTO_TCP; + + /* init ipv6 dst */ + dst = &ctx->lb_dst_v6; + dst->mtu = 65535; + + dst->l2_len = sizeof(*eth); + dst->head_mp = get_mempool_by_socket(0); /* FIXME: the mempool is always taken from socket 0 */ + eth = (struct ether_hdr *)dst->hdr; + memset(eth, 0, 2 * sizeof(eth->d_addr)); + eth->ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv6); + + dst->l3_len = sizeof(*ip6h); + ip6h = (struct ipv6_hdr *)(eth + 1); + rte_memcpy(ip6h->dst_addr, &in6addr_loopback, sizeof(struct in6_addr)); + ip6h->vtc_flow = 6 << 4; /* NOTE(review): written without a byte-order conversion -- confirm this is intended */ + ip6h->hop_limits = 255; + ip6h->proto = IPPROTO_TCP; +} +/* Create the two per-ctx rte_hash tables, keyed by struct in_addr and struct in6_addr; panics on failure. */ +static void +arp_hash_init(struct glue_ctx *ctx, unsigned socket_id) +{ + char str[RTE_HASH_NAMESIZE]; + struct rte_hash_parameters hprm; + + /* init ipv4 arp hash */ + snprintf(str, sizeof(str), "arp_hash_4@ctx%u", ctx->queue_id); + memset(&hprm, 0, sizeof(hprm)); + hprm.name = str; + hprm.entries = MAX_ARP_ENTRY * 2; + hprm.socket_id = socket_id; + hprm.key_len = sizeof(struct in_addr); + ctx->arp_hash = rte_hash_create(&hprm); + if (ctx->arp_hash == NULL) { + rte_panic("Failed to init hashtable for ARP"); + } + + /* init ipv6 arp hash */ + snprintf(str, sizeof(str), "arp_hash_6@ctx%u", ctx->queue_id); + memset(&hprm, 0, sizeof(hprm)); + hprm.name = str; + hprm.entries = MAX_ARP_ENTRY * 2; + hprm.socket_id = socket_id; + hprm.key_len =
sizeof(struct in6_addr); + ctx->arp6_hash = rte_hash_create(&hprm); + if (ctx->arp6_hash == NULL) { + rte_panic("Failed to init hashtable for ARP6"); + } +} + +/* get current timestamp in approximate ms (TSC cycles >> mshift), see tcp_get_tms() */ +static inline uint64_t +arp_get_tms(uint32_t mshift) +{ + uint64_t ts; + + ts = rte_get_tsc_cycles() >> mshift; + return ts; +} +/* Create the per-ctx timer wheel ctx->arp_tmw, started at the current approximate-ms timestamp; panics on failure. */ +static void +arp_timer_init(struct glue_ctx *ctx, unsigned socket_id) +{ + struct tle_timer_wheel_args twprm; /* NOTE(review): only three fields are set below -- confirm the struct has no others */ + + twprm.tick_size = 1000U; + twprm.max_timer = MAX_ARP_ENTRY + 8; + twprm.socket_id = socket_id; + ctx->arp_tmw = tle_timer_create(&twprm, + arp_get_tms(ctx->cycles_ms_shift)); + if (ctx->arp_tmw == NULL) + rte_panic("Failed to init timer wheel for ARP"); +} +/* Fill a freshly allocated ctx: ARP tables, addresses read from the environment, cycles-to-ms shift, ARP hash/timer, defrag table. */ +static void +glue_ctx_init(struct glue_ctx *ctx, uint32_t socket_id) +{ + uint64_t ms; + + ctx->arp4 = rte_zmalloc_socket(NULL, + sizeof(struct arp_entry) * MAX_ARP_ENTRY, + RTE_CACHE_LINE_SIZE, socket_id); + ctx->arp6 = rte_zmalloc_socket(NULL, + sizeof(struct arp_entry) * MAX_ARP_ENTRY, + RTE_CACHE_LINE_SIZE, socket_id); + if (!ctx->arp4 || !ctx->arp6) + rte_panic("Failed to allocate arp table"); + + ctx->port_id = 0; + ctx->queue_id = nb_ctx - 1; /* nb_ctx was already incremented by the caller, glue_ctx_alloc() */ + ctx->ipv4 = get_ip(); + ctx->ipv4_ml = get_ip_mask(); + ctx->ipv4_gw.s_addr = get_ip_gate(); + ctx->lo4_enabled = lo4_enabled(); + rte_memcpy(&ctx->ipv6, get_ipv6(), sizeof(struct in6_addr)); + ctx->ipv6_ml = get_ipv6_mask(); + rte_memcpy(&ctx->ipv6_gw, get_ipv6_gate(), sizeof(struct in6_addr)); + ctx->lo6_enabled = lo6_enabled(); + + /* calculate the closest shift to convert from cycles to ms (approximate) */ + ms = (rte_get_tsc_hz() + MS_PER_S - 1) / MS_PER_S; + ctx->cycles_ms_shift = sizeof(ms) * CHAR_BIT - __builtin_clzll(ms) - 1; + + arp_hash_init(ctx, socket_id); + arp_timer_init(ctx, socket_id); /* must follow the cycles_ms_shift computation, which it reads */ + ctx->arp_wait = NULL; + + ctx->frag_tbl = rte_ip_frag_table_create(FRAG_BUCKET, + FRAG_ENTRIES_PER_BUCKET, + FRAG_BUCKET * FRAG_ENTRIES_PER_BUCKET, + rte_get_tsc_hz(), + socket_id); + if (ctx->frag_tbl == NULL) +
rte_panic("Failed to create ip defrag table"); + + PERCPU_MIB = &ctx->mib; +} +/* ctx_seq: 0 = nothing allocated yet, 1 = first ctx created but not handed out, 2 = first ctx handed out. */ +static int ctx_seq; +static rte_spinlock_t ctx_lock = RTE_SPINLOCK_INITIALIZER; +/* Allocate and initialise a ctx (or hand out default_ctx on the second call) and return its index in ctx_array. */ +uint8_t +glue_ctx_alloc(void) +{ + uint32_t socket_id; + struct glue_ctx *ctx; + + /* FIXME: we need a finer-grained lock */ + rte_spinlock_lock(&ctx_lock); + + GLUE_LOG(INFO, "allocate ctx: %d", ctx_seq); + if (ctx_seq == 0) + /* Called from constructor init() */ + ctx_seq = 1; + else if (ctx_seq == 1) { + /* Called from first epoll_create() or poll() */ + ctx_seq = 2; + ctx = default_ctx; + goto unlock; + } + + if (nb_ctx >= MAX_NB_CTX) + rte_panic("Exceed the max number of ctx"); + + ctx = &ctx_array[nb_ctx++]; + GLUE_LOG(INFO, "%u ctx allocated, and will init", nb_ctx); + + socket_id = get_socket_id(); + + glue_ctx_init(ctx, socket_id); + + /* reconfigure the "physical" port whenever # of ctx changes */ + port_reconfig(); + + if (ctx == default_ctx) { + loopback_dst_init(ctx); + + ctx->lb_port_id = create_loopback(socket_id); + GLUE_LOG(INFO, "loopback port_id: %u", ctx->lb_port_id); + } + + rte_eth_macaddr_get(ctx->port_id, &ctx->mac); + + tle_ctx_init(ctx, socket_id); + +unlock: + rte_spinlock_unlock(&ctx_lock); + return ctx - ctx_array; +} +/* Only un-claims the single ctx (ctx_seq 2 -> 1, nothing is freed); any other situation panics. Does not take ctx_lock. */ +void +glue_ctx_free(struct glue_ctx *ctx __rte_unused) +{ + if (nb_ctx == 1 && ctx_seq == 2) { + GLUE_LOG(INFO, "free ctx"); + ctx_seq = 1; + return; + } + + rte_panic("close epoll fd on running is not supported\n"); +} +/* Return the ctx bound to (port_id, queue_id), default_ctx for port 1, or NULL when none matches. */ +struct glue_ctx * +glue_ctx_lookup(uint16_t port_id, uint16_t queue_id) +{ + int i; + + if (port_id == 1) /* loopback */ + return default_ctx; + + for (i = 0; i < nb_ctx; i++) { + if (ctx_array[i].port_id == port_id && + ctx_array[i].queue_id == queue_id) + return &ctx_array[i]; + } + + return NULL; +} diff --git a/lib/libtle_glue/ctx.h b/lib/libtle_glue/ctx.h new file mode 100644 index 0000000..e78b68f --- /dev/null +++ b/lib/libtle_glue/ctx.h @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _TLE_GLUE_SOCK_H_ +#define _TLE_GLUE_SOCK_H_ + +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include "config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define DPDK_IP "DPDK_IP" +#define DPDK_IP_DEF "0.0.0.0" +#define DPDK_IP_MASK "DPDK_IP_MASK" +#define DPDK_IP_MASK_DEF "16" +#define DPDK_IP_GATEWAY "DPDK_IP_GATEWAY" +#define DPDK_IP_GATEWAY_DEF "0.0.0.0" +#define DPDK_IPV6 "DPDK_IPV6" +#define DPDK_IPV6_DEF "::" +#define DPDK_IPV6_MASK "DPDK_IPV6_MASK" +#define DPDK_IPV6_MASK_DEF "64" +#define DPDK_IPV6_GATEWAY "DPDK_IPV6_GATEWAY" +#define DPDK_IPV6_GATEWAY_DEF "::" + +struct arp_entry { + struct tle_dest dst; + uint8_t inuse; + uint8_t req_time; + void* timer; +}; + +struct glue_ctx { + struct tle_ctx *tcp_ctx; + struct tle_dev *tcp_dev; + struct tle_dev *lb_tcp_dev; + struct tle_ctx *udp_ctx; + struct tle_dev *udp_dev; + struct tle_dev *lb_udp_dev; + + struct tle_evq *ereq; + struct tle_evq *rxeq; + struct tle_evq *txeq; + + uint16_t port_id; + uint16_t queue_id; + uint16_t lb_port_id; + + struct { + uint8_t ipv4_ml; + uint8_t ipv6_ml; + }; + + struct ether_addr mac; + struct rte_mbuf *arp_wait; + struct tle_timer_wheel *arp_tmw; + uint32_t cycles_ms_shift; /* to convert from cycles to ms */ + + struct { + uint32_t ipv4; + struct in_addr ipv4_gw; + bool lo4_enabled; + + uint32_t arp4_num; + struct arp_entry *arp4; + struct 
rte_hash *arp_hash; + }; + + struct { + struct in6_addr ipv6; + struct in6_addr ipv6_gw; + bool lo6_enabled; + + uint32_t arp6_num; + struct arp_entry *arp6; + struct rte_hash *arp6_hash; + }; + + struct { + rte_spinlock_t frag_lock; + struct rte_ip_frag_tbl *frag_tbl; + struct rte_ip_frag_death_row frag_dr; + }; + + struct tle_dest lb_dst; + struct tle_dest lb_dst_v6; + + struct tle_mib mib; +} __rte_cache_aligned; + +extern int nb_ctx; +extern struct glue_ctx *default_ctx; +extern struct glue_ctx ctx_array[MAX_NB_CTX]; + +RTE_DECLARE_PER_LCORE(struct glue_ctx *, glue_ctx); + +static inline struct glue_ctx * +get_ctx(void) +{ + if (RTE_PER_LCORE(glue_ctx)) + return RTE_PER_LCORE(glue_ctx); + return default_ctx; +} + +static inline uint8_t +get_cid(void) +{ + return get_ctx() - ctx_array; +} + +uint8_t glue_ctx_alloc(void); + +struct glue_ctx * glue_ctx_lookup(uint16_t port_id, uint16_t queue_id); + +void glue_ctx_free(struct glue_ctx *ctx); + +#ifdef __cplusplus +} +#endif + +#endif /* _TLE_GLUE_SOCK_H_ */ diff --git a/lib/libtle_glue/epoll.c b/lib/libtle_glue/epoll.c new file mode 100644 index 0000000..1c8751b --- /dev/null +++ b/lib/libtle_glue/epoll.c @@ -0,0 +1,577 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include + +#include "fd.h" +#include "ctx.h" +#include "sym.h" +#include "log.h" +#include "util.h" +#include "sock.h" +#include "internal.h" +#include "tle_glue.h" +#include "../libtle_l4p/udp_stream.h" +#include "../libtle_l4p/tcp_stream.h" +/* epoll data.u64 tag that marks the rx-queue interrupt fd among the kernel events. */ +#define EPOLL_DATA_SPECIAL 0xFFFFFFFFFFFFFF01 + +/* Apply epoll op on efd to the interrupt fd of an rx queue, tagged EPOLL_DATA_SPECIAL. + * rte_eth_dev_rx_intr_ctl_q is not used as it has its own way to specify event.data. + */ +static int +dev_rx_intr_ctl_q(uint16_t port_id, uint16_t queue_id, int efd, int op, int rx) +{ + int fd, ret; + uint32_t vec, efd_idx; + struct rte_eth_dev *dev; + struct rte_intr_handle *intr_handle; + static struct epoll_event ev = { + .events = EPOLLIN | EPOLLPRI | EPOLLET, + .data = { + .u64 = EPOLL_DATA_SPECIAL, + }, + }; + char buf[32]; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + dev = &rte_eth_devices[port_id]; + if (queue_id >= dev->data->nb_rx_queues) + return -EINVAL; + + if (!dev->intr_handle) + return -ENOTSUP; + + intr_handle = dev->intr_handle; + if (!intr_handle->intr_vec) + return -EPERM; + + vec = intr_handle->intr_vec[queue_id]; + + efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ?
+ (vec - RTE_INTR_VEC_RXTX_OFFSET) : vec; + + fd = intr_handle->efds[efd_idx]; + + if (rx) { + /* almost all devices use eventfd, so drain it; the read result is deliberately ignored */ + ret = read(fd, buf, sizeof(uint64_t)); + RTE_SET_USED(ret); + } + + return k_epoll_ctl(efd, op, fd, &ev); +} +/* Create a userspace epoll fd backed by a ctx and a kernel "shadow" epoll fd; uses the kernel call until fd_table is initialised. */ +int +PRE(epoll_create)(int size) +{ + int epfd; + struct sock *so; + + if (!fd_table_initialized) + return k_epoll_create(size); + + epfd = get_unused_fd(); + if (epfd == -1) { + errno = EMFILE; + return -1; + } + + + so = fd2sock(epfd); + so->cid = glue_ctx_alloc(); + + so->shadow_efd = k_epoll_create(1); + if (so->shadow_efd < 0) + rte_panic("Failed to create shadow efd"); + + if (dev_rx_intr_ctl_q(CTX(so)->port_id, CTX(so)->queue_id, + so->shadow_efd, RTE_INTR_EVENT_ADD, 0) < 0) + rte_panic("Failed to epoll_ctl rxq interrupt fd"); + + so->epoll = 1; + + return epfd; +} +/* Same as epoll_create(); flags are ignored. */ +int +PRE(epoll_create1)(int flags __rte_unused) +{ + return PRE(epoll_create)(1); +} +/* epoll_ctl: kernel epfd -> kernel call; kernel fd on a userspace epfd -> shadow efd; otherwise the interest is kept in so->event. */ +int +PRE(epoll_ctl)(int epfd, int op, int fd, struct epoll_event *event) +{ + struct sock *so_ep; + struct sock *so; + + if (is_kernel_fd(epfd)) { + if (!is_kernel_fd(fd)) + rte_panic("kernel epoll (%d) on an userspace fd: %d", + epfd, fd); + + return k_epoll_ctl(epfd, op, fd, event); + } + + so_ep = fd2sock(epfd); + + if (is_kernel_fd(fd)) { + /* Use a shadow epoll fd for possible kernel I/O events.
*/ + return k_epoll_ctl(so_ep->shadow_efd, op, fd, event); + } + + so = fd2sock(fd); + + if (unlikely(so->cid != so_ep->cid)) + rte_panic("Different ctx %d and %d for epoll fd and socket fd", + so_ep->cid, so->cid); + /* event may legitimately be NULL for EPOLL_CTL_DEL, so do not dereference it blindly */ + GLUE_DEBUG("epoll_ctl: op = %x, fd = %d, event = %x", + op, fd, event ? event->events : 0); + switch (op) { + case EPOLL_CTL_ADD: + if (so->event.events) { + errno = EEXIST; + return -1; + } + +#ifdef LOOK_ASIDE_BACKEND + if (event->events & EPOLLIN) + tle_event_active(&so->rxev, TLE_SEV_DOWN); + if (event->events & EPOLLOUT) + tle_event_active(&so->txev, TLE_SEV_DOWN); +#endif + so->event = *event; + + break; + case EPOLL_CTL_MOD: + if (so->event.events == 0) { + errno = ENOENT; + return -1; + } + +#ifdef LOOK_ASIDE_BACKEND + if (event->events & EPOLLIN) + tle_event_active(&so->rxev, TLE_SEV_DOWN); + else + tle_event_idle(&so->rxev); + if (event->events & EPOLLOUT) + tle_event_active(&so->txev, TLE_SEV_DOWN); + else + tle_event_idle(&so->txev); +#endif + so->event = *event; + break; + case EPOLL_CTL_DEL: + if (so->event.events == 0) { + errno = ENOENT; + return -1; + } + +#ifdef LOOK_ASIDE_BACKEND + if (so->event.events & EPOLLIN) + tle_event_idle(&so->rxev); + if (so->event.events & EPOLLOUT) + tle_event_idle(&so->txev); +#endif + so->event.events = 0; /* an events mask of 0 means "not registered" */ + break; + default: + errno = EINVAL; + return -1; + } + + return 0; +} +/* Move up to num armed events whose socket is interested in `event` from evq into evd[]; returns how many were taken. */ +static inline int32_t +tle_evq_fetch(struct tle_evq *evq, const void *evd[], + uint32_t num, uint32_t event) +{ + uint32_t i, k; + uint32_t polled; + struct tle_event *ev; + struct tle_event *next; + + if (evq->nb_armed == 0) + return 0; + + rte_compiler_barrier(); + + rte_spinlock_lock(&evq->lock); + ev = TAILQ_FIRST(&evq->armed); + for (i = 0, k = 0; i != evq->nb_armed; i++) { + next = TAILQ_NEXT(ev, ql); + polled = ((const struct sock *)ev->data)->event.events; + /* Always report EPOLLHUP, see man epoll_ctl(2) */ + if (polled && ((polled | EPOLLHUP) & event)) { + evd[k++] = ev->data; + TAILQ_REMOVE(&evq->armed, ev, ql); + /* don't
down erev; and assign NULL to data means this + * ev is already removed from the queue, refer to + * tle_event_idle_err(). + */ + if (event != EPOLLHUP) + ev->state = TLE_SEV_DOWN; + else + ev->data = NULL; + } + if (k == num) + break; + ev = next; + } + evq->nb_armed -= k; + rte_spinlock_unlock(&evq->lock); + return k; +} +/* Fetch up to maxevents ready sockets from q and turn them into epoll_event entries, merging other pending event types. */ +static int +evq_drain(struct tle_evq *q, uint32_t event, + struct epoll_event *events, int maxevents) +{ + uint32_t i, n; + struct sock *socks[maxevents]; /* NOTE(review): VLA sized by maxevents; the callers shown here do not check it is > 0 */ + + n = tle_evq_fetch(q, (const void **)(uintptr_t)socks, maxevents, event); + for (i = 0; i < n; ++i) { + events[i].events = event; + events[i].data = socks[i]->event.data; + + /* when EPOLLHUP happens, also return EPOLLIN and EPOLLOUT + * if they are registered, to emulate the behaviour of the + * Linux kernel. + * Some applications (e.g. redis) rely on these events to decide + * what to do next. + */ + if (event & EPOLLHUP) + events[i].events |= (socks[i]->event.events & + (EPOLLIN | EPOLLOUT)); + + /* if multiple events of single socket are triggered, + * return single event with multiple event types rather than + * multiple events. + * + * we drain evq in order of EPOLLOUT -> EPOLLIN -> EPOLLHUP, + * so only need to check event in evq that has not been drained.
+ */ + switch (event) { + case EPOLLOUT: + if ((socks[i]->event.events & EPOLLIN) && + tle_event_state(&socks[i]->rxev) == TLE_SEV_UP) { + tle_event_down(&socks[i]->rxev); + events[i].events |= EPOLLIN; + } + /* fallthrough */ + case EPOLLIN: + if (tle_event_state(&socks[i]->erev) == TLE_SEV_UP) { + rte_spinlock_lock(&socks[i]->erev.head->lock); + if (socks[i]->erev.data != NULL && /* re-checked under the lock */ + tle_event_state(&socks[i]->erev) == TLE_SEV_UP) { + TAILQ_REMOVE(&socks[i]->erev.head->armed, + &socks[i]->erev, ql); + socks[i]->erev.head->nb_armed--; + socks[i]->erev.data = NULL; + } + rte_spinlock_unlock(&socks[i]->erev.head->lock); + events[i].events |= EPOLLHUP; + } + } + + GLUE_DEBUG("event for fd = %d, event = %x", + socks[i]->event.data.fd, event); + } + return n; +} + +#ifdef LOOK_ASIDE_BACKEND +rte_atomic32_t flag_sleep; +/* LOOK_ASIDE variant: sleep on efd unless be_process() found work; never reports kernel events and always returns 0. */ +int +epoll_kernel_wait(struct glue_ctx *ctx, int efd, + struct epoll_event *events, + int maxevents, int timeout, int *rx) +{ + struct epoll_event event; + uint16_t port_id = ctx->port_id; + uint16_t queue_id = ctx->queue_id; + + RTE_SET_USED(events); + RTE_SET_USED(maxevents); + RTE_SET_USED(rx); /* NOTE(review): *rx is never written in this variant */ + + rte_eth_dev_rx_intr_enable(port_id, queue_id); + + /* TODO: timeout shall be limited by the latest tcp timer */ + + if (be_process(ctx) > 0) /* use this way to avoid concurrency */ { + /* Do nothing */ + } else + sleep_with_lock(efd, &event, 1, timeout); + + rte_eth_dev_rx_intr_disable(port_id, queue_id); + /* We don't have kernel events to report, so just return zero */ + return 0; +} +#else +int +epoll_kernel_wait(struct glue_ctx *ctx, int efd, + struct epoll_event *events, + int maxevents, int timeout, int *rx) +{ + int i, j, rc; + int flag_tmp = 0; + uint16_t port_id = ctx->port_id; + uint16_t queue_id = ctx->queue_id; +#define LEAST_EVENTS 8 + struct epoll_event s_events[LEAST_EVENTS]; + struct epoll_event *r_events; + int r_maxevents; + int fastpath = 0; + + *rx = 0; + + if (efd == -1) { /* no shadow efd supplied: use a temporary one, closed before returning */ + flag_tmp = 1; + efd = k_epoll_create(1); + if (efd < 0) +
rte_panic("Failed to create tmp efd"); + } + + if (stopped) { + rc = k_epoll_pwait(efd, events, maxevents, timeout, NULL); + goto check; + } + + if (maxevents < LEAST_EVENTS) { /* small request: wait on a local buffer with one spare slot for the rxq event */ + r_events = s_events; + r_maxevents = maxevents + 1; + } else { + r_events = events; + r_maxevents = maxevents; + } + + if (flag_tmp && + dev_rx_intr_ctl_q(port_id, queue_id, efd, RTE_INTR_EVENT_ADD, 0) < 0) + /* TODO: fall back to busy polling */ + rte_panic("Failed to enable rxq interrupt"); + + rte_eth_dev_rx_intr_enable(port_id, queue_id); + + /* TODO: timeout shall be limited by the latest tcp timer */ + + if (timeout != 0 && be_process(ctx) > 0) { + /* use this way to avoid concurrency */ + rc = 0; + fastpath = 1; + } else + rc = sleep_with_lock(efd, r_events, r_maxevents, timeout); + + rte_eth_dev_rx_intr_disable(port_id, queue_id); + + /* filter out the rxq event; the entries after it are shifted down by one */ + for (i = 0, j = 0; i < rc; ++i) { + if (r_events[i].data.u64 == EPOLL_DATA_SPECIAL) { + *rx = true; + if (i + 1 < rc) { + memmove(&r_events[j], &r_events[i+1], + (rc-i-1) * sizeof(*events)); + } + rc -= 1; + break; + } else { + if (i != j) + r_events[j] = r_events[i]; + j++; + } + } + rc = RTE_MIN(rc, maxevents); /* the spare slot is only for the rxq event: never hand back more than maxevents */ + if (rc > 0 && maxevents < LEAST_EVENTS) + memcpy(events, r_events, rc * sizeof(*events)); + + if (flag_tmp) + dev_rx_intr_ctl_q(port_id, queue_id, efd, + RTE_INTR_EVENT_DEL, *rx); + + if (fastpath) + *rx = true; +check: + if (flag_tmp) + close(efd); + + return rc; +} +#endif + +/* As long as there are packets to process we don't sleep; we poll + * for some number of idle iterations before going to sleep. + * + * TODO: change to wait for a period of time? + */ +#define IDLE_ITERATIONS 5 +/* Shared wait loop: drain tx/rx/err event queues, poll the backend while busy, then sleep in epoll_kernel_wait(). */ +int +poll_common(struct glue_ctx *ctx, struct epoll_event *events, + int maxevents, int timeout, int shadow_efd) +{ + int rx; + int total = 0; + int idle = IDLE_ITERATIONS; + +again: + /* We will start with send, then recv, and last err queue, as we want + * to serve existing connections first, then new connections, and + * lastly, the wrong connections.
+ */ + + /* 0. send evq */ + total += evq_drain(ctx->txeq, EPOLLOUT, + events + total, maxevents-total); + if (total == maxevents) + return total; + + /* 1. recv evq */ + total += evq_drain(ctx->rxeq, EPOLLIN, + events + total, maxevents-total); + if (total == maxevents) + return total; + + /* 2. err evq */ + total += evq_drain(ctx->ereq, EPOLLHUP, + events + total, maxevents-total); + + if (total > 0) + return total; + + if (idle > 0) { + if (be_process(ctx) == 0) + idle--; + else + idle = IDLE_ITERATIONS; + goto again; + } + + if (timeout == 0) + return 0; + + /* Setup rxq interrupt mode, and check kernel I/O events */ + total = epoll_kernel_wait(ctx, shadow_efd, events, + maxevents, timeout, &rx); + + /* Kernel I/O events are available (total > 0) or + * timeout (total < 0) or something bad happens. + */ + if (total != 0) + return total; + + /* Check userspace I/O events */ + idle = IDLE_ITERATIONS; + be_process(ctx); + goto again; +} + +int +PRE(epoll_wait)(int epfd, struct epoll_event *events, + int maxevents, int timeout) +{ + struct sock *so; + + if (is_kernel_fd(epfd)) + return k_epoll_pwait(epfd, events, maxevents, timeout, NULL); + + so = fd2sock(epfd); + + /* thread <> context binding happens here */ + if (RTE_PER_LCORE(glue_ctx) == NULL) + RTE_PER_LCORE(glue_ctx) = CTX(so); + + return poll_common(CTX(so), events, maxevents, timeout, so->shadow_efd); +} + +int +PRE(epoll_pwait)(int epfd, struct epoll_event *events, + int maxevents, int timeout, const sigset_t *sigmask) +{ + if (sigmask != NULL) { + rte_panic("epoll_pwait with signal is not supported"); + } + + return epoll_wait(epfd, events, maxevents, timeout); +} + +int +fd_ready(int fd, int events) +{ + int ret = 0; + struct sock *so = fd2sock(fd); + + if (unlikely(!so->s)) { + if (tle_event_state(&so->erev) == TLE_SEV_UP) + /* socket has been shutdown */ + return events | EPOLLHUP; + else /* socket is not set up yet */ + return 0; + } + + if (unlikely(IS_TCP(so) && + TCP_STREAM(so->s)->tcb.state == 
TCP_ST_CLOSED)) { + return events | EPOLLHUP | EPOLLERR; + } + + if (tle_event_state(&so->erev) == TLE_SEV_UP) + ret |= EPOLLHUP; + + if (events & EPOLLIN) { /* readable: leftover rx data or a non-empty rx ring */ + if (so->rx_left || + (IS_TCP(so) && rte_ring_count(TCP_STREAM(so->s)->rx.q) > 0) || + (IS_UDP(so) && rte_ring_count(UDP_STREAM(so->s)->rx.q) > 0)) + ret |= EPOLLIN; + } + + if (events & EPOLLOUT) { /* writable: TCP at/after ESTABLISHED with free tx slots, or UDP with entries in tx.drb.r */ + if ((IS_TCP(so) && + TCP_STREAM(so->s)->tcb.state >= TCP_ST_ESTABLISHED && + rte_ring_free_count(TCP_STREAM(so->s)->tx.q) > 0) || + (IS_UDP(so) && + rte_ring_count(UDP_STREAM(so->s)->tx.drb.r) > 0)) + ret |= EPOLLOUT; + } + + return ret; +} +/* Fill mibs[] with default_mib plus every ctx's counters: TCP at [0, TCP_MIB_MAX), UDP right after them. */ +void +v_get_stats_snmp(unsigned long mibs[]) +{ + int i, j, k; + + memcpy(mibs, &default_mib, sizeof(default_mib)); + + for (i = 0; i < nb_ctx; ++i) { + for (j = 0; j < TCP_MIB_MAX; ++j) + mibs[j] += ctx_array[i].mib.tcp.mibs[j]; + /* j == TCP_MIB_MAX here, so the UDP counters land after the TCP ones */ + for (k = 0; k < UDP_MIB_MAX; ++k) + mibs[j+k] += ctx_array[i].mib.udp.mibs[k]; + } +} diff --git a/lib/libtle_glue/fd.c b/lib/libtle_glue/fd.c new file mode 100644 index 0000000..cc855f9 --- /dev/null +++ b/lib/libtle_glue/fd.c @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#include +#include +#include + +#include "fd.h" +#include "log.h" +#include "util.h" +#include "config.h" + +bool fd_table_initialized; +/* fd_base starts at INT_MAX so that every fd counts as a kernel fd until fd_init() has run. */ +struct fd_table fd_table = { .fd_base = INT_MAX, }; +/* Return the soft RLIMIT_NOFILE limit clamped to INT_MAX, or GLUE_BASE_FD when getrlimit() fails. */ +static int +get_ulimit_nofile(void) +{ + struct rlimit rlim; + +#define GLUE_BASE_FD 1024 + if (getrlimit(RLIMIT_NOFILE, &rlim) < 0) + return GLUE_BASE_FD; + + return (int)RTE_MIN(rlim.rlim_cur, (rlim_t)INT_MAX); /* soft limit; clamped so RLIM_INFINITY cannot wrap the int */ +} +/* Reserve the upper part of the fd range: at most half of the limit, capped by the maximum number of streams. */ +static void +fd_num_set(int *fd_base, int *fd_num) +{ + int limit = get_ulimit_nofile(); + + /* fix me: alignment of power of two */ + /* fix me: use dup2 to occupy these fds */ + *fd_num = limit / 2; + *fd_num = RTE_MIN(MAX_STREAMS_PER_CORE * 2 * MAX_NB_CTX, *fd_num); + + *fd_base = limit - *fd_num; + GLUE_LOG(INFO, "fd_base = %d, fd_num = %d", *fd_base, *fd_num); +} +/* Mempool callback: give each new sock object its fd (obj_idx + fd_base) and record it in fd_table.socks. */ +static void +add_fd(struct rte_mempool *mp __rte_unused, void *opaque __rte_unused, + void *obj, unsigned obj_idx) +{ + ((struct sock *)obj)->fd = obj_idx + fd_table.fd_base; + fd_table.socks[obj_idx] = obj; +} +/* Build the global fd table and its backing sock mempool; panics on any failure. */ +void +fd_init(void) +{ + int ret; + size_t sz; + uint32_t socket_id; + int fd_base, fd_num; + struct rte_mempool *mp = NULL; + char name[RTE_MEMPOOL_NAMESIZE]; + + socket_id = get_socket_id(); + + fd_num_set(&fd_base, &fd_num); + + sz = sizeof(fd_table.socks[0]) * fd_num; + fd_table.socks = rte_zmalloc_socket("fdtable", sz, + RTE_CACHE_LINE_SIZE, socket_id); + if (fd_table.socks == NULL) { + GLUE_LOG(ERR, "Failed to malloc fd table"); + goto err; + } + + snprintf(name, RTE_MEMPOOL_NAMESIZE, "mp_fd_%d_%d", fd_base, fd_num); + mp = rte_mempool_create_empty(name, fd_num - 1, sizeof(struct sock), + 32, 0, socket_id, MEMPOOL_F_DYNAMIC); + if (mp == NULL) { + GLUE_LOG(ERR, "Failed to create mp for fd table"); + goto err; + } + + GLUE_LOG(INFO, "sizeof(struct sock): %zu, elt_size of fd table = %u", + sizeof(struct sock), mp->elt_size); + + ret = rte_mempool_set_ops_byname(mp, "ring_mp_mc", NULL); + if (ret != 0) { + GLUE_LOG(ERR, "Failed to set mp ops: %d", ret); +
goto err; + } + + rte_mempool_set_dynamic_size(mp, 1024); + rte_mempool_set_dynamic_cb(mp, add_fd); + + fd_table.mp = mp; + fd_table.fd_base = fd_base; + fd_table.fd_num = fd_num; + + /* populate only after fd_table is set: add_fd() reads fd_table.fd_base and fd_table.socks */ + ret = rte_mempool_populate_default(mp); + if (ret < 0) { + GLUE_LOG(ERR, "Failed to populate mp: %d", ret); + goto err; + } + + fd_table_initialized = true; + + return; +err: + rte_mempool_free(mp); + rte_panic("Failed to init fd_table"); +} diff --git a/lib/libtle_glue/fd.h b/lib/libtle_glue/fd.h new file mode 100644 index 0000000..d0ac4fe --- /dev/null +++ b/lib/libtle_glue/fd.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#ifndef _TLE_GLUE_FD_H_ +#define _TLE_GLUE_FD_H_ + +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include "log.h" +#include "sock.h" + +#ifdef __cplusplus +extern "C" { +#endif +/* Table mapping userspace fds (fd_base and above) to their struct sock. */ +struct fd_table { + int fd_base; /* The minimum fd, 64 aligned */ + int fd_num; /* The number of fds, 64 aligned */ + struct rte_mempool *mp; /* O(1) get and put */ + struct sock **socks; /* indexed by fd - fd_base */ +}; + +extern bool fd_table_initialized; +extern struct fd_table fd_table; +/* Map a userspace fd to its struct sock; no range check is done here. */ +static inline struct sock * +fd2sock(int fd) +{ + return fd_table.socks[fd - fd_table.fd_base]; +} +/* Return the fd stored in so. */ +static inline int +sock2fd(struct sock *so) +{ + return so->fd; +} +/* Take a sock from the mempool, mark it valid and return its fd; -1 when the pool is exhausted. */ +static inline int +get_unused_fd(void) +{ + struct sock *so; + + if (unlikely(rte_mempool_get(fd_table.mp, (void **)&so) < 0)) { + GLUE_LOG(ERR, "FDs have been exhausted"); + return -1; + } + + so->valid = 1; + return sock2fd(so); +} +/* Set ev to TLE_SEV_IDLE, unlinking it from the armed list only if it is UP and its data is still set. */ +static inline void +tle_event_idle_err(struct tle_event *ev) +{ + struct tle_evq *q; + + if (ev->state == TLE_SEV_IDLE) + return; + + q = ev->head; + rte_compiler_barrier(); + + rte_spinlock_lock(&q->lock); + if (ev->state == TLE_SEV_UP && ev->data) { + TAILQ_REMOVE(&q->armed, ev, ql); + q->nb_armed--; + } + ev->state = TLE_SEV_IDLE; + rte_spinlock_unlock(&q->lock); +} +/* Return the sock behind fd to the mempool. */ +static inline void +put_free_fd(int fd) +{ + struct sock *so = fd2sock(fd); + + rte_mempool_put(fd_table.mp, so); +} +/* fds below fd_base are treated as kernel fds. */ +static inline bool +is_kernel_fd(int fd) +{ + return fd < fd_table.fd_base; +} + +void fd_init(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _TLE_GLUE_FD_H_ */ diff --git a/lib/libtle_glue/gateway.h b/lib/libtle_glue/gateway.h new file mode 100644 index 0000000..29de6b1 --- /dev/null +++ b/lib/libtle_glue/gateway.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2019 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _TLE_GATEWAY_H_ +#define _TLE_GATEWAY_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +static inline bool +is_ipv4_loopback_addr(in_addr_t addr, struct glue_ctx *ctx) +{ + if (addr == ctx->ipv4 || addr == htonl(INADDR_LOOPBACK)) + return true; + else + return false; +} + +static inline bool +is_ipv6_loopback_addr(const struct in6_addr *addr, struct glue_ctx *ctx) +{ + if (memcmp(addr, &ctx->ipv6, sizeof(struct in6_addr)) == 0 || + IN6_IS_ADDR_LOOPBACK(addr) || + (IN6_IS_ADDR_V4COMPAT(addr) && + addr->__in6_u.__u6_addr32[3] == htonl(INADDR_LOOPBACK)) || + (IN6_IS_ADDR_V4MAPPED(addr) && + addr->__in6_u.__u6_addr32[3] == htonl(INADDR_LOOPBACK))) + return true; + else + return false; +} + +static inline const struct in_addr * +ipv4_gateway_lookup(void *data, const struct in_addr *addr) +{ + uint8_t ls; + struct glue_ctx *ctx = data; + + if (is_ipv4_loopback_addr(addr->s_addr, ctx)) + return addr; + + ls = 32 - ctx->ipv4_ml; + if ((addr->s_addr << ls) == (ctx->ipv4 << ls)) + return addr; + + if (ctx->ipv4_gw.s_addr != 0) + return &ctx->ipv4_gw; + + return addr; +} + +static inline const struct in6_addr * +ipv6_gateway_lookup(void *data, const struct in6_addr *addr) +{ + uint8_t ls; + struct glue_ctx *ctx = data; + + if (is_ipv6_loopback_addr(addr, ctx)) + return addr; + + if (ctx->ipv6_ml <= 64) { + ls = 64 - ctx->ipv6_ml; + if ((*(const uint64_t*)addr << ls) == + (*(const uint64_t*)&ctx->ipv6 << ls)) + return addr; + } else if (*(const uint64_t*)addr == *(const uint64_t*)&ctx->ipv6) { + ls = 128 - ctx->ipv6_ml; + if ((*((const 
uint64_t*)addr + 1) << ls) == + (*((const uint64_t*)&ctx->ipv6 + 1) << ls)) + return addr; + } + + if (!IN6_IS_ADDR_UNSPECIFIED(&ctx->ipv6_gw)) + return &ctx->ipv6_gw; + + return addr; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _TLE_GATEWAY_H_ */ diff --git a/lib/libtle_glue/icmp.c b/lib/libtle_glue/icmp.c new file mode 100644 index 0000000..aba1c4b --- /dev/null +++ b/lib/libtle_glue/icmp.c @@ -0,0 +1,297 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include + +#include "log.h" +#include "ctx.h" +#include "internal.h" + +#define ICMP_ECHOREPLY 0 /* Echo Reply */ +#define ICMP_ECHO 8 /* Echo Request */ +#define ICMP_TIMESTAMP 13 /* Timestamp Request */ +#define ICMP_TIMESTAMPREPLY 14 /* Timestamp Reply */ + +/* Codes for TIME_EXCEEDED. 
*/ +#define ICMP_EXC_TTL 0 /* TTL count exceeded */ +#define ICMP_EXC_FRAGTIME 1 /* Fragment Reass time exceeded */ + +/* Parameters used to convert the timespec values */ +#define SECONDS_PER_DAY 86400L +#define MSEC_PER_SEC 1000L +#define USEC_PER_MSEC 1000L +#define NSEC_PER_USEC 1000L +#define NSEC_PER_MSEC (NSEC_PER_USEC * USEC_PER_MSEC) + +#define IS_IPV4_BCAST(x) ((x) == (uint32_t)0xFFFFFFFF) + +struct icmp_pkt { + struct icmp_hdr icmp_h; + uint32_t times[3]; +}; + +/* Return remainder for ``dividend / divisor`` */ +static inline uint32_t +div_uint64_rem(uint64_t dividend, uint32_t divisor) +{ + return dividend % divisor; +} + +/* Return milliseconds since midnight (UTC) in network byte order. */ +static uint32_t +current_timestamp(void) +{ + struct timespec ts; + uint32_t msecs; + uint32_t secs; + + (void)clock_gettime(CLOCK_REALTIME, &ts); + + /* Get secs since midnight. */ + secs = div_uint64_rem(ts.tv_sec, SECONDS_PER_DAY); + /* Convert to msecs. */ + msecs = secs * MSEC_PER_SEC; + /* Convert nsec to msec. */ + msecs += (uint32_t)ts.tv_nsec / NSEC_PER_MSEC; + + /* Convert to network byte order. */ + return rte_cpu_to_be_32(msecs); +} + +/* + * Process the checksum of an ICMP packet. The checksum field must be set + * to 0 by the caller. + */ +static uint16_t +icmp_cksum(const struct icmp_hdr *icmp, uint32_t data_len) +{ + uint16_t cksum; + + cksum = rte_raw_cksum(icmp, sizeof(struct icmp_hdr) + data_len); + return (cksum == 0xffff) ? cksum : ~cksum; +} + +/** + * Receive and handle an ICMP packet. + * + * @param ctx + * The pointer to the glue context. + * @param pkt + * The pointer to the raw packet data. + * @param l2_len + * The the size of the l2 header. + * @return + * MUST return NULL now. 
:-)
+ */
+struct rte_mbuf *
+icmp_recv(struct glue_ctx *ctx, struct rte_mbuf *pkt,
+	  uint32_t l2_len, uint32_t l3_len)
+{
+	struct ether_addr eth_addr;
+	struct icmp_pkt *icmp_pkt;
+	struct ether_hdr *eth_h;
+	struct icmp_hdr *icmp_h;
+	struct ipv4_hdr *ip_h;
+	uint32_t ip_addr;
+	uint32_t cksum;
+	uint32_t icmp_len;
+	uint16_t pad_len;
+	char *pad;
+
+	/* The ICMP header is read in place: it must sit in the first segment. */
+	if (rte_pktmbuf_data_len(pkt) <
+			l2_len + l3_len + sizeof(struct icmp_hdr))
+		goto drop_pkt;
+	icmp_len = rte_pktmbuf_data_len(pkt) - l2_len - l3_len;
+
+	eth_h = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+	ip_h = (struct ipv4_hdr *) ((char *)eth_h + l2_len);
+
+	icmp_h = (struct icmp_hdr *)((char *)ip_h + l3_len);
+	if (icmp_h->icmp_type != IP_ICMP_ECHO_REQUEST &&
+			icmp_h->icmp_type != ICMP_TIMESTAMP)
+		goto drop_pkt;
+
+	icmp_pkt = (struct icmp_pkt *)icmp_h;
+
+	/* The reply goes back to the sender: swap the MAC addresses. */
+	ether_addr_copy(&eth_h->s_addr, &eth_addr);
+	ether_addr_copy(&eth_h->d_addr, &eth_h->s_addr);
+	ether_addr_copy(&eth_addr, &eth_h->d_addr);
+
+	/*
+	 * Similar to Linux implementation, we silently drop the broadcast or
+	 * multicast ICMP packets.
+	 *
+	 * RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be
+	 *  silently ignored.
+	 * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently
+	 *  discarded if to broadcast/multicast.
+	 */
+	ip_addr = rte_be_to_cpu_32(ip_h->dst_addr);
+	if (IS_IPV4_MCAST(ip_addr) || IS_IPV4_BCAST(ip_addr))
+		goto drop_pkt;
+
+	/* ... and the IP addresses. */
+	ip_addr = ip_h->src_addr;
+	ip_h->src_addr = ip_h->dst_addr;
+	ip_h->dst_addr = ip_addr;
+
+	if (icmp_h->icmp_type == IP_ICMP_ECHO_REQUEST &&
+			icmp_h->icmp_code == 0) {
+
+		/* Must clear checksum field before calling the helper. */
+		ip_h->hdr_checksum = 0;
+		ip_h->hdr_checksum = rte_ipv4_cksum(ip_h);
+
+		icmp_h->icmp_type = IP_ICMP_ECHO_REPLY;
+		icmp_h->icmp_code = 0;
+
+		/*
+		 * Fix me: the data part of an ICMP echo request/reply
+		 * message is implementation specific, we don't know
+		 * how to verify or calculate the checksum.
+		 *
+		 * Need to see BSD or LINUX implementation.
+		 */
+		/*
+		 * Only the type field changed, so the checksum is updated
+		 * incrementally: remove the old type/code word, add the new
+		 * one and fold the carries back in.
+		 */
+		cksum = ~icmp_h->icmp_cksum & 0xffff;
+		cksum += ~rte_cpu_to_be_16(IP_ICMP_ECHO_REQUEST << 8) & 0xffff;
+		cksum += rte_cpu_to_be_16(IP_ICMP_ECHO_REPLY << 8);
+		cksum = (cksum & 0xffff) + (cksum >> 16);
+		cksum = (cksum & 0xffff) + (cksum >> 16);
+		icmp_h->icmp_cksum = ~cksum;
+
+	} else if (icmp_h->icmp_type == ICMP_TIMESTAMP &&
+			icmp_h->icmp_code == 0) {
+
+		/*
+		 * The reply writes times[1] and times[2]; refuse requests
+		 * too short to hold the three timestamp fields.
+		 */
+		if (icmp_len < sizeof(struct icmp_pkt))
+			goto drop_pkt;
+
+		/*
+		 * RFC 1122: 3.2.2.8 MAY implement ICMP timestamp requests.
+		 *  SHOULD be in the kernel for minimum random latency.
+		 *  MUST be accurate to a few minutes.
+		 *  MUST be updated at least at 15Hz.
+		 */
+		icmp_h->icmp_type = ICMP_TIMESTAMPREPLY;
+		icmp_h->icmp_code = 0;
+		icmp_pkt->times[1] = current_timestamp();
+		icmp_pkt->times[2] = icmp_pkt->times[1];
+
+		icmp_h->icmp_cksum = 0;
+		/* the data part of an ICMP timestamp reply is 12 bytes. */
+		icmp_h->icmp_cksum = icmp_cksum(icmp_h, 12);
+	} else
+		goto drop_pkt;
+
+	/* Pad short frames, zeroing the pad so no stale mbuf data is sent. */
+	if (pkt->pkt_len < ETHER_MIN_LEN) {
+		pad_len = ETHER_MIN_LEN - pkt->pkt_len;
+		pad = rte_pktmbuf_append(pkt, pad_len);
+		if (pad != NULL)
+			memset(pad, 0, pad_len);
+	}
+
+	/*
+	 * A packet the driver did not accept still belongs to us and must
+	 * be freed here, otherwise it is leaked.
+	 */
+	if (rte_eth_tx_burst(ctx->port_id, ctx->queue_id, &pkt, 1) == 0) {
+		GLUE_LOG(DEBUG, "Failed to send ICMP reply");
+		goto drop_pkt;
+	}
+
+	GLUE_LOG(DEBUG, "Send ICMP echo reply OK");
+
+	return NULL;
+
+drop_pkt:
+	rte_pktmbuf_free(pkt);
+	return NULL;
+}
+
+/**
+ * Receive and handle an ICMPv6 packet.
+ *
+ * @param ctx
+ *   The pointer to the glue context.
+ * @param pkt
+ *   The pointer to the raw packet data.
+ * @param l2_len
+ *   The size of the l2 header.
+ * @param l3_len
+ *   The size of the l3 header.
+ * @return
+ *   MUST return NULL now.
:-) + */ +struct rte_mbuf * +icmp6_recv(struct glue_ctx *ctx, struct rte_mbuf *pkt, + uint32_t l2_len, uint32_t l3_len) +{ + struct ether_addr eth_addr; + struct ether_hdr *eth_h; + struct icmp6_hdr *icmp6_h; + struct ipv6_hdr *ipv6_h; + struct in6_addr ipv6_addr; + uint32_t cksum; + + eth_h = rte_pktmbuf_mtod(pkt, struct ether_hdr *); + ipv6_h = (struct ipv6_hdr *) ((char *)eth_h + l2_len); + + icmp6_h = (struct icmp6_hdr *)((char *)ipv6_h + l3_len); + + /* NDP pkt */ + if ((icmp6_h->icmp6_type == ND_NEIGHBOR_SOLICIT || + icmp6_h->icmp6_type == ND_NEIGHBOR_ADVERT) && + icmp6_h->icmp6_code == 0) + return ndp_recv(ctx, pkt, l2_len, l3_len); + + /* only support ECHO now, other types of pkts are dropped */ + if ((icmp6_h->icmp6_type != ICMP6_ECHO_REQUEST && + icmp6_h->icmp6_type != ICMP6_ECHO_REPLY) || + icmp6_h->icmp6_code != 0) + goto drop_pkt; + + ether_addr_copy(ð_h->s_addr, ð_addr); + ether_addr_copy(ð_h->d_addr, ð_h->s_addr); + ether_addr_copy(ð_addr, ð_h->d_addr); + + /* + * Now, we silently drop the anycast or multicast ICMP pakcets. + * But it does not conform to RFC 4443. Maybe fix it latter. + * + * RFC 4443: 4.2 An Echo Reply SHOULD be sent in response to an + * Echo Request message sent to an IPv6 multicast or anycast address. + * In this case, thesource address of the reply MUST be a unicast + * address belonging to the interface on which the Echo Request + * message was received. 
+ */ + switch (icmp6_h->icmp6_type) { + case ICMP6_ECHO_REQUEST: + if (memcmp(ipv6_h->dst_addr, &ctx->ipv6, + sizeof(struct in6_addr)) != 0) + goto drop_pkt; + + rte_memcpy(&ipv6_addr, ipv6_h->src_addr, + sizeof(struct in6_addr)); + rte_memcpy(ipv6_h->src_addr, ipv6_h->dst_addr, + sizeof(struct in6_addr)); + rte_memcpy(ipv6_h->dst_addr, &ipv6_addr, + sizeof(struct in6_addr)); + + icmp6_h->icmp6_type = ICMP6_ECHO_REPLY; + + cksum = ~icmp6_h->icmp6_cksum & 0xffff; + cksum += ~rte_cpu_to_be_16(ICMP6_ECHO_REQUEST << 8) & 0xffff; + cksum += rte_cpu_to_be_16(ICMP6_ECHO_REPLY << 8); + cksum = (cksum & 0xffff) + (cksum >> 16); + cksum = (cksum & 0xffff) + (cksum >> 16); + icmp6_h->icmp6_cksum = ~cksum; + + break; + default: + goto drop_pkt; + } + + if (pkt->pkt_len < ETHER_MIN_LEN) + rte_pktmbuf_append(pkt, ETHER_MIN_LEN - pkt->pkt_len); + + if (rte_eth_tx_burst(ctx->port_id, ctx->queue_id, &pkt, 1)) + GLUE_LOG(DEBUG, "Send ICMP echo reply OK"); + + return NULL; + +drop_pkt: + rte_pktmbuf_free(pkt); + return NULL; +} diff --git a/lib/libtle_glue/init.c b/lib/libtle_glue/init.c new file mode 100644 index 0000000..d845ef8 --- /dev/null +++ b/lib/libtle_glue/init.c @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include + +#include +#include +#include +#include + +#include "util.h" +#include "fd.h" +#include "ctx.h" +#include "sym.h" +#include "log.h" +#include "internal.h" +#include "tle_glue.h" + +void +glue_init1(int argc, char **argv) +{ + GLUE_LOG(INFO, "init: DPDK and fd table..."); + + if (rte_eal_init(argc, argv) < 0) + rte_panic("Failed to init DPDK"); + + fd_init(); +} + +static void __attribute__((constructor(1000))) +glue_init(void) +{ + char *p; + int i, err, argc = 0; + char **argv = NULL, **argv_to_release = NULL; + char *vnic, *params, *no_huge; + cpu_set_t cpuset; + pthread_t tid = pthread_self(); + + symbol_init(); + +#define DPDK_PARAMS "DPDK_PARAMS" + params = getenv(DPDK_PARAMS); +#define DPDK_NO_HUGE "DPDK_NO_HUGE" + no_huge = getenv(DPDK_NO_HUGE); +#define DPDK_VNIC "DPDK_VNIC" + vnic = getenv(DPDK_VNIC); + + if (params == NULL && no_huge == NULL && vnic == NULL) + return; + + argv = grow_argv(argv, argc, 1); + argv[argc++] = xstrdup("userspace-stack"); + + /* Get the main thread affinity */ + CPU_ZERO(&cpuset); + err = pthread_getaffinity_np(tid, sizeof(cpu_set_t), &cpuset); + if (!err) { + for (i = 0; i < CPU_SETSIZE; i++) { + if (CPU_ISSET(i, &cpuset)) { + argv = grow_argv(argv, argc, 2); + argv[argc++] = xstrdup("-l"); + argv[argc++] = xasprintf("%d", i); + i = CPU_SETSIZE; + } + } + } else { + argv = grow_argv(argv, argc, 2); + argv[argc++] = xstrdup("-l"); + argv[argc++] = xasprintf("0"); + } + + if (params) + p = strtok(params, " "); + else + p = NULL; + while (p != NULL) { + argv = grow_argv(argv, argc, 1); + argv[argc++] = xstrdup(p); + p = strtok(NULL, " "); + } + + if (no_huge) { + argv = grow_argv(argv, argc, 3); + argv[argc++] = xstrdup("-m"); + argv[argc++] = xstrdup("2048"); + argv[argc++] = xstrdup("--no-huge"); + } + + if (vnic) { + argv = grow_argv(argv, argc, 2); + argv[argc++] = xstrdup(vnic); + argv[argc++] = xstrdup("--no-pci"); + } + + argv = 
grow_argv(argv, argc, 1); + argv[argc++] = xstrdup("--"); + + argv_to_release = grow_argv(argv_to_release, 0, argc); + for (i = 0; i < argc; ++i) + argv_to_release[i] = argv[i]; + + glue_init1(argc, argv); + + /* Alloc and setup this default ctx for any sockets operations before + * thread/ctx binding which happens when epoll_wait. + */ + glue_ctx_alloc(); + + release_argv(argc, argv_to_release, argv); + + /* Set back the affinity */ + err = pthread_setaffinity_np(tid, sizeof(cpu_set_t), &cpuset); + if (err) + GLUE_LOG(ERR, "Failed to set back affinity"); +} + +static void __attribute__((destructor)) +glue_uninit(void) +{ + struct sock *so; + struct glue_ctx *ctx; + int i, max = fd_table.fd_base + fd_table.fd_num; + + /* TODO: lets optimize it */ + for (i = fd_table.fd_base; i < max; i++) { + so = fd2sock(i); + if (!so || !so->valid) + continue; + if (IS_TCP(so)) + tle_tcp_stream_kill(so->s); + } + + for (i = 0; i < nb_ctx; ++i) { + ctx = glue_ctx_lookup(0, i); + while (be_process(ctx)) { /* empty */ }; + } +} diff --git a/lib/libtle_glue/internal.h b/lib/libtle_glue/internal.h new file mode 100644 index 0000000..91fe784 --- /dev/null +++ b/lib/libtle_glue/internal.h @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef _TLE_GLUE_INTERNAL_H_
+#define _TLE_GLUE_INTERNAL_H_
+
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+
+#include "ctx.h"
+#include "sym.h"
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Declarations shared between the source files of the glue library.
+ * The definitions live in other files of lib/libtle_glue.
+ */
+
+extern int stopped;
+
+extern uint64_t rx_offload;
+extern uint64_t tx_offload;
+
+void port_reconfig(void);
+
+uint16_t create_loopback(uint32_t socket_id);
+
+struct rte_mempool * get_mempool_by_socket(int32_t socket_id);
+
+/* Backend processing for one context; callers loop while it returns non-zero. */
+int be_process(struct glue_ctx *ctx);
+
+/* Backend transmit for one context; callers loop while it returns > 0. */
+int be_tx(struct glue_ctx *ctx);
+
+/* L2/L3 control-plane receive handlers. */
+struct rte_mbuf * arp_recv(struct glue_ctx *ctx,
+		struct rte_mbuf *m, uint32_t l2len);
+
+struct rte_mbuf * ndp_recv(struct glue_ctx *ctx,
+		struct rte_mbuf *m, uint32_t l2len, uint32_t l3len);
+
+
+void mac_check(struct glue_ctx *ctx, const struct sockaddr* addr);
+
+int arp_ipv4_dst_lookup(void *data, const struct in_addr *addr,
+		struct tle_dest *res, int proto);
+
+int arp_ipv6_dst_lookup(void *data, const struct in6_addr *addr,
+		struct tle_dest *res, int proto);
+
+int mac_fill(struct glue_ctx *ctx, struct rte_mbuf *m);
+
+void mac_timeout(struct glue_ctx *ctx);
+
+int setup_rx_cb(uint16_t port_id, uint16_t qid);
+
+int epoll_kernel_wait(struct glue_ctx *ctx, int efd,
+		struct epoll_event *events,
+		int maxevents, int timeout, int *rx);
+
+int poll_common(struct glue_ctx *ctx, struct epoll_event *events,
+		int maxevents, int timeout, int shadow_efd);
+
+int dev_rxq_wakeup(uint16_t port_id);
+
+/* ICMP/ICMPv6 receive handlers (see icmp.c). */
+struct rte_mbuf * icmp_recv(struct glue_ctx *ctx, struct rte_mbuf *pkt,
+		uint32_t l2len, uint32_t l3len);
+
+struct rte_mbuf * icmp6_recv(struct glue_ctx *ctx, struct rte_mbuf *pkt,
+		uint32_t l2len, uint32_t l3len);
+
+uint16_t typen_rx_callback(uint16_t port, uint16_t queue,
+		struct rte_mbuf *pkt[], uint16_t nb_pkts,
+		uint16_t max_pkts, void *user_param);
+
+/* Record the link-layer address to use for an IPv4/IPv6 destination. */
+void ipv4_dst_add(struct glue_ctx *ctx, const struct in_addr *addr,
+		struct ether_addr *e_addr);
+
+void ipv6_dst_add(struct glue_ctx *ctx, const struct in6_addr *addr,
+		struct ether_addr *e_addr);
+
+#ifdef LOOK_ASIDE_BACKEND
+/*
+ * State of the io thread, shared with worker threads through flag_sleep.
+ * A worker may move the flag from SLEEP to PREEMPT to transmit on behalf
+ * of the sleeping io thread (see be_tx_with_lock()).
+ */
+extern rte_atomic32_t flag_sleep;
+
+enum {
+	IOTHREAD_BUSY = 0, /* io thread is busy */
+	IOTHREAD_SLEEP, /* io thread is sleeping */
+	IOTHREAD_PREEMPT, /* io thread is preempted by another worker thread */
+};
+
+/*
+ * Block in k_epoll_pwait() with the flag set to IOTHREAD_SLEEP.
+ * On wake-up, spin until the flag can be moved from SLEEP back to BUSY,
+ * i.e. until no worker holds it in the PREEMPT state.
+ * Returns the k_epoll_pwait() result.
+ */
+static inline int
+sleep_with_lock(int efd, struct epoll_event *events, int max, int to)
+{
+	int rc;
+
+	rte_atomic32_set(&flag_sleep, IOTHREAD_SLEEP);
+	rc = k_epoll_pwait(efd, events, max, to, NULL);
+	while (rte_atomic32_cmpset((volatile uint32_t *)&flag_sleep,
+				IOTHREAD_SLEEP, IOTHREAD_BUSY) == 0);
+
+	return rc;
+}
+
+/*
+ * If the io thread is sleeping, take over (SLEEP -> PREEMPT), run
+ * be_tx() until it returns a value <= 0, then hand the flag back by
+ * setting it to SLEEP. Does nothing when the flag is not SLEEP.
+ */
+static inline void
+be_tx_with_lock(struct glue_ctx *ctx)
+{
+	if (rte_atomic32_cmpset((volatile uint32_t *)&flag_sleep,
+				IOTHREAD_SLEEP, IOTHREAD_PREEMPT)) {
+		while (be_tx(ctx) > 0) {};
+		rte_atomic32_set(&flag_sleep, IOTHREAD_SLEEP);
+	}
+}
+
+/* Call dev_rxq_wakeup() on the context's port while the flag is PREEMPT. */
+static inline void
+wake_lookaside_backend(struct glue_ctx *ctx)
+{
+	if (rte_atomic32_read(&flag_sleep) == IOTHREAD_PREEMPT)
+		dev_rxq_wakeup(ctx->port_id);
+}
+
+/* True while the flag reads IOTHREAD_SLEEP. */
+static inline bool
+io_thread_in_sleep(void)
+{
+	return rte_atomic32_read(&flag_sleep) == IOTHREAD_SLEEP;
+}
+#else
+/*
+ * Without the look-aside backend there is no flag to maintain: sleeping
+ * maps straight to k_epoll_wait and the other helpers are no-ops.
+ * NOTE(review): io_thread_in_sleep() has no fallback in this branch -
+ * confirm it is only used under LOOK_ASIDE_BACKEND.
+ */
+#define sleep_with_lock k_epoll_wait
+#define be_tx_with_lock(ctx) do {} while(0)
+#define wake_lookaside_backend(ctx) do {} while(0)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _TLE_GLUE_INTERNAL_H_ */
diff --git a/lib/libtle_glue/log.h b/lib/libtle_glue/log.h
new file mode 100644
index 0000000..da31ea3
--- /dev/null
+++ b/lib/libtle_glue/log.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2019 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _GLUE_LOG_H_ +#define _GLUE_LOG_H_ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * logging related macros. + */ + +#define GLUE_LOG(lvl, fmt, args...) RTE_LOG(lvl, USER1, fmt "\n", ##args) + +#define DUMMY_MACRO do {} while (0) + +#ifdef ENABLE_DEBUG +#define GLUE_DEBUG(fmt, arg...) fprintf(stderr, fmt "\n", ##arg) +#else +#define GLUE_DEBUG(fmt, arg...) DUMMY_MACRO +#endif + +#ifdef ENABLE_TRACE +#define TRACE(fmt, arg...) fprintf(stderr, fmt "\n", ##arg) +#define PKT_DUMP(p) rte_pktmbuf_dump(stderr, (p), 64) +#else +#define TRACE(fmt, arg...) DUMMY_MACRO +#define PKT_DUMP(p) DUMMY_MACRO +#endif + +#ifdef DEBUG_ARP +static inline void +print_arp(int af, const void *src, const struct ether_addr *mac, + const char *action) +{ + char str_ip[64]; + char str_mac[32]; + socklen_t sz; + + ether_format_addr(str_mac, sizeof(str_mac), mac); + sz = (af == AF_INET) ? sizeof(struct in_addr) : sizeof(struct in6_addr); + inet_ntop(af, src, str_ip, sz); + RTE_LOG(INFO, "%s ARP entry: %s\tmac=%s", action, str_ip, str_mac); +} +#else +#define print_arp(arg...) DUMMY_MACRO +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _GLUE_LOG_H_ */ diff --git a/lib/libtle_glue/ndp.h b/lib/libtle_glue/ndp.h new file mode 100644 index 0000000..a61ff5b --- /dev/null +++ b/lib/libtle_glue/ndp.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2019 Ant Financial Services Group. 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _TLE_NDP_H_
+#define _TLE_NDP_H_
+
+/*
+ * Option type codes carried in neighbour discovery messages.
+ * NOTE(review): the values look like the RFC 4861 option types - confirm.
+ */
+#define ND_OPT_SOURCE_LINKLAYER_ADDR 1
+#define ND_OPT_TARGET_LINKLAYER_ADDR 2
+#define ND_OPT_PREFIX_INFORMATION 3
+#define ND_OPT_REDIRECTED_HEADER 4
+#define ND_OPT_MTU 5
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* no function declarations yet */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _TLE_NDP_H_ */
diff --git a/lib/libtle_glue/packetdrill.c b/lib/libtle_glue/packetdrill.c
new file mode 100644
index 0000000..79d1d52
--- /dev/null
+++ b/lib/libtle_glue/packetdrill.c
@@ -0,0 +1,544 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#include +#include +#include +#include + +#include "packetdrill.h" +#include "tle_glue.h" +#include "internal.h" +#include "fd.h" + +#include +#include +#include +#include +#include + +static int vhost_vid; +enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM}; +static const char *sockname = "/tmp/sock0"; + +static int +new_device(int vid) +{ + vhost_vid = vid; + + /* Disable notifications. */ + rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0); + rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0); + + return 0; +} + +static void +destroy_device(int vid) +{ + RTE_SET_USED(vid); +} + +static const struct vhost_device_ops device_ops = +{ + .new_device = new_device, + .destroy_device = destroy_device, +}; + +static void +vhost_init(void) +{ + unlink(sockname); + + if (rte_vhost_driver_register(sockname, 0) != 0) + rte_exit(EXIT_FAILURE, "failed to register vhost driver \n"); + + if (rte_vhost_driver_callback_register(sockname, &device_ops) != 0) + rte_exit(EXIT_FAILURE, "failed to register vhost driver callbacks.\n"); + + if (rte_vhost_driver_start(sockname) < 0) + rte_exit(EXIT_FAILURE, "failed to start vhost driver.\n"); + + rte_log_set_level(RTE_LOGTYPE_USER1, RTE_LOG_NOTICE); +} + +static uint64_t +now_usecs(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + return ((uint64_t) tv.tv_sec * 1000000) + tv.tv_usec; +} + +static void +pd_free(void *userdata) +{ + RTE_SET_USED(userdata); +} + +static int +pd_socket(void *userdata, int domain, int type, int protocol) +{ + RTE_SET_USED(userdata); + return PRE(socket)(domain, type, protocol); +} + +static int +pd_bind(void *userdata, int sockfd, const struct sockaddr *addr, + socklen_t addrlen) +{ + RTE_SET_USED(userdata); + return PRE(bind)(sockfd, addr, addrlen); +} + +static int +pd_listen(void *userdata, int sockfd, int backlog) +{ + RTE_SET_USED(userdata); + return PRE(listen)(sockfd, backlog); +} + +static int +pd_accept(void *userdata, int sockfd, struct sockaddr *addr, + socklen_t *addrlen) +{ + 
RTE_SET_USED(userdata); + return PRE(accept)(sockfd, addr, addrlen); +} + +static int +pd_connect(void *userdata, int sockfd, const struct sockaddr *addr, + socklen_t addrlen) +{ + RTE_SET_USED(userdata); + return PRE(connect)(sockfd, addr, addrlen); +} + +static ssize_t +pd_read(void *userdata, int fd, void *buf, size_t count) +{ + RTE_SET_USED(userdata); + return PRE(read)(fd, buf, count); +} + +static ssize_t +pd_readv(void *userdata, int fd, const struct iovec *iov, int iovcnt) +{ + RTE_SET_USED(userdata); + return PRE(readv)(fd, iov, iovcnt); +} + +static ssize_t +pd_recv(void *userdata, int sockfd, void *buf, size_t len, int flags) +{ + RTE_SET_USED(userdata); + return PRE(recv)(sockfd, buf, len, flags); +} + +static ssize_t +pd_recvfrom(void *userdata, int sockfd, void *buf, size_t len, + int flags, struct sockaddr *src_addr, socklen_t *addrlen) +{ + RTE_SET_USED(userdata); + return PRE(recvfrom)(sockfd, buf, len, flags, src_addr, addrlen); +} + +static ssize_t +pd_recvmsg(void *userdata, int sockfd, struct msghdr *msg, int flags) +{ + RTE_SET_USED(userdata); + return PRE(recvmsg)(sockfd, msg, flags); +} + +static ssize_t +pd_write(void *userdata, int fd, const void *buf, size_t count) +{ + RTE_SET_USED(userdata); + return PRE(write)(fd, buf, count); +} + +static ssize_t +pd_writev(void *userdata, int fd, const struct iovec *iov, int iovcnt) +{ + RTE_SET_USED(userdata); + return PRE(writev)(fd, iov, iovcnt); +} + +static ssize_t +pd_send(void *userdata, int sockfd, const void *buf, size_t len, int flags) +{ + RTE_SET_USED(userdata); + return PRE(send)(sockfd, buf, len, flags); +} + +static ssize_t +pd_sendto(void *userdata, int sockfd, const void *buf, size_t len, int flags, + const struct sockaddr *dest_addr, socklen_t addrlen) +{ + RTE_SET_USED(userdata); + return PRE(sendto)(sockfd, buf, len, flags, dest_addr, addrlen); +} + +static ssize_t +pd_sendmsg(void *userdata, int sockfd, const struct msghdr *msg, int flags) +{ + RTE_SET_USED(userdata); + return 
PRE(sendmsg)(sockfd, msg, flags); +} + +static int +pd_fcntl(void *userdata, int fd, int cmd, ...) +{ + void *arg; + va_list ap; + + va_start(ap, cmd); + arg = va_arg(ap, void *); + va_end(ap); + + RTE_SET_USED(userdata); + return PRE(fcntl)(fd, cmd, arg); +} + +static int +pd_ioctl(void *userdata, int fd, unsigned long request, ...) +{ + void *arg; + va_list ap; + + va_start(ap, request); + arg = va_arg(ap, void *); + va_end(ap); + + RTE_SET_USED(userdata); + return PRE(ioctl)(fd, request, arg); +} + +static int +pd_close(void *userdata, int fd) +{ + RTE_SET_USED(userdata); + return PRE(close)(fd); +} + +static int +pd_shutdown(void *userdata, int sockfd, int how) +{ + RTE_SET_USED(userdata); + return PRE(shutdown)(sockfd, how); +} + +static int +pd_getsockopt(void *userdata, int sockfd, int level, int optname, + void *optval, socklen_t *optlen) +{ + RTE_SET_USED(userdata); + return PRE(getsockopt)(sockfd, level, optname, optval, optlen); +} + +static int +pd_setsockopt(void *userdata, int sockfd, int level, int optname, + const void *optval, socklen_t optlen) +{ + RTE_SET_USED(userdata); + return PRE(setsockopt)(sockfd, level, optname, optval, optlen); +} + +static int +pd_poll(void *userdata, struct pollfd *fds, nfds_t nfds, int timeout) +{ + RTE_SET_USED(userdata); + return PRE(poll)(fds, nfds, timeout); +} + +static struct rte_mbuf * +from_buf_to_mbuf(const void *buf, size_t count) +{ + struct rte_mempool *mp = get_mempool_by_socket(0); + uint16_t nb_mbufs = (count + RTE_MBUF_DEFAULT_DATAROOM - 1) / + RTE_MBUF_DEFAULT_DATAROOM; + struct rte_mbuf *mbufs[nb_mbufs + 1]; + uint16_t i, copy_len; + size_t done = 0; + char *dst; + + if (unlikely(rte_pktmbuf_alloc_bulk(mp, mbufs, nb_mbufs) < 0)) + rte_exit(EXIT_FAILURE, "allocate mbuf fails\n"); + + for (i = 0; i < nb_mbufs; ++i) { + copy_len = RTE_MIN((size_t)RTE_MBUF_DEFAULT_DATAROOM, + count - done); + dst = rte_pktmbuf_mtod(mbufs[i], char *); + rte_memcpy(dst, (const char *)buf + done, copy_len); + done += 
copy_len; + mbufs[i]->data_len = copy_len; + if (i > 0) + mbufs[i-1]->next = mbufs[i]; + } + + mbufs[0]->pkt_len = count; + mbufs[0]->nb_segs = nb_mbufs; + + return mbufs[0]; +} + +/* Send @count bytes of data starting from @buf to the TCP stack. + * Return 0 on success or -1 on error. + */ +static int +pd_netdev_send(void *userdata, const void *buf, size_t count) +{ + struct ether_hdr *hdr; + struct rte_mbuf *m; + + RTE_SET_USED(userdata); + + m = from_buf_to_mbuf(buf, count); + + // add l2 header + hdr = (struct ether_hdr *)rte_pktmbuf_prepend(m, sizeof(struct ether_hdr)); + hdr->ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4); + + if (rte_vhost_enqueue_burst(vhost_vid, VIRTIO_RXQ, &m, 1) == 1) + return 0; + + return -1; +} + +static inline struct rte_mbuf * +from_mbuf_to_buf(struct rte_mbuf *m, char *buf, size_t len, int ispeek, int needcpy) +{ + void *src; + uint32_t done = 0; + uint32_t left = len, orig_pkt_len; + uint16_t copy_len, seg_len; + struct rte_mbuf *m_next, *orig_pkt; + + if (len == 0) + return m; + + orig_pkt = m; + orig_pkt_len = m->pkt_len; + + do { + seg_len = rte_pktmbuf_data_len(m); + copy_len = RTE_MIN(seg_len, left); + src = rte_pktmbuf_mtod(m, void *); + if (needcpy) + rte_memcpy(buf + done, src, copy_len); + done += copy_len; + left -= copy_len; + if (copy_len < seg_len) { + if (!ispeek) { + rte_pktmbuf_adj(m, copy_len); + } + break; + } + m_next = m->next; + if (!ispeek) { + rte_pktmbuf_free_seg(m); + } + m = m_next; + } while (left && m); + + if (m && !ispeek) + m->pkt_len = orig_pkt_len - done; + + if(ispeek) + return orig_pkt; + else + return m; +} + +/* Sniff the next packet leaving the TCP stack. + * Put packet data in @buf. @count is passed in as the buffer size. + * The actual number of bytes received should be put in @count. + * Set @count to 0 if received nothing. + * Set @time_usecs to the receive timestamp. + * Return 0 on success or -1 on error. 
*/ +static int +pd_netdev_recv(void *userdata, void *buf, size_t *count, long long *time_usecs) +{ + struct rte_mbuf *m; + struct rte_mempool *mp = get_mempool_by_socket(0); + + RTE_SET_USED(userdata); + + while (rte_vhost_dequeue_burst(vhost_vid, VIRTIO_TXQ, mp, &m, 1) == 0); + + // remove l2 header + rte_pktmbuf_adj(m, sizeof(struct ether_hdr)); + + *count = m->pkt_len; + from_mbuf_to_buf(m, buf, *count, 0, 1); + + *time_usecs = now_usecs(); + return 0; +} + +static int +pd_usleep(void *userdata, useconds_t usec) +{ + RTE_SET_USED(userdata); + return usleep(usec); +} + +static int +pd_gettimeofday(void *userdata, struct timeval *tv, struct timezone *tz) +{ + RTE_SET_USED(userdata); + return gettimeofday(tv, tz); +} + +static int +pd_epoll_create(void *userdata, int size) +{ + RTE_SET_USED(userdata); + return PRE(epoll_create)(size); +} + +static int +pd_epoll_ctl(void *userdata, int epfd, int op, int fd, + struct epoll_event *event) +{ + RTE_SET_USED(userdata); + return PRE(epoll_ctl)(epfd, op, fd, event); +} + +static int +pd_epoll_wait(void *userdata, int epfd, struct epoll_event *events, + int maxevents, int timeout) +{ + RTE_SET_USED(userdata); + return PRE(epoll_wait)(epfd, events, maxevents, timeout); +} + +static int +pd_pipe(void *userdata, int pipefd[2]) +{ + RTE_SET_USED(userdata); + return pipe(pipefd); +} + +static int +pd_splice(void *userdata, int fd_in, loff_t *off_in, int fd_out, + loff_t *off_out, size_t len, unsigned int flags) +{ + RTE_SET_USED(userdata); + return PRE(splice)(fd_in, off_in, fd_out, off_out, len, flags); +} + +static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; + +static void * +io(void *arg) +{ + int epfd; + struct in_addr ipv4; + struct ether_addr mac = { .addr_bytes = { 0xee, 0xff, 0xff, 0xff, 0xff, 0xff}, }; + struct epoll_event events[128]; + + RTE_SET_USED(arg); + + setenv(DPDK_IP, "192.168.0.2", 1); + setenv(DPDK_IP_MASK, "16", 1); + setenv(DPDK_IP_GATEWAY, "192.168.0.1", 1); + + setenv(DPDK_IPV6, 
"fd3d:fa7b:d17d::0", 1); + setenv(DPDK_IPV6_MASK, "48", 1); + setenv(DPDK_IPV6_GATEWAY, "fd3d:fa7b:d17d:8888::0", 1); + + epfd = PRE(epoll_create)(0); + + inet_pton(AF_INET, "192.168.0.1", &ipv4); + + ipv4_dst_add(default_ctx, &ipv4, &mac); + + pthread_mutex_unlock(&lock); + + while (1) { + PRE(epoll_wait)(epfd, events, 128, 0); + } + + return NULL; +} + +void +packetdrill_interface_init(const char *flags, + struct packetdrill_interface *ifc) +{ + int argc = 0; + char *argv[16]; + pthread_t tid; + + RTE_SET_USED(flags); + + argv[argc++] = strdup("test"); + argv[argc++] = strdup("-l"); + argv[argc++] = strdup("0"); + argv[argc++] = strdup("--no-pci"); + argv[argc++] = strdup("--in-memory"); + argv[argc++] = strdup("--single-file-segments"); + argv[argc++] = strdup("--"); + + if (rte_eal_init(argc, argv) < 0) + rte_exit(EXIT_FAILURE, "Failed to init DPDK\n"); + + fd_init(); + + vhost_init(); + + if (rte_eal_hotplug_add("vdev", "virtio_user0", "path=/tmp/sock0") < 0) + rte_exit(EXIT_FAILURE, "hot plug virtio-user failed\n"); + + pthread_mutex_lock(&lock); + + pthread_create(&tid, NULL, io, NULL); + + pthread_mutex_lock(&lock); + + ifc->free = pd_free; + ifc->socket = pd_socket; + ifc->bind = pd_bind; + ifc->listen = pd_listen; + ifc->accept = pd_accept; + ifc->connect = pd_connect; + ifc->read = pd_read; + ifc->readv = pd_readv; + ifc->recv = pd_recv; + ifc->recvfrom = pd_recvfrom; + ifc->recvmsg = pd_recvmsg; + ifc->write = pd_write; + ifc->writev = pd_writev; + ifc->send = pd_send; + ifc->sendto = pd_sendto; + ifc->sendmsg = pd_sendmsg; + ifc->fcntl = pd_fcntl; + ifc->ioctl = pd_ioctl; + ifc->close = pd_close; + ifc->shutdown = pd_shutdown; + ifc->getsockopt = pd_getsockopt; + ifc->setsockopt = pd_setsockopt; + ifc->poll = pd_poll; + ifc->netdev_send = pd_netdev_send; + ifc->netdev_receive = pd_netdev_recv; + ifc->usleep = pd_usleep; + ifc->gettimeofday = pd_gettimeofday; + ifc->epoll_create = pd_epoll_create; + ifc->epoll_ctl = pd_epoll_ctl; + ifc->epoll_wait = 
pd_epoll_wait; + ifc->pipe = pd_pipe; + ifc->splice = pd_splice; +} diff --git a/lib/libtle_glue/packetdrill.h b/lib/libtle_glue/packetdrill.h new file mode 100644 index 0000000..6f84a87 --- /dev/null +++ b/lib/libtle_glue/packetdrill.h @@ -0,0 +1,111 @@ +/* + * Copyright 2015 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: xiaoj@google.com (Xiao Jia) + * + * Interface for packetdrill. + * + * To be tested against as a shared object (*.so) file, implement this + * interface, export a function "packetdrill_interface_init", and + * initialize the interface struct passed in with your own functions. 
+ */ + +#ifndef __PACKETDRILL_H__ +#define __PACKETDRILL_H__ + +#include +#include +#include +#include +#include +#include +#include + +struct packetdrill_interface { + void *userdata; + void (*free)(void *userdata); + int (*socket)(void *userdata, int domain, int type, int protocol); + int (*bind)(void *userdata, int sockfd, const struct sockaddr *addr, + socklen_t addrlen); + int (*listen)(void *userdata, int sockfd, int backlog); + int (*accept)(void *userdata, int sockfd, struct sockaddr *addr, + socklen_t *addrlen); + int (*connect)(void *userdata, int sockfd, const struct sockaddr *addr, + socklen_t addrlen); + ssize_t (*read)(void *userdata, int fd, void *buf, size_t count); + ssize_t (*readv)(void *userdata, int fd, const struct iovec *iov, + int iovcnt); + ssize_t (*recv)(void *userdata, int sockfd, void *buf, size_t len, + int flags); + ssize_t (*recvfrom)(void *userdata, int sockfd, void *buf, size_t len, + int flags, struct sockaddr *src_addr, + socklen_t *addrlen); + ssize_t (*recvmsg)(void *userdata, int sockfd, struct msghdr *msg, + int flags); + ssize_t (*write)(void *userdata, int fd, const void *buf, size_t count); + ssize_t (*writev)(void *userdata, int fd, const struct iovec *iov, + int iovcnt); + ssize_t (*send)(void *userdata, int sockfd, const void *buf, size_t len, + int flags); + ssize_t (*sendto)(void *userdata, int sockfd, const void *buf, + size_t len, int flags, + const struct sockaddr *dest_addr, socklen_t addrlen); + ssize_t (*sendmsg)(void *userdata, int sockfd, const struct msghdr *msg, + int flags); + int (*fcntl)(void *userdata, int fd, int cmd, ...); + int (*ioctl)(void *userdata, int fd, unsigned long request, ...); + int (*close)(void *userdata, int fd); + int (*shutdown)(void *userdata, int sockfd, int how); + int (*getsockopt)(void *userdata, int sockfd, int level, int optname, + void *optval, socklen_t *optlen); + int (*setsockopt)(void *userdata, int sockfd, int level, int optname, + const void *optval, socklen_t optlen); + 
int (*poll)(void *userdata, struct pollfd *fds, nfds_t nfds, + int timeout); + /* Send @count bytes of data starting from @buf to the TCP stack. + * Return 0 on success or -1 on error. */ + int (*netdev_send)(void *userdata, const void *buf, size_t count); + /* Sniff the next packet leaving the TCP stack. + * Put packet data in @buf. @count is passed in as the buffer size. + * The actual number of bytes received should be put in @count. + * Set @count to 0 if received nothing. + * Set @time_usecs to the receive timestamp. + * Return 0 on success or -1 on error. */ + int (*netdev_receive)(void *userdata, void *buf, size_t *count, + long long *time_usecs); + int (*usleep)(void *userdata, useconds_t usec); + int (*gettimeofday)(void *userdata, struct timeval *tv, + struct timezone *tz); + int (*epoll_create)(void *userdata, int size); + int (*epoll_ctl)(void *userdata, int epfd, int op, int fd, + struct epoll_event *event); + int (*epoll_wait)(void *userdata, int epfd, struct epoll_event *events, + int maxevents, int timeout); + int (*pipe)(void *userdata, int pipefd[2]); + int (*splice)(void *userdata, int fd_in, loff_t *off_in, int fd_out, + loff_t *off_out, size_t len, unsigned int flags); +}; + +typedef void (*packetdrill_interface_init_t)(const char *flags, + struct packetdrill_interface *); + +void +packetdrill_interface_init(const char *flags, struct packetdrill_interface *ifc); + +#endif /* __PACKETDRILL_H__ */ diff --git a/lib/libtle_glue/poll.c b/lib/libtle_glue/poll.c new file mode 100644 index 0000000..ebc0110 --- /dev/null +++ b/lib/libtle_glue/poll.c @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include
+#include
+
+#include "fd.h"
+#include "ctx.h"
+#include "sym.h"
+#include "log.h"
+#include "util.h"
+#include "internal.h"
+#include "tle_glue.h"
+
+/* poll() over a mix of kernel fds and glue sockets.
+ *
+ * Kernel fds are collected by index; when every entry is a kernel fd the
+ * call is forwarded to k_poll(). Otherwise glue sockets that are already
+ * ready are reported immediately, and failing that poll_common() is used,
+ * first without and then with a temporary kernel epoll fd covering the
+ * kernel entries.
+ * Returns the number of entries with non-zero revents.
+ */
+int
+PRE(poll)(struct pollfd *fds, nfds_t nfds, int timeout)
+{
+	int efd;
+	int total = 0, j;
+	int tmp_ev;
+	uint32_t i;
+	uint32_t k_n = 0;
+	/* a zero-length VLA is undefined, keep at least one slot */
+	int k_fds[nfds ? nfds : 1];
+	struct sock *so;
+	struct glue_ctx *ctx;
+	struct epoll_event k_ev;
+	struct epoll_event events[nfds ? nfds : 1];
+
+	for (i = 0; i < nfds; ++i) {
+		/* revents is output only: never hand back a stale value
+		 * for entries that are skipped or not reported below.
+		 */
+		fds[i].revents = 0;
+
+		if (is_kernel_fd(fds[i].fd)) {
+			k_fds[k_n++] = i;
+			continue;
+		}
+
+		so = fd2sock(fds[i].fd);
+		if (!so->valid)
+			continue;
+
+		fds[i].revents = fd_ready(fds[i].fd, fds[i].events);
+		if (fds[i].revents) {
+			total++;
+			continue;
+		}
+
+		/* We fill sock->event here as we need this when
+		 * we filter events in poll_common(). But it was
+		 * originally set by epoll_ctl(). Now we have to
+		 * assume that there are no application which
+		 * uses epoll and poll at the same time.
+		 */
+		so->event.events = fds[i].events;
+		so->event.data.u32 = i; /* store idx */
+	}
+
+	if (k_n == nfds)
+		return k_poll(fds, nfds, timeout);
+
+	if (total > 0)
+		return total;
+
+	/* thread <> context binding happens here */
+	if (RTE_PER_LCORE(glue_ctx) == NULL) {
+		ctx = &ctx_array[glue_ctx_alloc()];
+		RTE_PER_LCORE(glue_ctx) = ctx;
+	} else
+		ctx = RTE_PER_LCORE(glue_ctx);
+
+	total = poll_common(ctx, events, nfds, 0, -1);
+
+	/* We assume kernel I/O events are not as important as user ones */
+	if (total > 0)
+		goto format;
+
+	efd = k_epoll_create(1);
+	if (efd < 0)
+		rte_panic("k_epoll_create failed %d", errno);
+
+	for (i = 0; i < k_n; ++i) {
+		k_ev.events = fds[k_fds[i]].events;
+		k_ev.data.u32 = k_fds[i]; /* store idx */
+		k_epoll_ctl(efd, EPOLL_CTL_ADD, fds[k_fds[i]].fd, &k_ev);
+	}
+
+	total = poll_common(ctx, events, nfds, timeout, efd);
+	k_close(efd);
+format:
+	/* data.u32 carries the index into fds[] stored above */
+	for (j = 0; j < total; ++j) {
+		tmp_ev = events[j].events;
+		if (tmp_ev == POLLHUP) {
+			tmp_ev |= POLLERR | (fds[events[j].data.u32].events &
+				(POLLIN | POLLOUT));
+		}
+		fds[events[j].data.u32].revents = tmp_ev;
+	}
+
+	return total;
+}
+
+/* ppoll() in terms of the poll() above; a signal mask is not supported.
+ * The timespec is converted to whole milliseconds (NULL means wait forever).
+ */
+int
+PRE(ppoll)(struct pollfd *fds, nfds_t nfds,
+	   const struct timespec *tmo_p, const sigset_t *sigmask)
+{
+	int timeout;
+
+	if (sigmask != NULL)
+		rte_panic("ppoll with signal is not supported");
+
+	if (tmo_p == NULL)
+		timeout = -1;
+	else
+		timeout = tmo_p->tv_sec * 1000 + tmo_p->tv_nsec / 1000000;
+
+	/* Go through PRE() so that the glue implementation is used whatever
+	 * prefix the build selects. NOTE(review): identical to the former
+	 * bare poll() call if PRE(poll) expands to poll - confirm in sym.h.
+	 */
+	return PRE(poll)(fds, nfds, timeout);
+}
+
+extern int __poll_chk(struct pollfd *fds, nfds_t nfds, int timeout,
+		      __SIZE_TYPE__ fdslen);
+/* Fortified poll() entry point; the buffer length argument is ignored. */
+int
+__poll_chk(struct pollfd *fds, nfds_t nfds, int timeout,
+	   __SIZE_TYPE__ fdslen __rte_unused)
+{
+	return PRE(poll)(fds, nfds, timeout);
+}
diff --git a/lib/libtle_glue/port.c b/lib/libtle_glue/port.c
new file mode 100644
index 0000000..7a4cf2e
--- /dev/null
+++ b/lib/libtle_glue/port.c
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+
+#include
+#include
+
+#include "log.h"
+#include "ctx.h"
+#include "config.h"
+#include "internal.h"
+
+int stopped;
+
+static struct rte_mempool *mpool[RTE_MAX_NUMA_NODES];
+
+/* Return the mbuf pool of @socket_id, creating it on first use.
+ * SOCKET_ID_ANY is mapped to socket 0. Panics on an out-of-range socket id
+ * or when the pool cannot be created.
+ */
+struct rte_mempool *
+get_mempool_by_socket(int32_t socket_id)
+{
+	struct rte_mempool *mp;
+	char name[RTE_MEMPOOL_NAMESIZE];
+
+	if (socket_id == SOCKET_ID_ANY)
+		socket_id = 0;
+
+	/* socket_id indexes mpool[] below, reject anything outside of it */
+	if (socket_id < 0 || socket_id >= RTE_MAX_NUMA_NODES)
+		rte_panic("Invalid socket id %d", socket_id);
+
+	if (mpool[socket_id])
+		return mpool[socket_id];
+
+	snprintf(name, sizeof(name), "MP%u", (unsigned int)socket_id);
+	mp = rte_pktmbuf_dynamic_pool_create(name, MAX_MBUFS - 1,
+					     MBUF_PERCORE_CACHE, 0,
+					     RTE_MBUF_DEFAULT_BUF_SIZE,
+					     socket_id, MBUF_DYNAMIC_SIZE);
+
+	if (mp == NULL)
+		rte_panic("Failed to create mbuf mempool");
+
+	mpool[socket_id] = mp;
+	return mp;
+}
+
+/* Enable RSS hashing on IP/TCP/UDP for @port_id with the default key. */
+static void
+update_rss_conf(uint16_t port_id)
+{
+	struct rte_eth_rss_conf rss_conf = {
+		.rss_key = NULL,
+		.rss_key_len = 0,
+		.rss_hf = ETH_RSS_IP | ETH_RSS_TCP | ETH_RSS_UDP,
+	};
+
+	if (rte_eth_dev_rss_hash_update(port_id, &rss_conf) < 0)
+		rte_panic("Failed to update rss hash");
+}
+
+/* Set up @nb_queues rx and tx queues on @port_id and install the rx
+ * callback on every rx queue. Queue configs start from the device defaults
+ * in @dev_info and take their offloads from @port_conf.
+ * Panics on any failure.
+ */
+static void
+queue_init(uint16_t port_id, uint16_t nb_queues,
+	   struct rte_eth_dev_info *dev_info,
+	   struct rte_eth_conf *port_conf)
+{
+	uint16_t q;
+	int32_t socket_id, rc;
+	uint16_t nb_rxd = 1024, nb_txd = 1024;
+	struct rte_mempool *mp;
+	struct rte_eth_txconf txq_conf = dev_info->default_txconf;
+	struct rte_eth_rxconf rxq_conf = dev_info->default_rxconf;
+
+	socket_id = rte_eth_dev_socket_id(port_id);
+	mp = get_mempool_by_socket(socket_id);
+
+	/* rxq_conf was copied from dev_info above, so the flag has to be set
+	 * on the copy to reach rte_eth_rx_queue_setup(); dev_info is kept in
+	 * sync as before.
+	 */
+	dev_info->default_rxconf.rx_drop_en = 1;
+	rxq_conf.rx_drop_en = 1;
+
+	rc = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd);
+	if (rc < 0)
+		rte_panic("Cannot adjust number of desc");
+
+	rxq_conf.offloads = port_conf->rxmode.offloads;
+	txq_conf.offloads = port_conf->txmode.offloads;
+
+	/* faster free of tx entries */
+	txq_conf.tx_free_thresh = nb_txd - 64;
+
+	for (q = 0; q < nb_queues; q++) {
+		rc = rte_eth_rx_queue_setup(port_id, q, nb_rxd,
+					    socket_id, &rxq_conf, mp);
+		if (rc < 0)
+			rte_panic("rx queue=%u setup failed: %d", q, rc);
+
+		rc = setup_rx_cb(port_id, q);
+		if (rc < 0)
+			rte_panic("rx queue=%u rx setup failed: %d", q, rc);
+	}
+
+	for (q = 0; q < nb_queues; q++) {
+		rc = rte_eth_tx_queue_setup(port_id, q, nb_txd,
+					    socket_id, &txq_conf);
+		if (rc < 0)
+			rte_panic("tx queue=%u setup failed: %d", q, rc);
+	}
+}
+
+uint64_t rx_offload =
+	DEV_RX_OFFLOAD_IPV4_CKSUM |
+	DEV_RX_OFFLOAD_UDP_CKSUM |
+	DEV_RX_OFFLOAD_TCP_CKSUM;
+/* nice to have:
+	DEV_RX_OFFLOAD_CRC_STRIP |
+	DEV_RX_OFFLOAD_TCP_LRO |
+	DEV_RX_OFFLOAD_HEADER_SPLIT |
+	DEV_RX_OFFLOAD_SCATTER |
+	DEV_RX_OFFLOAD_TIMESTAMP
+*/
+
+uint64_t tx_offload =
+	DEV_TX_OFFLOAD_UDP_CKSUM |
+	DEV_TX_OFFLOAD_TCP_CKSUM |
+	DEV_TX_OFFLOAD_TCP_TSO |
+	DEV_TX_OFFLOAD_MULTI_SEGS;
+
+int
+dev_rxq_wakeup(uint16_t port_id)
+{
+	int fd;
+	uint16_t qid;
+	uint32_t vec, efd_idx;
+	struct rte_eth_dev *dev;
+	struct rte_intr_handle *intr_handle;
+
+	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV);
+
+	dev = &rte_eth_devices[port_id];
+	intr_handle = dev->intr_handle;
+	if (!intr_handle)
+		return -ENOTSUP;
+	if (!intr_handle->intr_vec)
+		return -EPERM;
+
+	for (qid = 0; qid < dev->data->nb_rx_queues; qid++) {
+		vec = intr_handle->intr_vec[qid];
+		efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ?
+ (vec - RTE_INTR_VEC_RXTX_OFFSET) : vec; + fd = intr_handle->efds[efd_idx]; + if (eventfd_write(fd, (eventfd_t) 1) < 0) + return -errno; + } + + return 0; +} + +void +port_reconfig(void) +{ + int32_t rc; + struct rte_eth_dev_info dev_info; + uint16_t port_id = 0; /* We use and only use port 0 */ + uint16_t nb_port; + uint16_t nb_queues = nb_ctx; + + struct rte_eth_conf port_conf = { + .intr_conf = { + .rxq = 1, + }, + }; + + /* 0. dev number check */ + nb_port = rte_eth_dev_count_avail(); + if (nb_port < 1 || nb_port >2) + rte_panic("One port is mandatory with an optional loopback device\n"); + + stopped = 1; + rte_wmb(); + /* wake up all rxqs */ + if (nb_ctx > 1) + dev_rxq_wakeup(port_id); + + usleep(1); /* fix me: this cannot gurantee correctness */ + + rte_eth_dev_stop(port_id); + + /* 1. offloading check and set*/ + rte_eth_dev_info_get(port_id, &dev_info); + rx_offload &= dev_info.rx_offload_capa; + port_conf.rxmode.offloads = rx_offload; + tx_offload &= dev_info.tx_offload_capa; + port_conf.txmode.offloads = tx_offload; + + GLUE_LOG(INFO, "configure queues = %d, offloads: rx = %"PRIx64", tx = %"PRIx64, + nb_queues, rx_offload, tx_offload); + + /* 2. dev configure */ + rc = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf); + if (rc != 0) + rte_panic("Failed to configure device, %d", rc); + + /* 3. queue setup */ + queue_init(port_id, nb_queues, &dev_info, &port_conf); + + /* 4. rss conf */ + if (nb_queues > 1) + update_rss_conf(port_id); + + /* 5. 
dev start */ + if (rte_eth_dev_start(port_id) < 0) + rte_panic("Failed to start device"); + + stopped = 0; +} + +uint16_t +create_loopback(uint32_t socket_id) +{ + int ret; + struct rte_ring* lb_queue; + static uint16_t lb_port_id = 0xFFFF; + const char *ring_name = "loopback-ring"; + + if (lb_port_id != 0xFFFF) + return lb_port_id; + + lb_queue = rte_ring_create(ring_name, MAX_PKTS_BURST * 8, socket_id, + RING_F_SP_ENQ | RING_F_SC_DEQ); + if (!lb_queue) + rte_panic("Failed to create ring for loopback\n"); + ret = rte_eth_from_ring(lb_queue); + if (ret < 0) + rte_panic("Failed to create ethdev from ring\n"); + lb_port_id = ret; + + if (setup_rx_cb(lb_port_id, 0) < 0) + rte_panic("Failed to set up rx cb for loopback\n"); + + return lb_port_id; +} diff --git a/lib/libtle_glue/rxcb.c b/lib/libtle_glue/rxcb.c new file mode 100644 index 0000000..51f31c9 --- /dev/null +++ b/lib/libtle_glue/rxcb.c @@ -0,0 +1,834 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include + +#include "log.h" +#include "ctx.h" +#include "internal.h" + +struct ptype2cb { + uint32_t mask; + const char *name; + rte_rx_callback_fn fn; +}; + +enum { + ETHER_ARP_PTYPE = 0x1, + IPV4_PTYPE = 0x2, + IPV4_EXT_PTYPE = 0x4, + IPV6_PTYPE = 0x8, + IPV6_EXT_PTYPE = 0x10, + TCP_PTYPE = 0x20, + UDP_PTYPE = 0x40, + ICMP_PTYPE = 0x80, +}; + +static inline uint64_t +_mbuf_tx_offload(uint64_t il2, uint64_t il3, uint64_t il4, uint64_t tso, + uint64_t ol3, uint64_t ol2) +{ + return il2 | il3 << 7 | il4 << 16 | tso << 24 | ol3 << 40 | ol2 << 49; +} + +static inline int32_t +fill_pkt_hdr_len(struct rte_mbuf *m, uint32_t l2, uint32_t l3, uint32_t l4) +{ + if (l2 + l3 + l4 > m->pkt_len) + return -1; + m->tx_offload = _mbuf_tx_offload(l2, l3, l4, 0, 0, 0); + return 0; +} + +static inline int +is_ipv4_frag(const struct ipv4_hdr *iph) +{ + const uint16_t mask = rte_cpu_to_be_16(~IPV4_HDR_DF_FLAG); + + return ((mask & iph->fragment_offset) != 0); +} + +static inline uint32_t +get_tcp_header_size(struct rte_mbuf *m, uint32_t l2_len, uint32_t l3_len) +{ + const struct tcp_hdr *tcp; + + tcp = rte_pktmbuf_mtod_offset(m, struct tcp_hdr *, l2_len + l3_len); + return (tcp->data_off >> 4) * 4; +} + +static inline int32_t +adjust_ipv4_pktlen(struct rte_mbuf *m, uint32_t l2_len) +{ + uint32_t plen, trim; + const struct ipv4_hdr *iph; + + iph = rte_pktmbuf_mtod_offset(m, const struct ipv4_hdr *, l2_len); + plen = rte_be_to_cpu_16(iph->total_length) + l2_len; + if (plen < m->pkt_len) { + trim = m->pkt_len - plen; + rte_pktmbuf_trim(m, trim); + } else if (plen > m->pkt_len) + return -1; + + return 0; +} + +static inline int32_t +adjust_ipv6_pktlen(struct rte_mbuf *m, uint32_t l2_len) +{ + uint32_t plen, trim; + const struct ipv6_hdr *iph; + + iph = rte_pktmbuf_mtod_offset(m, const struct ipv6_hdr *, l2_len); + plen = rte_be_to_cpu_16(iph->payload_len) + sizeof(*iph) + l2_len; + if (plen < m->pkt_len) { + trim = 
m->pkt_len - plen; + rte_pktmbuf_trim(m, trim); + } else if (plen > m->pkt_len) + return -1; + + return 0; +} + +static inline uint32_t +get_ipv4_hdr_len(struct rte_mbuf *m, uint32_t l2, uint32_t proto, uint32_t frag) +{ + const struct ipv4_hdr *iph; + int32_t dlen, len; + + dlen = rte_pktmbuf_data_len(m); + dlen -= l2; + + iph = rte_pktmbuf_mtod_offset(m, const struct ipv4_hdr *, l2); + len = (iph->version_ihl & IPV4_HDR_IHL_MASK) * IPV4_IHL_MULTIPLIER; + + if (frag != 0 && is_ipv4_frag(iph)) { + m->packet_type &= ~RTE_PTYPE_L4_MASK; + m->packet_type |= RTE_PTYPE_L4_FRAG; + } + + if (len > dlen || (proto <= IPPROTO_MAX && iph->next_proto_id != proto)) + m->packet_type = RTE_PTYPE_UNKNOWN; + + return len; +} + +static inline uint32_t +get_ipv6x_hdr_len(struct rte_mbuf *m, uint32_t l2, uint32_t *fproto) +{ + const struct ipv6_hdr *ip6h; + const struct ip6_ext *ipx; + uint32_t nproto; + int32_t dlen, len, ofs; + + ip6h = rte_pktmbuf_mtod_offset(m, struct ipv6_hdr*, l2); + nproto = ip6h->proto; + len = sizeof(struct ipv6_hdr); + + dlen = rte_pktmbuf_data_len(m); + dlen -= l2; + + ofs = l2 + len; + ipx = rte_pktmbuf_mtod_offset(m, const struct ip6_ext *, ofs); + + while (ofs > 0 && len < dlen) { + switch (nproto) { + case IPPROTO_HOPOPTS: + case IPPROTO_ROUTING: + case IPPROTO_DSTOPTS: + ofs = (ipx->ip6e_len + 1) << 3; + break; + case IPPROTO_AH: + ofs = (ipx->ip6e_len + 2) << 2; + break; + case IPPROTO_FRAGMENT: + /* + * tso_segsz is not used by RX, so use it as temporary + * buffer to store the fragment offset. + */ + m->tso_segsz = l2 + len; + ofs = sizeof(struct ip6_frag); + m->packet_type &= ~RTE_PTYPE_L4_MASK; + m->packet_type |= RTE_PTYPE_L4_FRAG; + break; + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_ICMPV6: + ofs = 0; + if (*fproto == 0) + *fproto = nproto; + break; + default: + ofs = 0; + } + + if (ofs > 0) { + nproto = ipx->ip6e_nxt; + len += ofs; + ipx += ofs / sizeof(*ipx); + } + } + + /* unrecognized or invalid packet. 
*/ + if (*fproto == 0 || len > dlen) + m->packet_type = RTE_PTYPE_UNKNOWN; + + return len; +} + +static inline uint32_t +get_ipv6_hdr_len(struct rte_mbuf *m, uint32_t l2, uint32_t fproto) +{ + const struct ipv6_hdr *iph; + + iph = rte_pktmbuf_mtod_offset(m, const struct ipv6_hdr *, + sizeof(struct ether_hdr)); + + if (iph->proto == fproto) + return sizeof(struct ipv6_hdr); + else + return get_ipv6x_hdr_len(m, l2, &fproto); +} + +static inline struct rte_mbuf* +process_ipv4_frag(struct rte_mbuf *m, struct glue_ctx *ctx, + uint32_t l2_len, uint32_t l3_len) +{ + struct ipv4_hdr* iph; + + m->l2_len = l2_len; + m->l3_len = l3_len; + /* fixme: ip checksum should be checked here. + * After reassemble, the ip checksum would be invalid. + */ + m = rte_ipv4_frag_reassemble_packet(ctx->frag_tbl, + &ctx->frag_dr, m, rte_rdtsc(), + rte_pktmbuf_mtod_offset(m, struct ipv4_hdr*, m->l2_len)); + rte_ip_frag_free_death_row(&ctx->frag_dr, 3); + if (m == NULL) + return NULL; + iph = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr*, m->l2_len); + switch (iph->next_proto_id) { + case IPPROTO_TCP: + m->packet_type &= ~RTE_PTYPE_L4_MASK; + m->packet_type |= RTE_PTYPE_L4_TCP; + break; + case IPPROTO_UDP: + m->packet_type &= ~RTE_PTYPE_L4_MASK; + m->packet_type |= RTE_PTYPE_L4_UDP; + break; + } + return m; +} + +static inline struct rte_mbuf* +process_ipv6_frag(struct rte_mbuf *m, struct glue_ctx *ctx, + uint32_t l2_len, uint32_t l3_len) +{ + struct ipv6_hdr* ip6h; + + m->l2_len = l2_len; + m->l3_len = l3_len; + m = rte_ipv6_frag_reassemble_packet(ctx->frag_tbl, + &ctx->frag_dr, m, rte_rdtsc(), + rte_pktmbuf_mtod_offset(m, struct ipv6_hdr*, l2_len), + rte_pktmbuf_mtod_offset(m, struct ipv6_extension_fragment*, + m->tso_segsz)); + rte_ip_frag_free_death_row(&ctx->frag_dr, 3); + if (m == NULL) + return NULL; + ip6h = rte_pktmbuf_mtod_offset(m, struct ipv6_hdr*, m->l2_len); + switch (ip6h->proto) { + case IPPROTO_TCP: + m->packet_type &= ~RTE_PTYPE_L4_MASK; + m->packet_type |= RTE_PTYPE_L4_TCP; + 
break; + case IPPROTO_UDP: + m->packet_type &= ~RTE_PTYPE_L4_MASK; + m->packet_type |= RTE_PTYPE_L4_UDP; + break; + } + return m; +} + +static inline struct rte_mbuf * +fill_ptypes_and_hdr_len(struct glue_ctx *ctx, struct rte_mbuf *m) +{ + uint32_t dlen, l2_len, l3_len, l4_len, proto; + const struct ether_hdr *eth; + uint32_t ptypes; + uint16_t etp; + int32_t error = 0; + + dlen = rte_pktmbuf_data_len(m); + + /* L2 */ + l2_len = sizeof(*eth); + + eth = rte_pktmbuf_mtod(m, const struct ether_hdr *); + etp = eth->ether_type; + while (etp == rte_be_to_cpu_16(ETHER_TYPE_VLAN)) { + etp = rte_pktmbuf_mtod_offset(m, struct vlan_hdr*, l2_len)->eth_proto; + l2_len += sizeof(struct vlan_hdr); + } + + if (etp == rte_be_to_cpu_16(ETHER_TYPE_ARP)) + return arp_recv(ctx, m, l2_len); + + if (etp == rte_be_to_cpu_16(ETHER_TYPE_IPv4)) { + const struct ipv4_hdr *hdr; + + /* L3 */ + hdr = rte_pktmbuf_mtod_offset(m, const struct ipv4_hdr *, l2_len); + error = adjust_ipv4_pktlen(m, l2_len); + if (error) { + rte_pktmbuf_free(m); + return NULL; + } + l3_len = get_ipv4_hdr_len(m, l2_len, IPPROTO_MAX + 1, 1); + + if ((m->packet_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_FRAG) { + m = process_ipv4_frag(m, ctx, l2_len, l3_len); + if (m == NULL) + return NULL; + hdr = rte_pktmbuf_mtod_offset(m, const struct ipv4_hdr*, + m->l2_len); + l3_len = get_ipv4_hdr_len(m, m->l2_len, + IPPROTO_MAX + 1, 0); + } + + /* L4 */ + switch (hdr->next_proto_id) { + case IPPROTO_ICMP: + return icmp_recv(ctx, m, l2_len, l3_len); + case IPPROTO_TCP: + ptypes = RTE_PTYPE_L4_TCP | + RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L2_ETHER; + l4_len = get_tcp_header_size(m, l2_len, l3_len); + break; + case IPPROTO_UDP: + ptypes = RTE_PTYPE_L4_UDP | + RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L2_ETHER; + l4_len = sizeof(struct udp_hdr); + break; + default: + GLUE_LOG(ERR, "drop ipv4 pkt of unknow L4: (%d)", + hdr->next_proto_id); + rte_pktmbuf_free(m); + return NULL; + } + + } else if (etp == 
rte_be_to_cpu_16(ETHER_TYPE_IPv6) && + dlen >= l2_len + sizeof(struct ipv6_hdr) + sizeof(struct udp_hdr)) { + /* L3 */ + error = adjust_ipv6_pktlen(m, l2_len); + if (error) { + rte_pktmbuf_free(m); + return NULL; + } + proto = 0; + l3_len = get_ipv6x_hdr_len(m, l2_len, &proto); + + if ((m->packet_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_FRAG) { + m = process_ipv6_frag(m, ctx, l2_len, l3_len); + if (m == NULL) + return NULL; + l3_len = get_ipv6x_hdr_len(m, m->l2_len, &proto); + } + + /* L4 */ + switch (proto) { + case IPPROTO_TCP: + ptypes = RTE_PTYPE_L4_TCP | + RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L2_ETHER; + l4_len = get_tcp_header_size(m, l2_len, l3_len); + break; + case IPPROTO_UDP: + ptypes = RTE_PTYPE_L4_UDP | + RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L2_ETHER; + l4_len = sizeof(struct udp_hdr); + break; + case IPPROTO_ICMPV6: + return icmp6_recv(ctx, m, l2_len, l3_len); + default: + GLUE_DEBUG("drop ipv6 pkt of unknown L4: (%x)", proto); + rte_pktmbuf_free(m); + return NULL; + } + } else { + GLUE_DEBUG("Drop unknown L3 packet: %x", etp); + rte_pktmbuf_free(m); + return NULL; + } + + m->packet_type = ptypes; + error = fill_pkt_hdr_len(m, l2_len, l3_len, l4_len); + if (error) { + rte_pktmbuf_free(m); + return NULL; + } + + return m; +} + +/* exclude NULLs from the final list of packets. */ +static inline uint32_t +compress_pkt_list(struct rte_mbuf *pkt[], uint32_t nb_pkt, uint32_t nb_zero) +{ + uint32_t i, j, k, l; + + for (j = nb_pkt; nb_zero != 0 && j-- != 0; ) { + + /* found a hole. */ + if (pkt[j] == NULL) { + + /* find how big is it. */ + for (i = j; i-- != 0 && pkt[i] == NULL; ) + ; + /* fill the hole. 
*/ + for (k = j + 1, l = i + 1; k != nb_pkt; k++, l++) + pkt[l] = pkt[k]; + + nb_pkt -= j - i; + nb_zero -= j - i; + j = i + 1; + } + } + + return nb_pkt; +} + +static inline struct rte_mbuf * +common_fill_hdr_len(struct rte_mbuf *m, uint32_t tp, struct glue_ctx *ctx) +{ + uint32_t l4_len, l3_len, l2_len = sizeof(struct ether_hdr); + int32_t error = 0; + + switch (tp) { + /* possibly fragmented packets. */ + case (RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L2_ETHER): + case (RTE_PTYPE_L3_IPV4_EXT | RTE_PTYPE_L2_ETHER): + l3_len = get_ipv4_hdr_len(m, l2_len, IPPROTO_MAX + 1, 1); + if ((m->packet_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_FRAG) { + m = process_ipv4_frag(m, ctx, l2_len, l3_len); + if (m == NULL) + return NULL; + tp = m->packet_type & (RTE_PTYPE_L2_MASK | + RTE_PTYPE_L3_MASK | + RTE_PTYPE_L4_MASK); + } + break; + case (RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L2_ETHER): + case (RTE_PTYPE_L3_IPV6_EXT | RTE_PTYPE_L2_ETHER): + l3_len = get_ipv6_hdr_len(m, l2_len, IPPROTO_MAX + 1); + if ((m->packet_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_FRAG) { + m = process_ipv6_frag(m, ctx, l2_len, l3_len); + if (m == NULL) + return NULL; + tp = m->packet_type & (RTE_PTYPE_L2_MASK | + RTE_PTYPE_L3_MASK | + RTE_PTYPE_L4_MASK); + } + break; + } + + switch (tp) { + /* non fragmented tcp packets. 
*/ + case (RTE_PTYPE_L4_TCP | RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L2_ETHER): + l3_len = sizeof(struct ipv4_hdr); + l4_len = get_tcp_header_size(m, l2_len, l3_len); + error = adjust_ipv4_pktlen(m, l2_len); + break; + case (RTE_PTYPE_L4_TCP | RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L2_ETHER): + l3_len = sizeof(struct ipv6_hdr); + l4_len = get_tcp_header_size(m, l2_len, l3_len); + error = adjust_ipv6_pktlen(m, l2_len); + break; + case (RTE_PTYPE_L4_TCP | RTE_PTYPE_L3_IPV4_EXT | RTE_PTYPE_L2_ETHER): + l3_len = get_ipv4_hdr_len(m, l2_len, + IPPROTO_TCP, 0); + l4_len = get_tcp_header_size(m, l2_len, l3_len); + error = adjust_ipv4_pktlen(m, l2_len); + break; + case (RTE_PTYPE_L4_TCP | RTE_PTYPE_L3_IPV6_EXT | RTE_PTYPE_L2_ETHER): + l3_len = get_ipv6_hdr_len(m, l2_len, IPPROTO_TCP); + l4_len = get_tcp_header_size(m, l2_len, l3_len); + error = adjust_ipv6_pktlen(m, l2_len); + break; + + /* non fragmented udp packets. */ + case (RTE_PTYPE_L4_UDP | RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L2_ETHER): + l3_len = sizeof(struct ipv4_hdr); + l4_len = sizeof(struct udp_hdr); + error = adjust_ipv4_pktlen(m, l2_len); + break; + case (RTE_PTYPE_L4_UDP | RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L2_ETHER): + l3_len = sizeof(struct ipv6_hdr); + l4_len = sizeof(struct udp_hdr); + error = adjust_ipv6_pktlen(m, l2_len); + break; + case (RTE_PTYPE_L4_UDP | RTE_PTYPE_L3_IPV4_EXT | RTE_PTYPE_L2_ETHER): + l3_len = get_ipv4_hdr_len(m, l2_len, + IPPROTO_UDP, 0); + l4_len = sizeof(struct udp_hdr); + error = adjust_ipv4_pktlen(m, l2_len); + break; + case (RTE_PTYPE_L4_UDP | RTE_PTYPE_L3_IPV6_EXT | RTE_PTYPE_L2_ETHER): + l3_len = get_ipv6_hdr_len(m, l2_len, IPPROTO_UDP); + l4_len = sizeof(struct udp_hdr); + error = adjust_ipv6_pktlen(m, l2_len); + break; + default: + GLUE_LOG(ERR, "drop unknown pkt"); + rte_pktmbuf_free(m); + return NULL; + } + + if (error) { + rte_pktmbuf_free(m); + return NULL; + } + error = fill_pkt_hdr_len(m, l2_len, l3_len, l4_len); + if (error) { + rte_pktmbuf_free(m); + return NULL; + } + return m; +} + + +/* + * 
HW can recognize L2-arp/L3 with/without extensions/L4 (i40e)
+ *
+ * ARP and ICMP/ICMPv6 packets are handed to arp_recv()/icmp_recv()/
+ * icmp6_recv() and removed from the burst; everything else goes through
+ * common_fill_hdr_len(). Returns the number of packets left in pkt[].
+ */
+static uint16_t
+type0_rx_callback(uint16_t port,
+	uint16_t queue,
+	struct rte_mbuf *pkt[],
+	uint16_t nb_pkts,
+	uint16_t max_pkts,
+	void *user_param)
+{
+	uint32_t j, tp, l2_len, l3_len;
+	struct glue_ctx *ctx;
+	uint16_t nb_zero = 0;
+
+	RTE_SET_USED(port);
+	RTE_SET_USED(queue);
+	RTE_SET_USED(max_pkts);
+
+	ctx = user_param;
+
+	for (j = 0; j != nb_pkts; j++) {
+		tp = pkt[j]->packet_type & (RTE_PTYPE_L4_MASK |
+			RTE_PTYPE_L3_MASK | RTE_PTYPE_L2_MASK);
+
+		switch (tp) {
+		case (RTE_PTYPE_L2_ETHER_ARP):
+			arp_recv(ctx, pkt[j], sizeof(struct ether_hdr));
+			pkt[j] = NULL;
+			nb_zero++;
+			break;
+		case (RTE_PTYPE_L4_ICMP | RTE_PTYPE_L3_IPV4 |
+			RTE_PTYPE_L2_ETHER):
+		case (RTE_PTYPE_L4_ICMP | RTE_PTYPE_L3_IPV4_EXT |
+			RTE_PTYPE_L2_ETHER):
+			l2_len = sizeof(struct ether_hdr);
+			l3_len = get_ipv4_hdr_len(pkt[j], l2_len, IPPROTO_ICMP, 0);
+			icmp_recv(ctx, pkt[j], l2_len, l3_len);
+			pkt[j] = NULL;
+			nb_zero++;
+			break;
+		case (RTE_PTYPE_L4_ICMP | RTE_PTYPE_L3_IPV6 |
+			RTE_PTYPE_L2_ETHER):
+		case (RTE_PTYPE_L4_ICMP | RTE_PTYPE_L3_IPV6_EXT |
+			RTE_PTYPE_L2_ETHER):
+			l2_len = sizeof(struct ether_hdr);
+			l3_len = get_ipv6_hdr_len(pkt[j], l2_len, IPPROTO_ICMPV6);
+			icmp6_recv(ctx, pkt[j], l2_len, l3_len);
+			pkt[j] = NULL;
+			nb_zero++;
+			break;
+		default:
+			/* Keep the returned mbuf: after fragment reassembly
+			 * it may not be the mbuf that was passed in.
+			 */
+			pkt[j] = common_fill_hdr_len(pkt[j], tp, ctx);
+			nb_zero += (pkt[j] == NULL);
+			break;
+		}
+	}
+
+	if (nb_zero == 0)
+		return nb_pkts;
+
+	return compress_pkt_list(pkt, nb_pkts, nb_zero);
+}
+
+/*
+ * HW can recognize L2/L3/L4 and fragments; but cannot recognize ARP
+ * nor ICMP (ixgbe).
+ */
+/* L2-only frames are inspected for ARP, and L3-only IPv4/IPv6 packets for
+ * ICMP/ICMPv6, in software; anything else in those classes is dropped.
+ * Packets with a recognized L4 type go through common_fill_hdr_len().
+ * Returns the number of packets left in pkt[].
+ */
+static uint16_t
+type1_rx_callback(uint16_t port,
+	uint16_t queue,
+	struct rte_mbuf *pkt[],
+	uint16_t nb_pkts,
+	uint16_t max_pkts,
+	void *user_param)
+{
+	uint32_t j, tp, l2_len, l3_len;
+	struct glue_ctx *ctx;
+	uint16_t nb_zero = 0;
+	const struct ether_hdr *eth;
+	const struct ipv4_hdr *ip4;
+	const struct ipv6_hdr *ip6;
+	uint16_t etp;
+
+	RTE_SET_USED(port);
+	RTE_SET_USED(queue);
+	RTE_SET_USED(max_pkts);
+
+	ctx = user_param;
+
+	for (j = 0; j != nb_pkts; j++) {
+		tp = pkt[j]->packet_type & (RTE_PTYPE_L4_MASK | RTE_PTYPE_L3_MASK |
+			RTE_PTYPE_L2_MASK);
+
+		switch (tp) {
+		case RTE_PTYPE_L2_ETHER:
+			eth = rte_pktmbuf_mtod(pkt[j], const struct ether_hdr *);
+			etp = eth->ether_type;
+			if (etp == rte_be_to_cpu_16(ETHER_TYPE_ARP))
+				arp_recv(ctx, pkt[j], sizeof(*eth));
+			else
+				/* dropped frames must be released, as in the
+				 * IPv4/IPv6 cases below
+				 */
+				rte_pktmbuf_free(pkt[j]);
+
+			pkt[j] = NULL;
+			nb_zero++;
+			break;
+		case (RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L2_ETHER):
+		case (RTE_PTYPE_L3_IPV4_EXT | RTE_PTYPE_L2_ETHER):
+			ip4 = rte_pktmbuf_mtod_offset(pkt[j],
+				const struct ipv4_hdr *,
+				sizeof(*eth));
+			if (ip4->next_proto_id == IPPROTO_ICMP) {
+				l2_len = sizeof(struct ether_hdr);
+				l3_len = get_ipv4_hdr_len(pkt[j], l2_len,
+					IPPROTO_ICMP, 0);
+				icmp_recv(ctx, pkt[j], l2_len, l3_len);
+			} else
+				rte_pktmbuf_free(pkt[j]);
+
+			pkt[j] = NULL;
+			nb_zero++;
+			break;
+		case (RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L2_ETHER):
+		case (RTE_PTYPE_L3_IPV6_EXT | RTE_PTYPE_L2_ETHER):
+			ip6 = rte_pktmbuf_mtod_offset(pkt[j],
+				const struct ipv6_hdr *,
+				sizeof(*eth));
+			if (ip6->proto == IPPROTO_ICMPV6) {
+				l2_len = sizeof(struct ether_hdr);
+				l3_len = get_ipv6_hdr_len(pkt[j], l2_len,
+					IPPROTO_ICMPV6);
+				icmp6_recv(ctx, pkt[j], l2_len, l3_len);
+			} else
+				rte_pktmbuf_free(pkt[j]);
+
+			pkt[j] = NULL;
+			nb_zero++;
+			break;
+		default:
+			/* Keep the returned mbuf: after fragment reassembly
+			 * it may not be the mbuf that was passed in.
+			 */
+			pkt[j] = common_fill_hdr_len(pkt[j], tp, ctx);
+			nb_zero += (pkt[j] == NULL);
+			break;
+		}
+	}
+
+	if (nb_zero == 0)
+		return nb_pkts;
+
+	return compress_pkt_list(pkt, nb_pkts, nb_zero);
+}
+
+/*
+ * generic, assumes HW doesn't recognize any packet
type.
+ */
+uint16_t
+typen_rx_callback(uint16_t port,
+		  uint16_t queue,
+		  struct rte_mbuf *pkt[],
+		  uint16_t nb_pkts,
+		  uint16_t max_pkts,
+		  void *user_param)
+{
+	uint32_t j;
+	uint16_t nb_zero;
+	struct glue_ctx *ctx;
+
+	RTE_SET_USED(port);
+	RTE_SET_USED(queue);
+	RTE_SET_USED(max_pkts);
+
+	ctx = user_param;
+
+	nb_zero = 0;
+	for (j = 0; j != nb_pkts; j++) {
+		/* fix me: now we avoid checking ip checksum */
+		pkt[j]->ol_flags &= (~PKT_RX_IP_CKSUM_BAD);
+		pkt[j]->packet_type = 0;
+		/* a NULL result means the packet was dropped from the burst */
+		pkt[j] = fill_ptypes_and_hdr_len(ctx, pkt[j]);
+		nb_zero += (pkt[j] == NULL);
+	}
+
+	if (nb_zero == 0)
+		return nb_pkts;
+
+	return compress_pkt_list(pkt, nb_pkts, nb_zero);
+}
+
+/*
+ * Query the ptypes the port's driver reports and fold them into a mask of
+ * the *_PTYPE bits used by setup_rx_cb().  Returns 0 when the query fails
+ * or nothing is reported.
+ */
+static uint32_t
+get_ptypes(uint16_t port_id)
+{
+	uint32_t smask;
+	int32_t i, n, rc;
+	const uint32_t pmask =
+		RTE_PTYPE_L2_MASK | RTE_PTYPE_L3_MASK | RTE_PTYPE_L4_MASK;
+
+	smask = 0;
+	rc = rte_eth_dev_get_supported_ptypes(port_id, pmask, NULL, 0);
+	if (rc < 0) {
+		RTE_LOG(ERR, USER1,
+			"%s(port=%u) failed to get supported ptypes;\n",
+			__func__, port_id);
+		return smask;
+	}
+
+	/* nothing reported: do not declare a zero-length VLA */
+	if (rc == 0)
+		return smask;
+
+	n = rc;
+	uint32_t ptype[n];
+	rc = rte_eth_dev_get_supported_ptypes(port_id, pmask, ptype, n);
+	if (rc < 0) {
+		RTE_LOG(ERR, USER1,
+			"%s(port=%u) failed to get supported ptypes;\n",
+			__func__, port_id);
+		return smask;
+	}
+	/* never walk past the entries that were actually filled in */
+	if (rc > n)
+		rc = n;
+
+	for (i = 0; i != rc; i++) {
+		switch (ptype[i]) {
+		case RTE_PTYPE_L2_ETHER_ARP:
+			smask |= ETHER_ARP_PTYPE;
+			break;
+		case RTE_PTYPE_L3_IPV4:
+		case RTE_PTYPE_L3_IPV4_EXT_UNKNOWN:
+			smask |= IPV4_PTYPE;
+			break;
+		case RTE_PTYPE_L3_IPV4_EXT:
+			smask |= IPV4_EXT_PTYPE;
+			break;
+		case RTE_PTYPE_L3_IPV6:
+		case RTE_PTYPE_L3_IPV6_EXT_UNKNOWN:
+			smask |= IPV6_PTYPE;
+			break;
+		case RTE_PTYPE_L3_IPV6_EXT:
+			smask |= IPV6_EXT_PTYPE;
+			break;
+		case RTE_PTYPE_L4_TCP:
+			smask |= TCP_PTYPE;
+			break;
+		case RTE_PTYPE_L4_UDP:
+			smask |= UDP_PTYPE;
+			break;
+		case RTE_PTYPE_L4_ICMP:
+			smask |= ICMP_PTYPE;
+			break;
+		default:
+			break;
+		}
+	}
+
+	return smask;
+}
+
+/* In rx callbacks, we need to check and make sure below things are done,
+ * either by hw or by sw:
+ * 1.
filter out arp packets, and handle arp packets properly
+ *    - for arp request packet, reply arp if it's requesting myself.
+ * 2. fill l2, l3, l4 header length
+ *
+ * 3. GSO/GRO setup (TODO)
+ *
+ */
+int
+setup_rx_cb(uint16_t port_id, uint16_t qid)
+{
+	int32_t rc;
+	uint32_t i, n, smask;
+	const void *cb;
+	struct glue_ctx *ctx;
+	const struct ptype2cb *ptype2cb;
+
+	/*
+	 * Ordered from most to least capable HW; the first entry whose mask
+	 * is fully covered by the port's ptypes wins.  The last entry has
+	 * mask 0 and therefore always matches.
+	 */
+	static const struct ptype2cb tcp_arp_ptype2cb[] = {
+		{ /* i40e */
+			.mask = ETHER_ARP_PTYPE |
+				ICMP_PTYPE |
+				IPV4_PTYPE | IPV4_EXT_PTYPE |
+				IPV6_PTYPE | IPV6_EXT_PTYPE |
+				TCP_PTYPE | UDP_PTYPE,
+			.name = "HW l2-arp/l3x/l4-tcp ptype",
+			.fn = type0_rx_callback,
+		},
+		{ /* ixgbe does not support ARP ptype */
+			.mask = IPV4_PTYPE | IPV4_EXT_PTYPE |
+				IPV6_PTYPE | IPV6_EXT_PTYPE |
+				TCP_PTYPE | UDP_PTYPE,
+			.name = "HW l3x/l4-tcp ptype",
+			.fn = type1_rx_callback,
+		},
+		{ /* virtio */
+			.mask = 0,
+			.name = "HW does not support any ptype",
+			.fn = typen_rx_callback,
+		},
+	};
+
+	ctx = glue_ctx_lookup(port_id, qid);
+	if (ctx == NULL) {
+		GLUE_LOG(ERR, "no ctx found by port(%d) and queue (%d)",
+			port_id, qid);
+		return -EINVAL;
+	}
+
+	smask = get_ptypes(port_id);
+
+	ptype2cb = tcp_arp_ptype2cb;
+	n = RTE_DIM(tcp_arp_ptype2cb);
+
+	for (i = 0; i != n; i++) {
+		if ((smask & ptype2cb[i].mask) == ptype2cb[i].mask) {
+			cb = rte_eth_add_rx_callback(port_id, qid,
+				ptype2cb[i].fn, ctx);
+			/* read rte_errno before logging */
+			rc = -rte_errno;
+			/* NOTE(review): logged at ERR level on success too */
+			GLUE_LOG(ERR, "%s(port=%u), setup RX callback \"%s\";",
+				__func__, port_id, ptype2cb[i].name);
+			return ((cb == NULL) ? rc : 0);
+		}
+	}
+
+	GLUE_LOG(ERR, "%s(port=%u) failed to find an appropriate callback",
+		__func__, port_id);
+	return -ENOENT;
+}
diff --git a/lib/libtle_glue/rxtx.c b/lib/libtle_glue/rxtx.c
new file mode 100644
index 0000000..b80a3ac
--- /dev/null
+++ b/lib/libtle_glue/rxtx.c
@@ -0,0 +1,573 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "sym.h"
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+
+#include "tle_glue.h"
+#include "fd.h"
+#include "util.h"
+#include "internal.h"
+
+rte_atomic32_t thr_cnt;
+
+/*
+ * Largest UDP payload: the 16-bit length limit (65535) minus the IPv4 and
+ * UDP headers.  Must be (1 << 16); (2 << 16) is 131072.
+ */
+#define MAX_UDP_PKT_LEN ((1 << 16) - 1 - sizeof(struct ipv4_hdr) - sizeof(struct udp_hdr))
+
+/*
+ * Copy up to len bytes from the mbuf chain m into buf (the copy itself is
+ * skipped when needcpy is 0).
+ * Without ispeek, fully consumed segments are freed and a partly consumed
+ * one is trimmed; the return value is the first segment that still holds
+ * data, or NULL when the chain was used up.
+ * With ispeek the chain is left untouched and the original head is returned.
+ */
+static inline struct rte_mbuf *
+from_mbuf_to_buf(struct rte_mbuf *m, char *buf,
+		 size_t len, int ispeek, int needcpy)
+{
+	void *src;
+	uint32_t done = 0;
+	uint32_t left = len, orig_pkt_len;
+	uint16_t copy_len, seg_len, segs;
+	struct rte_mbuf *m_next, *orig_pkt;
+
+	if (len == 0)
+		return m;
+
+	orig_pkt = m;
+	orig_pkt_len = m->pkt_len;
+	segs = m->nb_segs;
+
+	do {
+		seg_len = rte_pktmbuf_data_len(m);
+		copy_len = RTE_MIN(seg_len, left);
+		src = rte_pktmbuf_mtod(m, void *);
+		if (needcpy)
+			rte_memcpy(buf + done, src, copy_len);
+		done += copy_len;
+		left -= copy_len;
+		if (copy_len < seg_len) {
+			/* buffer exhausted inside this segment */
+			if (!ispeek)
+				rte_pktmbuf_adj(m, copy_len);
+			break;
+		}
+		m_next = m->next;
+		if (!ispeek) {
+			rte_pktmbuf_free_seg(m);
+			segs--;
+		}
+		m = m_next;
+	} while (left && m);
+
+	/* the surviving segment becomes the new head of the chain */
+	if (m && !ispeek) {
+		m->nb_segs = segs;
+		m->pkt_len = orig_pkt_len - done;
+	}
+
+	if(ispeek)
+		return orig_pkt;
+	else
+		return m;
+}
+
+/* True when errno is EAGAIN and the socket's error event is raised. */
+static inline bool
+is_peer_closed(struct sock *so)
+{
+	if (errno == EAGAIN && tle_event_state(&so->erev) == TLE_SEV_UP)
+		return true;
+
+	return false;
+}
+
+static ssize_t
+_recv(int
sockfd, void *buf, size_t len, struct sockaddr *src_addr, int flags)
+{
+	int rx;
+	ssize_t rc;
+	ssize_t recvlen;
+	size_t tmplen;
+	struct sock *so;
+	struct rte_mbuf *m;
+	struct epoll_event event;
+	int needcpy;
+
+	/* give a thread without an lcore id one from the shared counter */
+	if (RTE_PER_LCORE(_lcore_id) == LCORE_ID_ANY) {
+		RTE_PER_LCORE(_lcore_id) = rte_atomic32_add_return(&thr_cnt, 1);
+	}
+
+	so = fd2sock(sockfd);
+
+	/* no stream attached to this socket yet */
+	if (so->s == NULL) {
+		if (IS_UDP(so) && is_nonblock(so, flags))
+			errno = EAGAIN;
+		else
+			errno = ENOTCONN;
+		return -1;
+	}
+
+	if (so->rx_left) {
+		/* resume from the packet left over by a previous call */
+		m = so->rx_left;
+		so->rx_left = NULL;
+		if (src_addr) {
+			OPS(so)->getname(so, src_addr, 1);
+			/* fixme: cannot get addr for UDP in this way */
+		}
+	} else {
+		/* NOTE(review): only rc == 0 is handled; if ops->recv can
+		 * return a negative value, m is used uninitialized below */
+		rc = OPS(so)->recv(so->s, &m, 1, src_addr);
+		if (rc == 0) {
+			if (is_nonblock(so, flags)) {
+				/* socket closed, return 0 */
+				if (is_peer_closed(so)) {
+					GLUE_DEBUG("peer closed: %d", sockfd);
+					return 0;
+				}
+
+				/* According to linux stack,
+				 * receive from shutdown tcp socket returns 0.
+				 * And receive from shutdown udp socket generate
+				 * EAGAIN. In special case, we return ESHUTDOWN
+				 * to notify upper application.
+				 */
+				if (so->shutdown & RECV_SHUTDOWN) {
+					if (so->proto == PROTO_TCP)
+						return 0;
+					else {
+#ifdef LOOK_ASIDE_BACKEND
+						errno = ESHUTDOWN;
+#else
+						errno = EAGAIN;
+#endif
+						return -1;
+					}
+				}
+				/* errno is whatever ops->recv left behind */
+				return -1;
+			}
+
+			/* blocking mode: wait and retry until a packet arrives */
+			do {
+				/* in blocking mode, recv from shutdown socket
+				 * return 0 immediately */
+				if (so->shutdown & RECV_SHUTDOWN)
+					return 0;
+
+				/* some error occurred, return -1 */
+				if (errno != EAGAIN)
+					return -1;
+
+				/* socket closed, return 0 */
+				if (is_peer_closed(so)) {
+					GLUE_DEBUG("peer closed: %d", sockfd);
+					return 0;
+				}
+
+				epoll_kernel_wait(CTX(so), -1, &event, 1, 1, &rx);
+
+				be_process(CTX(so));
+			} while((rc = OPS(so)->recv(so->s, &m, 1, src_addr)) == 0);
+		}
+	}
+
+	/* get one pkt */
+	if (!so->option.timestamp)
+		so->s->timestamp = m->timestamp;
+
+	needcpy = 1;
+	recvlen = RTE_MIN(m->pkt_len, len);
+	if (flags & MSG_TRUNC) {
+		if (IS_UDP(so))
+			/* report the full datagram length, even if truncated */
+			recvlen = m->pkt_len;
+		else
+			/* According to linux manual, data will be discarded
+			 * if recv TCP stream with MSG_TRUNC flag */
+			needcpy = 0;
+	}
+
+	so->rx_left = from_mbuf_to_buf(m, buf, len, flags & MSG_PEEK, needcpy);
+
+	/* UDP: whatever did not fit into buf is discarded */
+	if (((flags & MSG_PEEK) == 0) && IS_UDP(so) && so->rx_left) {
+		rte_pktmbuf_free(so->rx_left);
+		so->rx_left = NULL;
+	}
+
+	/* UDP socket only receive one pkt at one time */
+	if (IS_UDP(so) || (flags & MSG_PEEK)) {
+		return recvlen;
+	}
+	/* TCP socket: try best to fill buf */
+	len -= recvlen;
+	buf = (char*)buf + recvlen;
+	while (len) {
+		if (OPS(so)->recv(so->s, &m, 1, src_addr) == 0)
+			break;
+
+		tmplen = (m->pkt_len < len) ?
m->pkt_len : len;
+		so->rx_left = from_mbuf_to_buf(m, buf, tmplen, 0, needcpy);
+		len -= tmplen;
+		recvlen += tmplen;
+		buf = (char*)buf + tmplen;
+	}
+
+	/* data is still pending: keep the rx event raised */
+	if (so->rx_left)
+		tle_event_raise(&so->rxev);
+
+	/* may send window increase ACK after receive*/
+	if (recvlen > 0)
+		be_tx_with_lock(CTX(so));
+
+	return recvlen;
+}
+
+/*
+ * recv(2) replacement.  Kernel fds go to k_recv() so that flags are
+ * honoured; user-space sockets are served by _recv().
+ */
+ssize_t PRE(recv)(int sockfd, void *buf, size_t len, int flags)
+{
+	if (is_kernel_fd(sockfd))
+		return k_recv(sockfd, buf, len, flags);
+
+	return _recv(sockfd, buf, len, NULL, flags);
+}
+
+/*
+ * recvfrom(2) replacement.  On success with a non-NULL src_addr, *addrlen
+ * is set from the returned address family.
+ */
+ssize_t PRE(recvfrom)(int sockfd, void *buf, size_t len, int flags,
+		      struct sockaddr *src_addr, socklen_t *addrlen)
+{
+	ssize_t rc;
+	/* NOTE(review): the kernel path drops src_addr/addrlen, so the
+	 * caller never receives the peer address for kernel fds */
+	if (is_kernel_fd(sockfd))
+		return k_recv(sockfd, buf, len, flags);
+
+	if (src_addr && !addrlen) {
+		errno = EINVAL;
+		return -1;
+	}
+	rc = _recv(sockfd, buf, len, src_addr, flags);
+	if (rc >= 0 && src_addr) {
+		if (src_addr->sa_family == AF_INET) {
+			*addrlen = sizeof(struct sockaddr_in);
+		} else {
+			*addrlen = sizeof(struct sockaddr_in6);
+		}
+	}
+	return rc;
+}
+
+/* try_recvmsg() result meaning "nothing read, take the blocking path" */
+#define RECV_CONTINUE (-2)
+/*
+ * One non-waiting attempt through the protocol's readv op.
+ * Returns the byte count, 0 when the peer closed, -1 on error or when the
+ * socket is non-blocking and empty, RECV_CONTINUE otherwise.
+ */
+static inline ssize_t
+try_recvmsg(struct sock *so, struct msghdr *msg, int flags)
+{
+	ssize_t sz;
+
+	if (so->s == NULL) {
+		if (IS_UDP(so) && is_nonblock(so, flags))
+			errno = EAGAIN;
+		else
+			errno = ENOTCONN;
+		return -1;
+	}
+
+	sz = OPS(so)->readv(so->s, msg, flags);
+	if (sz >= 0) { /* get data */
+		/* may send window increase ACK after receive*/
+		if (sz > 0)
+			be_tx_with_lock(CTX(so));
+		return sz;
+	}
+	else if (errno != EAGAIN) /* error occurred */
+		return -1;
+	else if (is_peer_closed(so)) {
+		GLUE_DEBUG("peer closed: %d", so->fd);
+		return 0;
+	} else if (is_nonblock(so, flags))
+		return -1;
+
+	return RECV_CONTINUE;
+}
+
+ssize_t PRE(recvmsg)(int sockfd, struct msghdr *msg, int flags)
+{
+	ssize_t sz;
+	struct sock *so;
+
+	if (is_kernel_fd(sockfd))
+		return k_recvmsg(sockfd, msg, flags);
+
+	so = fd2sock(sockfd);
+
+	if (so->rx_left == NULL && OPS(so)->readv &&
+	    (flags & MSG_PEEK) == 0 &&
+	    ((flags & MSG_TRUNC) == 0
|| so->proto == PROTO_UDP)) {
+		/* udp_readv supports MSG_TRUNC, tcp_readv not yet.
+		 * so only udp socket implement with readv interface.
+		 */
+		sz = try_recvmsg(so, msg, flags);
+		if (sz != RECV_CONTINUE)
+			return sz;
+	}
+
+	/* 1. rx_left != NULL; 2. get no data, fall back to blocking read */
+
+	/* NOTE(review): msg is tested for NULL here but dereferenced
+	 * unconditionally above and below */
+	if (so->rx_left != NULL && msg != NULL && msg->msg_control != NULL) {
+		if (so->option.timestamp)
+			tle_set_timestamp(msg, so->rx_left);
+		else
+			msg->msg_controllen = 0;
+	}
+
+	/* the fallback path only fills the first iovec entry */
+	sz = PRE(recvfrom)(sockfd, msg->msg_iov[0].iov_base,
+			   msg->msg_iov[0].iov_len, flags,
+			   (struct sockaddr *)msg->msg_name,
+			   &msg->msg_namelen);
+
+	return sz;
+}
+
+/* read(2) replacement: kernel fds go to k_read(), others to _recv(). */
+ssize_t PRE(read)(int fd, void *buf, size_t count)
+{
+	if (is_kernel_fd(fd))
+		return k_read(fd, buf, count);
+
+	return _recv(fd, buf, count, NULL, 0);
+}
+
+/* strip const from var and cast it to type */
+#define DECONST(type, var) ((type)(uintptr_t)(const void *)(var))
+
+/*
+ * readv(2) replacement.  Tries the protocol's readv op first when no
+ * leftover packet is pending; otherwise falls back to _recv() on the
+ * first iovec entry only.
+ */
+ssize_t PRE(readv)(int fd, const struct iovec *iov, int iovcnt)
+{
+	ssize_t sz;
+	struct sock *so;
+	struct msghdr msg;
+
+	if (is_kernel_fd(fd))
+		return k_readv(fd, iov, iovcnt);
+
+	if (RTE_PER_LCORE(_lcore_id) == LCORE_ID_ANY) {
+		RTE_PER_LCORE(_lcore_id) = rte_atomic32_add_return(&thr_cnt, 1);
+	}
+
+	so = fd2sock(fd);
+
+	if (so->rx_left == NULL && OPS(so)->readv) {
+		/* wrap the iovec in a msghdr so try_recvmsg() can be reused */
+		memset(&msg, 0, sizeof(msg));
+		msg.msg_iov = DECONST(struct iovec *, iov);
+		msg.msg_iovlen = iovcnt;
+		sz = try_recvmsg(so, &msg, 0);
+		if (sz != RECV_CONTINUE)
+			return sz;
+	}
+
+	/* 1. rx_left != NULL; 2. get no data, fall back to blocking read */
+
+	/* fixme: when so->rx_left != NULL, also needs readv.
+	 * maybe need to modify readv interface args of ops */
+	return _recv(fd, iov[0].iov_base, iov[0].iov_len, NULL, 0);
+}
+
+/*
+ * Copy len bytes from buf into freshly allocated mbufs (one per
+ * RTE_MBUF_DEFAULT_DATAROOM chunk) and hand them to the protocol's send op.
+ * Blocking sockets retry until everything is queued, an error other than
+ * EAGAIN occurs, or the error event is raised.
+ * Returns the number of bytes queued, or -1 when nothing was queued.
+ */
+static ssize_t
+_send(int sockfd, const void *buf, size_t len,
+      const struct sockaddr *peer, int flags)
+{
+	struct sock *so = fd2sock(sockfd);
+	struct rte_mempool *mp = get_mempool_by_socket(0); /* fix me */
+	/* most bytes a 16-bit segment count can describe */
+	const size_t max_len = (size_t)UINT16_MAX * RTE_MBUF_DEFAULT_DATAROOM;
+	uint16_t nb_mbufs = (RTE_MIN(len, max_len) + RTE_MBUF_DEFAULT_DATAROOM - 1)
+			    / RTE_MBUF_DEFAULT_DATAROOM;
+	uint16_t i, cnt, copy_len;
+	int rc;
+	struct rte_mbuf *mbufs[nb_mbufs + 1];
+	size_t done = 0;
+	uint32_t left = 0;
+	char *dst;
+	int blocking = !is_nonblock(so, flags);
+
+	if (RTE_PER_LCORE(_lcore_id) == LCORE_ID_ANY) {
+		RTE_PER_LCORE(_lcore_id) = rte_atomic32_add_return(&thr_cnt, 1);
+	}
+
+	/*
+	 * Keep len consistent with nb_mbufs: without this clamp a huge len
+	 * wraps the 16-bit count, fewer bytes are copied than requested and
+	 * the full length is still reported as sent.  A short write is
+	 * returned instead; UDP still fails with EMSGSIZE below.
+	 */
+	if (len > max_len)
+		len = max_len;
+
+	if (!blocking && len > def_sndbuf && so->proto == PROTO_TCP) {
+		len = def_sndbuf;
+		nb_mbufs = (len + RTE_MBUF_DEFAULT_DATAROOM - 1)
+			   / RTE_MBUF_DEFAULT_DATAROOM;
+	}
+
+	if (unlikely(len == 0)) {
+		if (so->proto == PROTO_TCP)
+			return 0;
+		else
+			/* a zero-length datagram still needs one mbuf */
+			nb_mbufs = 1;
+	}
+
+	if (unlikely(len > MAX_UDP_PKT_LEN && IS_UDP(so))) {
+		errno = EMSGSIZE;
+		return -1;
+	}
+
+	if (blocking)
+		be_process(get_ctx());
+
+	if (unlikely(rte_pktmbuf_alloc_bulk(mp, mbufs, nb_mbufs) < 0)) {
+		errno = ENOMEM;
+		return -1;
+	}
+
+	for (i = 0; i < nb_mbufs; ++i) {
+		copy_len = RTE_MIN((size_t)RTE_MBUF_DEFAULT_DATAROOM,
+				   len - done);
+		dst = rte_pktmbuf_mtod(mbufs[i], char *);
+		rte_memcpy(dst, (const char *)buf + done, copy_len);
+		done += copy_len;
+		mbufs[i]->data_len = copy_len;
+		mbufs[i]->pkt_len = copy_len;
+	}
+
+	cnt = 0;
+do_send:
+	rc = OPS(so)->send(so, mbufs + cnt, nb_mbufs - cnt, peer);
+
+	cnt += rc;
+
+	if (cnt > 0)
+		be_tx_with_lock(CTX(so));
+
+	if (cnt > 0 && blocking)
+		be_process(get_ctx());
+
+	/* blocking socket with mbufs still unqueued: process and retry */
+	if (blocking &&
+	    cnt < nb_mbufs &&
+	    (rc > 0 || errno == EAGAIN) &&
+	    tle_event_state(&so->erev) != TLE_SEV_UP) {
+		be_process(get_ctx());
+		goto do_send;
+	}
+
+	/* free what was not queued and count its bytes as unsent */
+	for (i = cnt; i < nb_mbufs; ++i) {
+		left += mbufs[i]->pkt_len;
+		rte_pktmbuf_free_seg(mbufs[i]);
+	}
+
+	if (cnt == 0)
+		return -1;
+	else
+		return len - left;
+}
+
+/*
+ * send(2) replacement.  Kernel fds go to k_sendto() with no destination,
+ * which is what send() means, so that flags (e.g. MSG_NOSIGNAL,
+ * MSG_DONTWAIT) are honoured instead of being dropped by a plain write.
+ */
+ssize_t PRE(send)(int sockfd, const void *buf, size_t len, int flags)
+{
+	if (is_kernel_fd(sockfd))
+		return k_sendto(sockfd, buf, len, flags, NULL, 0);
+
+	/* MSG_NOSIGNAL means "Do not generate SIGPIPE". Ignore this flag */
+	flags &= ~MSG_NOSIGNAL;
+
+	return _send(sockfd, buf, len, NULL, flags);
+}
+
+/* sendto(2) replacement; addrlen is only used on the kernel path. */
+ssize_t PRE(sendto)(int sockfd, const void *buf, size_t len, int flags,
+		    const struct sockaddr *dest_addr, socklen_t addrlen)
+{
+	if (is_kernel_fd(sockfd))
+		return k_sendto(sockfd, buf, len, flags, dest_addr, addrlen);
+
+	/* MSG_NOSIGNAL means "Do not generate SIGPIPE". Ignore this flag */
+	flags &= ~MSG_NOSIGNAL;
+
+	return _send(sockfd, buf, len, dest_addr, flags);
+}
+
+/*
+ * sendmsg(2) replacement.  Uses the protocol's writev op when there is
+ * one; a blocking socket that gets EAGAIN falls back to _send(), which
+ * only transmits the first iovec entry.
+ */
+ssize_t PRE(sendmsg)(int sockfd, const struct msghdr *msg, int flags)
+{
+	ssize_t ret;
+	struct sock *so;
+
+	if (is_kernel_fd(sockfd))
+		return k_sendmsg(sockfd, msg, flags);
+
+	/* MSG_NOSIGNAL means "Do not generate SIGPIPE". Ignore this flag */
+	flags &= ~MSG_NOSIGNAL;
+
+	so = fd2sock(sockfd);
+	if (OPS(so)->writev) {
+		ret = OPS(so)->writev(so, msg->msg_iov, msg->msg_iovlen,
+				      msg->msg_name);
+		if (ret < 0) {
+			if (errno != EAGAIN || is_nonblock(so, flags))
+				return -1;
+		} else {
+			/* TODO: blocking && ret < total length */
+			be_tx_with_lock(CTX(so));
+			return ret;
+		}
+
+		/* fall through to blocking send */
+	}
+
+	return _send(sockfd, msg->msg_iov[0].iov_base, msg->msg_iov[0].iov_len,
+		     (struct sockaddr *)msg->msg_name, flags);
+}
+
+/* write(2) replacement: kernel fds go to k_write(), others to _send(). */
+ssize_t PRE(write)(int fd, const void *buf, size_t count)
+{
+	if (is_kernel_fd(fd))
+		return k_write(fd, buf, count);
+
+	return _send(fd, buf, count, NULL, 0);
+}
+
+ssize_t PRE(writev)(int fd, const struct iovec *iov, int iovcnt)
+{
+	ssize_t ret;
+	struct sock *so;
+
+	if (is_kernel_fd(fd))
+		return k_writev(fd, iov, iovcnt);
+
+	if (RTE_PER_LCORE(_lcore_id) == LCORE_ID_ANY) {
+		RTE_PER_LCORE(_lcore_id) = rte_atomic32_add_return(&thr_cnt, 1);
+	}
+
+	so = fd2sock(fd);
+	if (OPS(so)->writev) {
+		ret
= OPS(so)->writev(so, iov, iovcnt, NULL);
+		if (ret < 0) {
+			/* only a blocking socket that got EAGAIN goes on */
+			if (errno != EAGAIN || is_nonblock(so, 0))
+				return -1;
+		} else {
+			/* TODO: blocking && ret < total length */
+			be_tx_with_lock(CTX(so));
+			return ret;
+		}
+
+		/* fall through to blocking send */
+	}
+
+	/* the fallback path only transmits the first iovec entry */
+	return _send(fd, iov[0].iov_base, iov[0].iov_len, NULL, 0);
+}
+
+/* advanced functions */
+/*
+ * splice(2) is only forwarded when both fds are kernel fds.
+ * NOTE(review): the errno/return after rte_panic() only run if
+ * rte_panic() returns - presumably it aborts; confirm.
+ */
+ssize_t PRE(splice)(int fd_in, loff_t *off_in, int fd_out,
+		    loff_t *off_out, size_t len, unsigned int flags)
+{
+	if (is_kernel_fd(fd_in) && is_kernel_fd(fd_out))
+		return k_splice(fd_in, off_in, fd_out, off_out, len, flags);
+
+	rte_panic("splice is not supported yet");
+	errno = EOPNOTSUPP;
+	return -1;
+}
+
+/* sendfile(2) is only forwarded when both fds are kernel fds. */
+ssize_t PRE(sendfile)(int out_fd, int in_fd, off_t *offset, size_t count)
+{
+	if (is_kernel_fd(out_fd) && is_kernel_fd(in_fd))
+		return k_sendfile(out_fd, in_fd, offset, count);
+
+	rte_panic("sendfile is not supported yet");
+	errno = EOPNOTSUPP;
+	return -1;
+}
diff --git a/lib/libtle_glue/select.c b/lib/libtle_glue/select.c
new file mode 100644
index 0000000..b3b8539
--- /dev/null
+++ b/lib/libtle_glue/select.c
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include
+#include
+#include
+#include
+#include
+
+#include "fd.h"
+#include "ctx.h"
+#include "sym.h"
+#include "log.h"
+#include "util.h"
+#include "internal.h"
+#include "tle_glue.h"
+
+/*
+ * Clear the bits of fds [0, n) in fd_set *s: n bits are n / 8 bytes.
+ * The old n/sizeof(long) was only right where sizeof(long) == 8.
+ */
+#define FD_ZERO_N(s, n) do { memset((s)->fds_bits, 0, (n) / 8); } while(0)
+
+/*
+ * Scan the user-space fds (fd_table.fd_base .. nfds-1) set in fdset for
+ * the given event (EPOLLIN, EPOLLOUT or EPOLLERR).
+ * Ready fds stay set and are added to *total; the others are cleared from
+ * fdset.  Returns how many fds were not ready.
+ */
+static int
+fdset_to_events_user(int nfds, fd_set *fdset, int *total, int event)
+{
+	int i, num = 0;
+	struct sock *so;
+	const struct tle_event *ev;
+
+	for (i = fd_table.fd_base; i < nfds; ++i) {
+		if (!FD_ISSET(i, fdset))
+			continue;
+
+		so = fd2sock(i); /* fix me: check if fd is opened */
+
+		switch (event) {
+		case EPOLLIN:
+			ev = &so->rxev;
+			break;
+		case EPOLLOUT:
+			ev = &so->txev;
+			break;
+		case EPOLLERR:
+			ev = &so->erev;
+			break;
+		default:
+			rte_panic("non-sense value\n");
+		}
+		/* Check event is ready */
+		if (TLE_SEV_UP == tle_event_state(ev)) {
+			*total = *total + 1;
+		} else {
+			FD_CLR(i, fdset);
+			num++;
+		}
+
+		/* We fill sock->event here as we need this when
+		 * we filter events in poll_common(). But it was
+		 * originally set by epoll_ctl(). Now we have to
+		 * assume that there are no application which
+		 * uses epoll/poll/select at the same time.
+		 */
+		so->event.events |= event;
+		so->event.data.u32 = i;
+	}
+
+	return num;
+}
+
+/*
+ * Register every fd in [0, nfds) that is set in fdset with the epoll
+ * instance efd for the given event.  Returns the number of fds visited.
+ * NOTE(review): the k_epoll_ctl() result is ignored, so an fd present in
+ * two sets is presumably rejected on the second EPOLL_CTL_ADD.
+ */
+static int
+fdset_to_events_kernel(int nfds, fd_set *fdset, int efd, int event)
+{
+	int i, num = 0;
+	struct epoll_event k_ev;
+
+	for (i = 0; i < nfds; ++i) {
+		if (!FD_ISSET(i, fdset))
+			continue;
+
+		k_ev.events = event;
+		k_ev.data.u32 = i;
+		k_epoll_ctl(efd, EPOLL_CTL_ADD, i, &k_ev);
+		num++;
+	}
+
+	return num;
+}
+
+/*
+ * select(2) replacement.
+ *  step 1: return at once if any user-space socket in the sets is ready;
+ *  step 2: forward to k_select() when no user-space socket is involved;
+ *  step 3: otherwise wait on user and kernel fds through poll_common().
+ */
+int
+PRE(select)(int nfds, fd_set *readfds, fd_set *writefds,
+	    fd_set *exceptfds, struct timeval *timeout)
+{
+	int to;
+	struct glue_ctx *ctx;
+	int j, efd, total = 0, max = 0;
+
+	/* thread <> context binding happens here */
+	if (RTE_PER_LCORE(glue_ctx) == NULL) {
+		ctx = &ctx_array[glue_ctx_alloc()];
+		RTE_PER_LCORE(glue_ctx) = ctx;
+	} else
+		ctx = RTE_PER_LCORE(glue_ctx);
+
+	/* step 0, process some packets */
+	be_process(ctx);
+
+	/* step 1, check if any userspace events are ready */
+
+	if (readfds)
+		max += fdset_to_events_user(nfds, readfds,
+					    &total, EPOLLIN);
+	if (writefds)
+		max += fdset_to_events_user(nfds, writefds,
+					    &total, EPOLLOUT);
+	/* scan exceptfds itself; writefds may be NULL here */
+	if (exceptfds)
+		max += fdset_to_events_user(nfds, exceptfds,
+					    &total, EPOLLERR);
+	if (total > 0) {
+		/* userspace events go firstly */
+		if (readfds)
+			FD_ZERO_N(readfds, fd_table.fd_base);
+		if (writefds)
+			FD_ZERO_N(writefds, fd_table.fd_base);
+		if (exceptfds)
+			FD_ZERO_N(exceptfds, fd_table.fd_base);
+
+		return total;
+	}
+
+	/* step 2, only wait for kernel events?
*/
+	if (max == 0)
+		return k_select(nfds, readfds, writefds, exceptfds, timeout);
+
+	/* step 3, slow path: wait for I/O and kernel events */
+	efd = k_epoll_create(1);
+	if (efd < 0)
+		rte_panic("k_epoll_create failed %d", errno);
+
+	/* only fds below fd_base belong to the kernel */
+	nfds = RTE_MIN(nfds, fd_table.fd_base);
+	if (readfds)
+		max += fdset_to_events_kernel(nfds, readfds,
+					      efd, EPOLLIN);
+	if (writefds)
+		max += fdset_to_events_kernel(nfds, writefds,
+					      efd, EPOLLOUT);
+	if (exceptfds)
+		max += fdset_to_events_kernel(nfds, exceptfds,
+					      efd, EPOLLERR);
+
+	struct epoll_event events[max];
+
+	/* struct timeval -> milliseconds, -1 means wait forever */
+	if (timeout)
+		to = timeout->tv_sec * 1000 + timeout->tv_usec / 1000;
+	else
+		to = -1;
+	total = poll_common(ctx, events, max, to, efd);
+
+	k_close(efd);
+	for (j = 0; j < total; ++j) {
+		/*
+		 * A set may be NULL while its event bit is still reported:
+		 * sock->event.events only ever accumulates bits, so guard
+		 * every set, not just exceptfds.
+		 */
+		if ((events[j].events & EPOLLIN) && readfds)
+			FD_SET(events[j].data.fd, readfds);
+
+		if ((events[j].events & EPOLLOUT) && writefds)
+			FD_SET(events[j].data.fd, writefds);
+
+		if ((events[j].events & (EPOLLHUP | EPOLLERR)) && exceptfds)
+			FD_SET(events[j].data.fd, exceptfds);
+	}
+	return total;
+}
+
+/*
+ * pselect(2) replacement: converts the timespec to a timeval and defers
+ * to select().  A non-NULL sigmask is not supported and panics.
+ */
+int
+PRE(pselect)(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds,
+	     const struct timespec *timeout, const sigset_t *sigmask)
+{
+	struct timeval tv, *tv_to;
+
+	if (sigmask != NULL)
+		rte_panic("pselect with signal is not supported");
+
+	if (timeout) {
+		tv.tv_usec = timeout->tv_nsec / 1000;
+		tv.tv_sec = timeout->tv_sec;
+		tv_to = &tv;
+	} else
+		tv_to = NULL;
+
+	/* NOTE(review): calls select(), not PRE(select) - confirm this
+	 * resolves to the wrapper above */
+	return select(nfds, readfds, writefds, exceptfds, tv_to);
+}
diff --git a/lib/libtle_glue/sock.h b/lib/libtle_glue/sock.h
new file mode 100644
index 0000000..fcd6362
--- /dev/null
+++ b/lib/libtle_glue/sock.h
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef _SOCK_H_
+#define _SOCK_H_
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include "ctx.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern unsigned int def_sndbuf;
+extern unsigned int def_rcvbuf;
+
+/* fallbacks for option numbers the system headers may not define */
+#ifndef TCP_FASTOPEN
+#define TCP_FASTOPEN 23
+#endif
+
+#ifndef TCP_USER_TIMEOUT
+#define TCP_USER_TIMEOUT 18
+#endif
+
+#ifndef TCP_FASTOPEN_CONNECT
+#define TCP_FASTOPEN_CONNECT 30
+#endif
+
+struct sock;
+
+/* Per-protocol operation table, looked up through OPS(so). */
+struct proto {
+	int (*setsockopt)(struct sock *sk, int optname, const void *optval,
+			  socklen_t optlen);
+	int (*getsockopt)(struct sock *sk, int optname, void *optval,
+			  socklen_t *option);
+	int (*getname)(struct sock *sk, struct sockaddr *addr, int peer);
+
+	int (*bind)(struct sock *sk, const struct sockaddr *addr);
+	int (*listen)(struct sock *sk, int backlog);
+	int (*connect)(struct sock *sk, const struct sockaddr *addr);
+	int (*accept)(struct sock *sk, struct sockaddr *addr,
+		      socklen_t *addrlen, int flags);
+
+	ssize_t (*recv)(struct tle_stream *s, struct rte_mbuf *pkt[],
+			uint16_t num, struct sockaddr *addr);
+	ssize_t (*send)(struct sock *sk, struct rte_mbuf *pkt[],
+			uint16_t num, const struct sockaddr *dst_addr);
+
+	ssize_t (*readv)(struct tle_stream *s, struct msghdr *msg, int flags);
+	ssize_t (*writev)(struct sock *sk, const struct iovec *iov,
+			  int iovcnt, const struct sockaddr *dst_addr);
+
+	int (*shutdown)(struct sock *sk, int how);
+	int (*close)(struct tle_stream *s);
+
+	void (*update_cfg)(struct sock *sk);
+
+	char name[32];
+};
+
+/* values of sock.proto; also the index into supported_proto_ops[] */
+enum {
+	PROTO_TCP,
+	PROTO_UDP
+};
+
+/* bits of sock.shutdown */
+#define RECV_SHUTDOWN 1
+#define SEND_SHUTDOWN 2
+
+extern struct proto udp_prot;
+extern struct proto tcp_prot;
+extern struct proto *supported_proto_ops[];
+
+struct sock {
+	int fd;
+	uint32_t cid:8, /* ctx id for indexing ctx_array */
+		 domain:8, /* for AF_INET, AF_INET6 */
+		 proto:8, /* PROTO_TCP, PROTO_UDP */
+		 valid:1,
+		 epoll:1,
+		 ubind:1,
+		 ubindany:1,
+		 nonblock:1,
+		 tcp_connected:1,
+		 shutdown:2;
+	struct tle_stream *s;
+	struct rte_mbuf *rx_left; /* packet not fully consumed by the last receive */
+	tle_stream_options_t option;
+	union {
+		struct epoll_event event;
+		int shadow_efd;
+	};
+	struct tle_event txev;
+	struct tle_event rxev;
+	struct tle_event erev;
+} __rte_cache_aligned;
+
+/* the argument is parenthesized so any pointer expression can be passed */
+#define CTX(so) (&ctx_array[(so)->cid])
+#define OPS(so) (supported_proto_ops[(so)->proto])
+#define IS_TCP(so) ((so)->proto == PROTO_TCP)
+#define IS_UDP(so) ((so)->proto == PROTO_UDP)
+
+/* Non-blocking if the call passes MSG_DONTWAIT or the socket is flagged. */
+static inline int
+is_nonblock(struct sock *so, int flags)
+{
+	return (flags & MSG_DONTWAIT) || so->nonblock;
+}
+
+/* TCP or UDP tle_ctx of the glue context the socket belongs to. */
+static inline struct tle_ctx *
+get_sock_ctx(struct sock *so)
+{
+	if (IS_TCP(so))
+		return CTX(so)->tcp_ctx;
+	else
+		return CTX(so)->udp_ctx;
+}
+
+/* Minimum sockaddr length for an address family, 0 if not handled. */
+static inline size_t
+get_sockaddr_len(sa_family_t family)
+{
+	switch (family) {
+	case AF_INET:
+		return sizeof(struct sockaddr_in);
+	case AF_INET6:
+		return sizeof(struct sockaddr_in6);
+	case AF_UNSPEC:
+		return sizeof(sa_family_t);
+	default:
+		return 0;
+	}
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*_SOCK_H_ */
diff --git a/lib/libtle_glue/socket.c b/lib/libtle_glue/socket.c
new file mode 100644
index 0000000..31b28be
--- /dev/null
+++ b/lib/libtle_glue/socket.c
@@ -0,0 +1,720 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "sym.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "tle_glue.h"
+#include "fd.h"
+#include "log.h"
+#include "util.h"
+#include "internal.h"
+#include "sock.h"
+
+/* operation tables, indexed by sock.proto */
+struct proto *supported_proto_ops[] = {
+	[PROTO_TCP] = &tcp_prot,
+	[PROTO_UDP] = &udp_prot,
+};
+
+/* for setup, settings, and destroy */
+/*
+ * socket(2) replacement.  Anything that is not exactly AF_INET/AF_INET6
+ * with SOCK_STREAM/SOCK_DGRAM goes to the kernel.  Otherwise a user-space
+ * socket is set up with default options and its fd is returned.
+ * Fails with EAFNOSUPPORT when the family has no address or loopback in
+ * default_ctx, and with ENFILE when no fd is free.
+ */
+int PRE(socket)(int domain, int type, int protocol)
+{
+	int fd;
+	struct sock *so;
+
+	/* NOTE(review): a type carrying SOCK_NONBLOCK/SOCK_CLOEXEC also
+	 * takes the kernel path because of the exact comparison */
+	if ((domain != AF_INET && domain != AF_INET6) ||
+	    (type != SOCK_STREAM && type != SOCK_DGRAM))
+		return k_socket(domain, type, protocol);
+
+	if (domain == AF_INET) {
+		if (default_ctx->ipv4 == 0 && !default_ctx->lo4_enabled) {
+			errno = EAFNOSUPPORT;
+			return -1;
+		}
+	} else {
+		if (IN6_IS_ADDR_UNSPECIFIED(&default_ctx->ipv6) &&
+		    !default_ctx->lo6_enabled) {
+			errno = EAFNOSUPPORT;
+			return -1;
+		}
+	}
+
+	fd = get_unused_fd();
+	if (fd < 0) {
+		errno = ENFILE;
+		return -1;
+	}
+	so = fd2sock(fd);
+	so->cid = get_cid();
+	if (type == SOCK_STREAM)
+		so->proto = PROTO_TCP;
+	else /* type == SOCK_DGRAM */
+		so->proto = PROTO_UDP;
+
+	so->domain = domain;
+	so->option.raw = 0;
+	so->option.mulloop = 1;
+	so->option.multtl = 1;
+	if (type == SOCK_STREAM) {
+		so->option.tcpquickack = 1;
+		/* linux default value: 2 hours */
+		so->option.keepidle = 2 * 60 * 60;
+		/* linux default value: 75seconds */
+		so->option.keepintvl = 75;
+		/* linux default value: 9 */
+		so->option.keepcnt = 9;
+	}
+
+	sock_alloc_events(so);
+
+	/* the stray printf() of the same message was removed: it wrote to
+	 * the application's stdout on every socket() call */
+	GLUE_DEBUG("socket fd = %d", fd);
+	return fd;
+}
+
+/*
+ * bind(2) replacement.  EINVAL if the socket already has a stream or
+ * addrlen is too short for the address family, EFAULT for a NULL addr.
+ */
+int PRE(bind)(int sockfd, const struct sockaddr *addr, socklen_t addrlen)
+{
+	struct sock *so;
+
+	if (is_kernel_fd(sockfd))
+		return k_bind(sockfd, addr, addrlen);
+
+	so = fd2sock(sockfd);
+	if (so->s) {
+		/* The socket is already bound to an address */
+		errno = EINVAL;
+		return -1;
+	}
+
+	if (addr == NULL) {
+		errno = EFAULT;
+		return -1;
+	}
+
+	/* sa_family may only be read once addrlen is known to cover it */
+	if (addrlen < sizeof(sa_family_t) ||
+	    addrlen < get_sockaddr_len(addr->sa_family)) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	so->cid = get_cid(); /* allow ctx reset as stream is null */
+	if (OPS(so)->bind)
+		return OPS(so)->bind(so, addr);
+
+	errno = EOPNOTSUPP;
+	return -1;
+}
+
+/* listen(2) replacement; EOPNOTSUPP when the protocol has no listen op. */
+int PRE(listen)(int sockfd, int backlog)
+{
+	struct sock *so;
+
+	if (is_kernel_fd(sockfd))
+		return k_listen(sockfd, backlog);
+
+	so = fd2sock(sockfd);
+
+	if (OPS(so)->listen)
+		return OPS(so)->listen(so, backlog);
+
+	errno = EOPNOTSUPP;
+	return -1;
+}
+
+/* accept(2) replacement; EOPNOTSUPP when the protocol has no accept op. */
+int PRE(accept)(int sockfd, struct sockaddr *addr, socklen_t *addrlen)
+{
+	struct sock *so;
+
+	if (is_kernel_fd(sockfd))
+		return k_accept(sockfd, addr, addrlen);
+
+	so = fd2sock(sockfd);
+	if (OPS(so)->accept)
+		return OPS(so)->accept(so, addr, addrlen, 0);
+
+	errno = EOPNOTSUPP;
+	return -1;
+}
+
+/*
+ * accept4(2) replacement.  For user-space sockets only SOCK_NONBLOCK is
+ * applied to the new socket; other flag bits are ignored.
+ */
+int PRE(accept4)(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags)
+{
+	int fd;
+	struct sock *so;
+
+	if (is_kernel_fd(sockfd))
+		return k_accept4(sockfd, addr, addrlen, flags);
+
+	fd = PRE(accept)(sockfd, addr, addrlen);
+
+	/* inherit NONBLOCK flag */
+	if (fd >= 0 && (flags & SOCK_NONBLOCK)) {
+		so = fd2sock(fd);
+		so->nonblock = 1;
+	}
+
+	return fd;
+}
+
+/*
+ * connect(2) replacement.  EFAULT for a NULL addr, EINVAL when addrlen is
+ * too short for the address family.
+ */
+int PRE(connect)(int sockfd, const struct sockaddr *addr, socklen_t addrlen)
+{
+	struct sock *so;
+
+	if (is_kernel_fd(sockfd))
+		return k_connect(sockfd, addr, addrlen);
+
+	if (addr == NULL) {
+		errno = EFAULT;
+		return -1;
+	}
+
+	/* sa_family may only be read once addrlen is known to cover it */
+	if (addrlen < sizeof(sa_family_t) ||
+	    addrlen < get_sockaddr_len(addr->sa_family)) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	so = fd2sock(sockfd);
+	so->cid = get_cid();
+
+	/* mac_check() is skipped for non-blocking sockets */
+	if (!(is_nonblock(so, 0)))
+		mac_check(CTX(so), addr);
+
+	if (OPS(so)->connect)
+		return OPS(so)->connect(so, addr);
+
+	errno = EOPNOTSUPP;
+	return -1;
+}
+
+unsigned int def_sndbuf =
212992;
+unsigned int def_rcvbuf = 212992;
+static struct linger ling;
+
+/*
+ * getsockopt(2) replacement for user-space sockets.
+ * IP, IPv6 and SOL_SOCKET options are answered from so->option or fixed
+ * values; SOL_TCP/SOL_UDP go to the protocol's getsockopt op.
+ * EFAULT when optval or optlen is NULL; unhandled options are logged and
+ * fail with EOPNOTSUPP.
+ * NOTE(review): *optlen is not updated for the int-valued options.
+ */
+int PRE(getsockopt)(int sockfd, int level, int optname,
+		    void *optval, socklen_t *optlen)
+{
+	struct sock *so;
+	union {
+		int val;
+		uint64_t val64;
+		struct linger ling;
+		struct timeval tm;
+	} *p = optval;
+
+
+	if (is_kernel_fd(sockfd))
+		return k_getsockopt(sockfd, level, optname, optval, optlen);
+
+	/* both pointers are dereferenced below: reject if either is NULL */
+	if (!optval || !optlen) {
+		errno = EFAULT;
+		return -1;
+	}
+
+	so = fd2sock(sockfd);
+
+	switch (level) {
+	case IPPROTO_IP:
+		switch (optname) {
+		case IP_OPTIONS:
+			*optlen = 0;
+			return 0;
+		case IP_MULTICAST_LOOP:
+			p->val = so->option.mulloop;
+			return 0;
+		case IP_MULTICAST_TTL:
+			p->val = so->option.multtl;
+			return 0;
+		}
+		break;
+	case IPPROTO_IPV6:
+		switch (optname) {
+		case IPV6_V6ONLY:
+			p->val = so->option.ipv6only;
+			return 0;
+		}
+		break;
+	case SOL_SOCKET:
+		/* man socket(7), see /usr/include/asm-generic/socket.h */
+		switch (optname) {
+		case SO_REUSEADDR:
+			p->val = so->option.reuseaddr;
+			return 0;
+		case SO_REUSEPORT:
+			p->val = so->option.reuseport;
+			return 0;
+		case SO_ERROR:
+			if (TLE_SEV_DOWN == tle_event_state(&so->erev))
+				p->val = 0;
+			else
+				p->val = ECONNREFUSED;
+			/* fix me: ETIMEDOUT */
+			return 0;
+		case SO_LINGER:
+			p->ling.l_onoff = 0;
+			/* do not leave l_linger unset in the caller's buffer */
+			p->ling.l_linger = 0;
+			return 0;
+		case SO_SNDBUF:
+			p->val = def_sndbuf;
+			return 0;
+		case SO_RCVBUF:
+			p->val = def_rcvbuf;
+			return 0;
+		case SO_ACCEPTCONN:
+			/* so->s is NULL until a stream is attached */
+			if (IS_TCP(so) && so->s != NULL
+			    && TCP_STREAM(so->s)->tcb.state == TCP_ST_LISTEN)
+				p->val = 1;
+			else
+				p->val = 0;
+			return 0;
+		case SO_KEEPALIVE:
+			p->val = so->option.keepalive;
+			return 0;
+		case SO_TYPE:
+			if (IS_TCP(so))
+				p->val = SOCK_STREAM;
+			else
+				p->val = SOCK_DGRAM;
+			return 0;
+		case SO_OOBINLINE:
+			p->val = so->option.oobinline;
+			return 0;
+		case SO_TIMESTAMP:
+			p->val = so->option.timestamp;
+			return 0;
+		case SO_PROTOCOL:
+			if (so->proto == PROTO_TCP)
+				p->val = IPPROTO_TCP;
+			else
+				p->val = IPPROTO_UDP;
+			return 0;
+		default:
+			break;
+		}
+
+		break;
+	case SOL_TCP:
+	case SOL_UDP:
+		return
OPS(so)->getsockopt(so, optname, optval, optlen); + } + + GLUE_LOG(WARNING, "getsockopt(%d) with level = %d, optname = %d", + sockfd, level, optname); + errno = EOPNOTSUPP; + return -1; +} + +int PRE(setsockopt)(int sockfd, int level, int optname, + const void *optval, socklen_t optlen) +{ + int val; + struct sock *so; + if (is_kernel_fd(sockfd)) + return k_setsockopt(sockfd, level, optname, optval, optlen); + if (!optval && !optlen) + return -1; + + val = 0; /* just to make compiler happy */ + switch (optlen) { + case sizeof(char): + val = *(const char *)optval; + break; + case sizeof(int): + val = *(const int *)optval; + break; + } + + so = fd2sock(sockfd); + + switch (level) { + case IPPROTO_IP: + switch (optname) { + case IP_RECVERR: + /* needed by netperf */ + return 0; + case IP_MULTICAST_LOOP: + if (val == 0) + so->option.mulloop = 0; + else + so->option.mulloop = 1; + if (so->s != NULL) + so->s->option.mulloop = so->option.mulloop; + return 0; + case IP_MULTICAST_TTL: + if (val > 255 || val < -1) { + errno = EINVAL; + return -1; + } + if(val == -1) { + val = 1; + } + so->option.multtl = val; + if (so->s != NULL) + so->s->option.multtl = so->option.multtl; + return 0; + case IP_ADD_MEMBERSHIP: + if (optlen < sizeof(struct ip_mreq)) { + errno = EINVAL; + return -1; + } + const struct ip_mreq* mreq = (const struct ip_mreq*)optval; + if (mreq->imr_multiaddr.s_addr == INADDR_ANY) { + errno = EINVAL; + return -1; + } + errno = EOPNOTSUPP; + return -1; + case IP_MTU_DISCOVER: + return 0; + case IP_TOS: + return 0; + case IP_RECVTOS: + return 0; + } + break; + case IPPROTO_IPV6: + switch (optname) { + case IPV6_V6ONLY: + if (val == 0) + so->option.ipv6only = 0; + else + so->option.ipv6only = 1; + if (so->s != NULL) + so->s->option.ipv6only = so->option.ipv6only; + return 0; + case IPV6_TCLASS: + return 0; + case IPV6_RECVTCLASS: + return 0; + } + break; + case SOL_SOCKET: + switch (optname) { + case SO_REUSEADDR: + if (val == 0) + so->option.reuseaddr = 0; + else + 
so->option.reuseaddr = 1; + if (so->s != NULL) + so->s->option.reuseaddr = so->option.reuseaddr; + return 0; + case SO_LINGER: + ling = *(const struct linger *)optval; + if (ling.l_onoff == 0) + return 0; + else { + GLUE_LOG(ERR, "app is enabling SO_LINGER which is not really supported"); + return 0; + } + break; + case SO_KEEPALIVE: + if (val == 0) + so->option.keepalive = 0; + else + so->option.keepalive = 1; + if (so->s != NULL) { + so->s->option.keepalive = so->option.keepalive; + if (so->proto == PROTO_TCP) + tle_tcp_stream_set_keepalive(so->s); + } + return 0; + case SO_REUSEPORT: + if (val == 0) + so->option.reuseport = 0; + else + so->option.reuseport = 1; + if (so->s != NULL) + so->s->option.reuseport = so->option.reuseport; + return 0; + case SO_SNDBUF: + def_sndbuf = val; + return 0; + case SO_RCVBUF: + def_rcvbuf = val; + return 0; + case SO_DONTROUTE: + /* needed by netperf */ + return 0; + case SO_BROADCAST: + /* needed by nc */ + /* todo: only supported for DGRAM */ + return 0; + case SO_TIMESTAMP: + so->option.timestamp = !!val; + if (so->s != NULL) + so->s->option.timestamp = so->option.timestamp; + return 0; + case SO_OOBINLINE: + if (val == 0) + so->option.oobinline = 0; + else + so->option.oobinline = 1; + if (so->s != NULL) + so->s->option.oobinline = so->option.oobinline; + return 0; + default: + break; + } + break; + case IPPROTO_TCP: + case IPPROTO_UDP: + return OPS(so)->setsockopt(so, optname, optval, optlen); + } + + GLUE_LOG(WARNING, "setsockopt(%d) with level = %d, optname = %d\n", + sockfd, level, optname); + errno = EOPNOTSUPP; + return -1; +} + +/* + * Refer to glibc/sysdeps/unix/sysv/linux/fcntl.c + */ +int PRE(fcntl)(int fd, int cmd, ...) 
+{
+	int rc;
+	void *arg;
+	va_list ap;
+	struct sock *so;
+
+	/* the optional third argument is always fetched as a pointer-sized value */
+	va_start(ap, cmd);
+	arg = va_arg(ap, void *);
+	va_end(ap);
+
+	if (is_kernel_fd(fd))
+		return k_fcntl(fd, cmd, arg);
+
+	so = fd2sock(fd);
+	switch (cmd) {
+	case F_SETFL:
+		/* O_NONBLOCK is the only status flag that is tracked */
+		if ((unsigned long)arg & O_NONBLOCK)
+			so->nonblock = 1;
+		else
+			so->nonblock = 0;
+		rc = 0;
+		break;
+	case F_GETFL:
+		if (so->nonblock)
+			rc = O_NONBLOCK | O_RDWR;
+		else
+			rc = O_RDWR;
+		break;
+	case F_SETFD:
+		/* accepted and ignored */
+		rc = 0;
+		break;
+	default:
+		rc = -1;
+		errno = EOPNOTSUPP;
+		GLUE_LOG(WARNING, "fcntl(%d) with cmd = %d", fd, cmd);
+	}
+
+	return rc;
+}
+
+/*
+ * Refer to musl/src/misc/ioctl.c
+ *
+ * Supported requests:
+ *   FIONREAD   - bytes that can be read without blocking (for UDP: the
+ *                length of the next datagram, which is pulled from the
+ *                stream and parked in so->rx_left)
+ *   FIONBIO    - switch non-blocking mode on/off
+ *   SIOCGSTAMP - timestamp kept in the stream, ENOENT when there is none
+ * Anything else fails with EOPNOTSUPP; a NULL argument for the requests
+ * above fails with EFAULT.
+ */
+int PRE(ioctl)(int fd, unsigned long int request, ...)
+{
+	int rc;
+	void *arg;
+	va_list ap;
+	uint32_t left;	/* wide enough for inq + pending mbuf chain */
+	struct sock *so;
+	struct rte_mbuf *m;
+
+	va_start(ap, request);
+	arg = va_arg(ap, void *);
+	va_end(ap);
+
+	if (is_kernel_fd(fd))
+		return k_ioctl(fd, request, arg);
+
+	so = fd2sock(fd);
+
+	/* every request handled below writes to or reads from *arg */
+	if (arg == NULL && (request == FIONREAD || request == FIONBIO ||
+			    request == SIOCGSTAMP)) {
+		errno = EFAULT;
+		return -1;
+	}
+
+	switch (request) {
+	case FIONREAD: /* SIOCINQ */
+		if (so->s == NULL)
+			*(int *)arg = 0;
+		else if (IS_TCP(so)) {
+			left = tle_tcp_stream_inq(so->s);
+			if (so->rx_left)
+				left += rte_pktmbuf_pkt_len(so->rx_left);
+			*(int *)arg = left;
+		} else {
+			if (so->rx_left)
+				*(int *)arg = rte_pktmbuf_pkt_len(so->rx_left);
+			else {
+				if (tle_udp_stream_recv(so->s, &m , 1) == 0)
+					*(int *)arg = 0;
+				else {
+					/* keep the datagram for the next read */
+					*(int *)arg = rte_pktmbuf_pkt_len(m);
+					so->rx_left = m;
+				}
+			}
+		}
+		rc = 0;
+		break;
+	case FIONBIO:
+		if (*(int *)arg)
+			so->nonblock = 1;
+		else
+			so->nonblock = 0;
+		rc = 0;
+		break;
+	case SIOCGSTAMP:
+		/* no stream means no packet has been timestamped either */
+		if (so->s == NULL || so->s->timestamp == 0) {
+			errno = ENOENT;
+			rc = -1;
+		} else {
+			/* seconds in the upper bits, microseconds in the low 20 */
+			((struct timeval*)arg)->tv_sec = so->s->timestamp >> 20;
+			((struct timeval*)arg)->tv_usec = so->s->timestamp & 0xFFFFFUL;
+			rc = 0;
+		}
+		break;
+	default:
+		errno = EOPNOTSUPP;
+		rc = -1;
+		GLUE_LOG(WARNING, "ioctl(%d) with request = %lu", fd, request);
+	}
+
+	return rc;
+}
+
+int PRE(shutdown)(int sockfd, int how)
+{
+	struct sock *so;
+
+	if
(is_kernel_fd(sockfd))
+		return k_shutdown(sockfd, how);
+
+	so = fd2sock(sockfd);
+	/* remember which directions are closed before asking the protocol */
+	switch (how) {
+	case SHUT_RD:
+		so->shutdown |= RECV_SHUTDOWN;
+		break;
+	case SHUT_WR:
+		so->shutdown |= SEND_SHUTDOWN;
+		break;
+	case SHUT_RDWR:
+		so->shutdown = RECV_SHUTDOWN | SEND_SHUTDOWN;
+		break;
+	default:
+		/* shutdown(2): EINVAL for an invalid value of how */
+		errno = EINVAL;
+		return -1;
+	}
+	if (OPS(so)->shutdown)
+		return OPS(so)->shutdown(so, how);
+
+	errno = EOPNOTSUPP;
+	return -1;
+}
+
+/*
+ * Common implementation of getsockname() (peer == 0) and getpeername()
+ * (peer != 0).
+ *
+ * Returns 0 and fills uaddr / *addrlen on success.  Fails with EFAULT for
+ * NULL pointers, EINVAL when the caller's buffer is smaller than the
+ * socket's address type, ENOTCONN when a peer is requested but none is
+ * known, and EOPNOTSUPP when the protocol has no getname op.
+ */
+static inline int
+getname(int sockfd, struct sockaddr *uaddr, socklen_t *addrlen, int peer)
+{
+	struct sock *so;
+	size_t socklen;
+	int rc;
+
+	if (uaddr == NULL || addrlen == NULL) {
+		errno = EFAULT;
+		return -1;
+	}
+
+	so = fd2sock(sockfd);
+
+	/* This is ugly, but netperf ask for local addr (before any
+	 * connect or bind) to check family.
+	 *
+	 * To formally fix this, we shall bind a local address in advance
+	 */
+	socklen = get_sockaddr_len(so->domain);
+	/* fixme: It is not conform to linux standard, fix it later. */
+	if (*addrlen < socklen) {
+		errno = EINVAL;
+		return -1;
+	}
+	*addrlen = socklen;
+
+	if (so->s == NULL) {
+		if (peer) {
+			errno = ENOTCONN;
+			return -1;
+		} else {
+			/* no stream yet: report an all-zero address of the right family */
+			memset(uaddr, 0, socklen);
+			uaddr->sa_family = so->domain;
+			return 0;
+		}
+	}
+
+	if (OPS(so)->getname) {
+		rc = OPS(so)->getname(so, uaddr, peer);
+		if (rc < 0)
+			return rc;
+		if (peer) {
+			/* an unspecified remote address means "not connected" */
+			if ((uaddr->sa_family == AF_INET &&
+			     ((struct sockaddr_in*)uaddr)->sin_addr.s_addr == 0) ||
+			    (uaddr->sa_family == AF_INET6 &&
+			     IN6_IS_ADDR_UNSPECIFIED(&((struct sockaddr_in6*)
+						       uaddr)->sin6_addr))) {
+				errno = ENOTCONN;
+				return -1;
+			}
+		}
+		/* IPv4 address on an AF_INET6 socket: convert via trans_4mapped6_addr() */
+		if (uaddr->sa_family == AF_INET && so->domain == AF_INET6)
+			trans_4mapped6_addr(uaddr);
+		return rc;
+	}
+
+	errno = EOPNOTSUPP;
+	return -1;
+}
+
+int PRE(getsockname)(int sockfd, struct sockaddr *addr, socklen_t *addrlen)
+{
+	if (is_kernel_fd(sockfd))
+		return k_getsockname(sockfd, addr, addrlen);
+
+	return getname(sockfd, addr, addrlen, 0);
+}
+
+int PRE(getpeername)(int sockfd, struct sockaddr *addr, socklen_t *addrlen)
+{
+	if (is_kernel_fd(sockfd))
+		return k_getpeername(sockfd, addr, addrlen);
+
+	return
getname(sockfd, addr, addrlen, 1);
+}
+/*
+ * close(2) replacement.
+ *
+ * Kernel descriptors are passed to libc.  For glue descriptors the slot
+ * must be marked valid (EBADF otherwise); an epoll slot closes its shadow
+ * kernel epoll fd and frees its glue context, a socket slot with a stream
+ * closes the stream through the protocol op, flushes TCP output and drops
+ * any partially consumed mbuf.  Finally the events are idled, the slot is
+ * wiped and the descriptor is returned to the free list.
+ */
+int PRE(close)(int fd)
+{
+	struct sock *so;
+
+	if (is_kernel_fd(fd))
+		return k_close(fd);
+
+	GLUE_DEBUG("close fd = %d", fd);
+
+	so = fd2sock(fd);
+	if (unlikely(so->valid == 0)) {
+		errno = EBADF;
+		return -1;
+	} else if (unlikely(so->epoll)) {
+		k_close(so->shadow_efd);
+		glue_ctx_free(CTX(so));
+	} else if (so->s) {
+		if (OPS(so)->close)
+			OPS(so)->close(so->s);
+
+		if (IS_TCP(so)) /* presumably pushes out the FIN queued by close -- TODO confirm */
+			be_tx_with_lock(CTX(so));
+
+		if (so->rx_left)
+			rte_pktmbuf_free(so->rx_left);
+	}
+
+	tle_event_idle_err(&so->erev);
+	tle_event_idle(&so->rxev);
+	tle_event_idle(&so->txev);
+	/* zero the slot except for its leading int, which is kept as is */
+	memset(((int*)so) + 1, 0, sizeof(*so) - sizeof(int));
+	put_free_fd(fd);
+	return 0;
+}
diff --git a/lib/libtle_glue/sym.c b/lib/libtle_glue/sym.c
new file mode 100644
index 0000000..39b1707
--- /dev/null
+++ b/lib/libtle_glue/sym.c
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include +#include +#ifndef __USE_GNU +#define __USE_GNU +#endif +#include + +#include + +#include "sym.h" +#include "log.h" + +#ifdef PRELOAD +int (*k_epoll_create)(int size); +int (*k_epoll_create1)(int flags); +int (*k_epoll_create1)(int flags); +int (*k_epoll_ctl)(int epfd, int op, int fd, struct epoll_event *event); +int (*k_epoll_wait)(int epfd, struct epoll_event *events, int maxevents, int timeout); +int (*k_epoll_pwait)(int epfd, struct epoll_event *events, int maxevents, int timeout, const sigset_t *sigmask); +int (*k_poll)(struct pollfd *fds, nfds_t nfds, int timeout); +int (*k_select)(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout); +int (*k_pselect)(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, const struct timespec *timeout, const sigset_t *sigmask); +int (*k_socket)(int domain, int type, int protocol); +int (*k_listen)(int sockfd, int backlog); +int (*k_bind)(int sockfd, const struct sockaddr *addr, socklen_t addrlen); +int (*k_accept)(int sockfd, struct sockaddr *addr, socklen_t *addrlen); +int (*k_accept4)(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags); +int (*k_connect)(int sockfd, const struct sockaddr *addr, socklen_t addrlen); +int (*k_getsockopt)(int sockfd, int level, int optname, void *optval, socklen_t *optlen); +int (*k_setsockopt)(int sockfd, int level, int optname, const void *optval, socklen_t optlen); +int (*k_fcntl)(int fd, int cmd, ... 
/* arg */ ); +int (*k_ioctl)(int d, int request, ...); +int (*k_shutdown)(int sockfd, int how); +int (*k_close)(int fd); +ssize_t (*k_recv)(int sockfd, void *buf, size_t len, int flags); +ssize_t (*k_recvfrom)(int sockfd, void *buf, size_t len, int flags, struct sockaddr *src_addr, socklen_t *addrlen); +ssize_t (*k_recvmsg)(int sockfd, struct msghdr *msg, int flags); +ssize_t (*k_read)(int fd, void *buf, size_t count); +ssize_t (*k_readv)(int fd, const struct iovec *iov, int iovcnt); +ssize_t (*k_send)(int sockfd, const void *buf, size_t len, int flags); +ssize_t (*k_sendto)(int sockfd, const void *buf, size_t len, int flags, const struct sockaddr *dest_addr, socklen_t addrlen); +ssize_t (*k_sendmsg)(int sockfd, const struct msghdr *msg, int flags); +ssize_t (*k_write)(int fd, const void *buf, size_t count); +ssize_t (*k_writev)(int fd, const struct iovec *iov, int iovcnt); +ssize_t (*k_splice)(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags); +ssize_t (*k_sendfile)(int out_fd, int in_fd, off_t *offset, size_t count); +int (*k_getsockname)(int sockfd, struct sockaddr *addr, socklen_t *addrlen); +int (*k_getpeername)(int sockfd, struct sockaddr *addr, socklen_t *addrlen); + +#define INIT_FUNC(func, handle) do { \ + k_##func = dlsym(handle, #func); \ + if ((error = dlerror()) != NULL) { \ + rte_panic(#func "is not init"); \ + } \ + RTE_ASSERT(k_##func); \ +} while (0) + +#endif + +void +symbol_init(void) +{ +#ifdef PRELOAD + void *handle; + char *error; + + TRACE("in %s", __func__); + + handle = dlopen("libc.so.6", RTLD_NOW); + error = dlerror(); + if (!handle) { + fprintf(stderr, "%s\n", error); + exit(EXIT_FAILURE); + } + + INIT_FUNC(epoll_create, handle); + INIT_FUNC(epoll_create1, handle); + INIT_FUNC(epoll_create1, handle); + INIT_FUNC(epoll_ctl, handle); + INIT_FUNC(epoll_wait, handle); + INIT_FUNC(epoll_pwait, handle); + INIT_FUNC(socket, handle); + INIT_FUNC(listen, handle); + INIT_FUNC(bind, handle); + INIT_FUNC(accept, 
handle); + INIT_FUNC(accept4, handle); + INIT_FUNC(connect, handle); + INIT_FUNC(getsockopt, handle); + INIT_FUNC(setsockopt, handle); + INIT_FUNC(fcntl, handle); + INIT_FUNC(ioctl, handle); + INIT_FUNC(shutdown, handle); + INIT_FUNC(close, handle); + INIT_FUNC(recv, handle); + INIT_FUNC(recvfrom, handle); + INIT_FUNC(recvmsg, handle); + INIT_FUNC(read, handle); + INIT_FUNC(readv, handle); + INIT_FUNC(send, handle); + INIT_FUNC(sendto, handle); + INIT_FUNC(sendmsg, handle); + INIT_FUNC(write, handle); + INIT_FUNC(writev, handle); + INIT_FUNC(splice, handle); + INIT_FUNC(sendfile, handle); + INIT_FUNC(poll, handle); + INIT_FUNC(getsockname, handle); + INIT_FUNC(getpeername, handle); + INIT_FUNC(select, handle); + INIT_FUNC(pselect, handle); + + dlclose(handle); +#endif +} diff --git a/lib/libtle_glue/sym.h b/lib/libtle_glue/sym.h new file mode 100644 index 0000000..b5a333d --- /dev/null +++ b/lib/libtle_glue/sym.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef _TLE_KSYM_H_
+#define _TLE_KSYM_H_
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "tle_glue.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void symbol_init(void);
+
+#ifdef PRELOAD
+/*
+ * libc entry points resolved by symbol_init().  These are declarations
+ * only; the single definition of each pointer lives in sym.c, so that
+ * including this header from several translation units does not create
+ * duplicate definitions.
+ */
+extern int (*k_epoll_create)(int size);
+extern int (*k_epoll_create1)(int flags);
+extern int (*k_epoll_ctl)(int epfd, int op, int fd, struct epoll_event *event);
+extern int (*k_epoll_wait)(int epfd, struct epoll_event *events, int maxevents, int timeout);
+extern int (*k_epoll_pwait)(int epfd, struct epoll_event *events, int maxevents, int timeout, const sigset_t *sigmask);
+extern int (*k_poll)(struct pollfd *fds, nfds_t nfds, int timeout);
+extern int (*k_select)(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout);
+extern int (*k_pselect)(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, const struct timespec *timeout, const sigset_t *sigmask);
+
+extern int (*k_socket)(int domain, int type, int protocol);
+extern int (*k_listen)(int sockfd, int backlog);
+extern int (*k_bind)(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
+extern int (*k_accept)(int sockfd, struct sockaddr *addr, socklen_t *addrlen);
+extern int (*k_accept4)(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags);
+extern int (*k_connect)(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
+extern int (*k_getsockopt)(int sockfd, int level, int optname, void *optval, socklen_t *optlen);
+extern int (*k_setsockopt)(int sockfd, int level, int optname, const void *optval, socklen_t optlen);
+extern int (*k_fcntl)(int fd, int cmd, ... /* arg */ );
+extern int (*k_ioctl)(int d, int request, ...);
+extern int (*k_shutdown)(int sockfd, int how);
+extern int (*k_close)(int fd);
+extern ssize_t (*k_recv)(int sockfd, void *buf, size_t len, int flags);
+extern ssize_t (*k_recvfrom)(int sockfd, void *buf, size_t len, int flags, struct sockaddr *src_addr, socklen_t *addrlen);
+extern ssize_t (*k_recvmsg)(int sockfd, struct msghdr *msg, int flags);
+extern ssize_t (*k_read)(int fd, void *buf, size_t count);
+extern ssize_t (*k_readv)(int fd, const struct iovec *iov, int iovcnt);
+extern ssize_t (*k_send)(int sockfd, const void *buf, size_t len, int flags);
+extern ssize_t (*k_sendto)(int sockfd, const void *buf, size_t len, int flags, const struct sockaddr *dest_addr, socklen_t addrlen);
+extern ssize_t (*k_sendmsg)(int sockfd, const struct msghdr *msg, int flags);
+extern ssize_t (*k_write)(int fd, const void *buf, size_t count);
+extern ssize_t (*k_writev)(int fd, const struct iovec *iov, int iovcnt);
+extern ssize_t (*k_splice)(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags);
+extern ssize_t (*k_sendfile)(int out_fd, int in_fd, off_t *offset, size_t count);
+extern int (*k_getsockname)(int sockfd, struct sockaddr *addr, socklen_t *addrlen);
+extern int (*k_getpeername)(int sockfd, struct sockaddr *addr, socklen_t *addrlen);
+#else
+/* without PRELOAD the k_* names are plain aliases of the libc functions */
+#define k_epoll_create epoll_create
+#define k_epoll_create1 epoll_create1
+#define k_epoll_ctl epoll_ctl
+#define k_epoll_wait epoll_wait
+#define k_epoll_pwait epoll_pwait
+#define k_poll poll
+#define k_select select
+#define k_pselect pselect
+#define k_socket socket
+#define k_listen listen
+#define k_bind bind
+#define k_accept accept
+#define k_accept4 accept4
+#define k_connect connect
+#define k_getsockopt getsockopt
+#define k_setsockopt setsockopt
+#define k_fcntl fcntl
+#define k_ioctl ioctl
+#define k_shutdown shutdown
+#define k_close close
+#define k_recv recv
+#define k_recvfrom recvfrom
+#define k_recvmsg recvmsg
+#define k_read read
+#define k_readv readv
+#define k_send send
+#define k_sendto sendto
+#define k_sendmsg sendmsg
+#define k_write write +#define k_writev writev +#define k_splice splice +#define k_sendfile sendfile +#define k_getsockname getsockname +#define k_getpeername getpeername +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _TLE_KSYM_H_ */ diff --git a/lib/libtle_glue/tcp.c b/lib/libtle_glue/tcp.c new file mode 100644 index 0000000..e5186c0 --- /dev/null +++ b/lib/libtle_glue/tcp.c @@ -0,0 +1,558 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "sym.h"
+#include "fd.h"
+#include "log.h"
+#include "util.h"
+#include "internal.h"
+#include "sock.h"
+
+#define MAX_TCP_KEEPIDLE 32767
+#define MAX_TCP_KEEPINTVL 32767
+#define MAX_TCP_KEEPCNT 127
+
+/* Log that an option is accepted without being implemented. */
+static inline void
+foo_support(const char *msg)
+{
+	GLUE_LOG(WARNING, "%s, return ok without really supporting it", msg);
+}
+
+/*
+ * SOL_TCP level setsockopt.
+ *
+ * Values are cached in sk->option and mirrored into the stream when one
+ * exists.  Returns 0 on success; -1 with errno EINVAL for out-of-range
+ * keepalive parameters or EOPNOTSUPP for unsupported options.  Only
+ * int-sized option values are read; any other optlen is treated as 0.
+ */
+static int
+tcp_setsockopt(struct sock *sk, int optname,
+	       const void *optval, socklen_t optlen)
+{
+	int val;
+
+	val = 0; /* used when optlen is not sizeof(int) */
+	if (optlen == sizeof(val))
+		val = *(const int *)optval;
+
+	/* man tcp(7) or see /usr/include/netinet/tcp.h */
+	switch (optname) {
+	case TCP_NODELAY: /* antonym: TCP_CORK */
+		if (val == 0)
+			sk->option.tcpnodelay = 0;
+		else
+			sk->option.tcpnodelay = 1;
+		if (sk->s != NULL)
+			sk->s->option.tcpnodelay = sk->option.tcpnodelay;
+		return 0;
+	case TCP_CORK:
+		if (val == 0)
+			sk->option.tcpcork = 0;
+		else
+			sk->option.tcpcork = 1;
+		if (sk->s != NULL)
+			sk->s->option.tcpcork = sk->option.tcpcork;
+		return 0;
+	case TCP_KEEPIDLE:
+		if (val <= 0 || val > MAX_TCP_KEEPIDLE) {
+			errno = EINVAL;
+			return -1;
+		}
+		sk->option.keepidle = val;
+		if (sk->s != NULL) {
+			sk->s->option.keepidle = sk->option.keepidle;
+			tle_tcp_stream_set_keepalive(sk->s);
+		}
+		return 0;
+	case TCP_KEEPINTVL:
+		if (val <= 0 || val > MAX_TCP_KEEPINTVL) {
+			errno = EINVAL;
+			return -1;
+		}
+		sk->option.keepintvl = val;
+		if (sk->s != NULL) {
+			sk->s->option.keepintvl = sk->option.keepintvl;
+			tle_tcp_stream_set_keepalive(sk->s);
+		}
+		return 0;
+	case TCP_KEEPCNT:
+		if (val <= 0 || val > MAX_TCP_KEEPCNT) {
+			errno = EINVAL;
+			return -1;
+		}
+		sk->option.keepcnt = val;
+		if (sk->s != NULL)
+			sk->s->option.keepcnt = sk->option.keepcnt;
+		return 0;
+	case TCP_USER_TIMEOUT:
+		foo_support("set TCP_USER_TIMEOUT");
+		return 0;
+	case TCP_DEFER_ACCEPT:
+		/* disabling is a no-op, enabling is not supported */
+		if (val == 0)
+			return 0;
+		break;
+	case TCP_FASTOPEN:
+	case TCP_FASTOPEN_CONNECT:
+		/* disabling is a no-op, enabling is not supported */
+		if (val == 0)
+			return 0;
+		break;
+	case TCP_QUICKACK:
+		/* Based on below info, it's safe to just return 0:
+		 * "This flag is not permanent, it only enables a
+		 * switch to or from quickack mode. Subsequent
+		 * operation of the TCP protocol will once again ..."
+		 */
+		/* NOTE(review): 8 looks like a count of quick ACKs -- confirm */
+		if (val == 0)
+			sk->option.tcpquickack = 0;
+		else
+			sk->option.tcpquickack = 8;
+		if (sk->s != NULL)
+			sk->s->option.tcpquickack = sk->option.tcpquickack;
+		return 0;
+	case TCP_CONGESTION:
+		/* only support NewReno; but we return success for
+		 * any kind of setting.
+		 */
+		foo_support("set TCP_CONGESTION");
+		return 0;
+	default:
+		break;
+	}
+
+	GLUE_LOG(WARNING, "setsockopt(%d) with level = SOL_TCP, optname = %d\n",
+		 sock2fd(sk), optname);
+	errno = EOPNOTSUPP;
+	return -1;
+}
+
+/*
+ * SOL_TCP level getsockopt.
+ *
+ * Int-valued options are written through optval without consulting
+ * *optlen; TCP_INFO and TCP_CONGESTION honour *optlen.  Returns 0 on
+ * success, -1 with errno set otherwise (EOPNOTSUPP for unknown options).
+ */
+static int
+tcp_getsockopt(struct sock *sk, int optname,
+	       void *optval, socklen_t *optlen)
+{
+	int rc;
+	union {
+		int val;
+		uint64_t val64;
+		struct linger ling;
+		struct timeval tm;
+	} *p = optval;
+
+	/* man tcp(7) or see /usr/include/netinet/tcp.h */
+	switch (optname) {
+	case TCP_MAXSEG:
+		p->val = 64 * 1024;
+		return 0;
+	case TCP_FASTOPEN:
+	case TCP_FASTOPEN_CONNECT:
+		p->val = 0;
+		return 0;
+	case TCP_INFO:
+		/* needed by netperf */
+		rc = tle_tcp_stream_get_info(sk->s, optval, optlen);
+		if (rc < 0) {
+			errno = -rc;
+			return -1;
+		}
+		return 0;
+	case TCP_CONGESTION:
+		/* an empty buffer has no room even for the terminator */
+		if (*optlen == 0)
+			return 0;
+		strncpy(optval, "NewReno", *optlen);
+		((char *)optval)[*optlen - 1] = '\0';
+		return 0;
+	case TCP_CORK:
+		p->val = sk->option.tcpcork;
+		return 0;
+	case TCP_QUICKACK:
+		p->val = sk->option.tcpquickack != 0 ?
1 : 0; + return 0; + case TCP_NODELAY: + p->val = sk->option.tcpnodelay; + return 0; + case TCP_KEEPIDLE: + p->val = sk->option.keepidle; + return 0; + case TCP_KEEPINTVL: + p->val = sk->option.keepintvl; + return 0; + case TCP_KEEPCNT: + p->val = sk->option.keepcnt; + return 0; + default: + break; + } + + GLUE_LOG(WARNING, "getsockopt(%d) with level = SOL_TCP, optname = %d", + sock2fd(sk), optname); + errno = EOPNOTSUPP; + return -1; +} + +static int +tcp_getname(struct sock *sk, struct sockaddr *addr, int peer) +{ + int rc; + int addrlen; + struct tle_tcp_stream_addr a; + + rc = tle_tcp_stream_get_addr(sk->s, &a); + if (rc) { + errno = -rc; + return -1; + } + + if (a.local.ss_family == AF_INET) + addrlen = sizeof(struct sockaddr_in); + else + addrlen = sizeof(struct sockaddr_in6); + + if (peer) + memcpy(addr, &a.remote, addrlen); + else + memcpy(addr, &a.local, addrlen); + + addr->sa_family = a.local.ss_family; + + return 0; +} + +static int +tcp_bind(struct sock *sk, const struct sockaddr *addr) +{ + sk->s = open_bind(sk, addr, NULL); + if (sk->s == NULL) + return -1; + return 0; +} + +static int +tcp_listen(struct sock *sk, int backlog) +{ + int32_t rc; + + if (backlog < 0) { + errno = EINVAL; + return -1; + } + + /* + * if socket is unbind, should call open_bind to assign an ramdon addres + * before listening + */ + if (sk->s == NULL) { + sk->s = open_bind(sk, NULL, NULL); + if (sk->s == NULL) + return -1; + } + + rc = tle_tcp_stream_listen(sk->s); + if (rc) { + errno = -rc; + return -1; + } + + return 0; +} + +static int +tcp_connect(struct sock *sk, const struct sockaddr *addr) +{ + int rc; + int rx; + int ret; + struct epoll_event event; + struct sockaddr_storage laddr; + struct sockaddr_storage raddr; + struct sockaddr_in *addr4; + struct sockaddr_in6 *addr6; + struct sockaddr *local = NULL; + + /* TODO: For multi-thread case, we shall properly manage local + * L4 port so that packets coming back can be put into the same + * queue pair. 
+ */ + if (sk->s) { + struct tle_tcp_stream *ts = TCP_STREAM(sk->s); + /* case 1: bind happens before connect; + * case 2: connect after a previous connect, failed + * or succeeded. + */ + if (ts->tcb.err != 0) { + errno = ts->tcb.err; + return -1; + } + + int state = ts->tcb.state; + + if (state >= TCP_ST_ESTABLISHED && sk->tcp_connected == 0) { + sk->tcp_connected = 1; + return 0; /* connect succeeds */ + } + + if (state == TCP_ST_CLOSED) { + if (tcp_getname(sk, (struct sockaddr *)&laddr, 0) == 0) + local = (struct sockaddr *)&laddr; + tle_tcp_stream_close(sk->s); + sk->s = NULL; + goto do_connect; /* case 1 */ + } else if (state >= TCP_ST_SYN_SENT && + state < TCP_ST_ESTABLISHED) + errno = EALREADY; + else if (state >= TCP_ST_ESTABLISHED) + errno = EISCONN; + else + errno = EINVAL; + return -1; + } + +do_connect: + sk->s = open_bind(sk, local, addr); + if (sk->s == NULL) /* errno is set */ + return -1; + + if (sk->domain == AF_INET) { + addr4 = (struct sockaddr_in*)&raddr; + addr4->sin_family = AF_INET; + addr4->sin_port = sk->s->port.src; + addr4->sin_addr.s_addr = sk->s->ipv4.addr.src; + } else { + addr6 = (struct sockaddr_in6*)&raddr; + addr6->sin6_family = AF_INET6; + addr6->sin6_port = sk->s->port.src; + rte_memcpy(&addr6->sin6_addr, &sk->s->ipv6.addr.src, + sizeof(struct in6_addr)); + } + rc = tle_tcp_stream_connect(sk->s, (const struct sockaddr*)&raddr); + if (rc < 0) { + errno = -rc; + return -1; + } + + if (is_nonblock(sk, 0)) { + be_tx_with_lock(CTX(sk)); + errno = EINPROGRESS; /* It could not be ready so fast */ + return -1; + } + + do { + be_process(CTX(sk)); + + if (tle_event_state(&sk->txev) == TLE_SEV_UP) { + sk->tcp_connected = 1; + tle_event_down(&sk->txev); + ret = 0; + break; + } + + if (tle_event_state(&sk->erev) == TLE_SEV_UP) { + tle_event_down(&sk->erev); + errno = ECONNREFUSED; + ret = -1; + break; + } + + /* fix me: timeout? 
*/ + epoll_kernel_wait(CTX(sk), -1, &event, 1, 1, &rx); + } while (1); + + return ret; +} + +static void tcp_update_cfg(struct sock *sk); + +static int +tcp_accept(struct sock *sk, struct sockaddr *addr, + socklen_t *addrlen, int flags) +{ + int fd; + int rx; + struct sock *newsk; + struct tle_stream *rs; + struct epoll_event event; + struct tle_tcp_stream_addr a; + + if (sk->s == NULL) { + errno = EINVAL; + return -1; + } + + fd = get_unused_fd(); + if (fd < 0) { + errno = ENFILE; + return -1; + } + + newsk = fd2sock(fd); +again: + if (tle_tcp_stream_accept(sk->s, &rs, 1) == 0) { + if (rte_errno != EAGAIN) { + errno = rte_errno; + return -1; + } + + if (is_nonblock(sk, flags)) { + newsk->valid = 0; + put_free_fd(fd); + errno = EAGAIN; + return -1; + } + + epoll_kernel_wait(CTX(sk), -1, &event, 1, 1, &rx); + be_process(CTX(sk)); + goto again; + } + + newsk->s = rs; + newsk->cid = sk->cid; + newsk->domain = sk->domain; + newsk->proto = sk->proto; + newsk->option.raw = 0; + newsk->option.tcpquickack = 1; + newsk->option.mulloop = 1; + newsk->option.multtl = 1; + newsk->option.keepidle = 2 * 60 * 60; + newsk->option.keepintvl = 75; + newsk->option.keepcnt = 9; + newsk->s->option.raw = newsk->option.raw; + sock_alloc_events(newsk); + tcp_update_cfg(newsk); + + if (addr) { + /* We assume this function never fails */ + tle_tcp_stream_get_addr(rs, &a); + + *addrlen = sizeof(struct sockaddr_in); + memcpy(addr, &a.remote, *addrlen); + } + + GLUE_DEBUG("accept fd = %d", fd); + return fd; +} + +static ssize_t +tcp_send(struct sock *sk, struct rte_mbuf *pkt[], + uint16_t num, const struct sockaddr *dst_addr) +{ + uint16_t rc; + RTE_SET_USED(dst_addr); + + if (sk->s == NULL) { + errno = EPIPE; + return 0; + } + + rc = tle_tcp_stream_send(sk->s, pkt, num); + if (rc == 0) + errno = rte_errno; + return rc; +} + +static ssize_t +tcp_recv(struct tle_stream *s, struct rte_mbuf *pkt[], + uint16_t num, struct sockaddr *addr) +{ + uint16_t rc; + + RTE_SET_USED(addr); + + /* optimize me: 
merge multiple mbufs into one */ + rc = tle_tcp_stream_recv(s, pkt, num); + if (rc == 0) + errno = rte_errno; + + return rc; +} + +static ssize_t +tcp_readv(struct tle_stream *ts, struct msghdr *msg, int flags __rte_unused) +{ + ssize_t rc; + + rc = tle_tcp_stream_recvmsg(ts, msg); + if (rc < 0) + errno = rte_errno; + return rc; +} + +static ssize_t +tcp_writev(struct sock *sk, const struct iovec *iov, + int iovcnt, const struct sockaddr *dst_addr) +{ + ssize_t rc; + struct rte_mempool *mp = get_mempool_by_socket(0); /* fix me */ + + RTE_SET_USED(dst_addr); + + if (sk->s == NULL) { + errno = EPIPE; + return -1; + } + + rc = tle_tcp_stream_writev(sk->s, mp, iov, iovcnt); + if (rc < 0) + errno = rte_errno; + return rc; +} + +static int +tcp_shutdown(struct sock *sk, int how) +{ + int ret; + + /* Refer to linux/net/ipv4/tcp.c:tcp_shutdown() */ + if (how == SHUT_RD) + return 0; + + ret = tle_tcp_stream_shutdown(sk->s, how); + if (ret < 0) + errno = rte_errno; + else + be_tx_with_lock(CTX(sk)); /* Make sure fin is sent */ + return ret; + +} + +static void +tcp_update_cfg(struct sock *sk) +{ + struct tle_tcp_stream_cfg prm = {0}; + + prm.recv_ev = &sk->rxev; + prm.send_ev = &sk->txev; + prm.err_ev = &sk->erev; + tle_tcp_stream_update_cfg(&sk->s, &prm, 1); +} + +struct proto tcp_prot = { + .name = "TCP", + .setsockopt = tcp_setsockopt, + .getsockopt = tcp_getsockopt, + .getname = tcp_getname, + .bind = tcp_bind, + .listen = tcp_listen, + .connect = tcp_connect, + .accept = tcp_accept, + .recv = tcp_recv, + .send = tcp_send, + .readv = tcp_readv, + .writev = tcp_writev, + .shutdown = tcp_shutdown, + .close = tle_tcp_stream_close, + .update_cfg = tcp_update_cfg, +}; diff --git a/lib/libtle_glue/tle_glue.h b/lib/libtle_glue/tle_glue.h new file mode 100644 index 0000000..38357e4 --- /dev/null +++ b/lib/libtle_glue/tle_glue.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _TLE_GLUE_H_ +#define _TLE_GLUE_H_ + +#include +#include +#include +#include +#include + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef PRELOAD + +#define PRE(name) name + +#else + +#define PRE(name) tle_ ## name + +#endif + +void glue_init1(int argc, char **argv); + +/* epoll */ +int PRE(epoll_create)(int size); +int PRE(epoll_create1)(int flags); +int PRE(epoll_ctl)(int epfd, int op, int fd, struct epoll_event *event); +int PRE(epoll_wait)(int epfd, struct epoll_event *events, int maxevents, int timeout); +int PRE(epoll_pwait)(int epfd, struct epoll_event *events, + int maxevents, int timeout, const sigset_t *sigmask); + +/* for setup, settings, and destroy */ +int PRE(socket)(int domain, int type, int protocol); +int PRE(listen)(int sockfd, int backlog); +int PRE(bind)(int sockfd, const struct sockaddr *addr, socklen_t addrlen); +int PRE(accept)(int sockfd, struct sockaddr *addr, socklen_t *addrlen); +int PRE(accept4)(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags); +int PRE(connect)(int sockfd, const struct sockaddr *addr, socklen_t addrlen); +int PRE(getsockopt)(int sockfd, int level, int optname, + void *optval, socklen_t *optlen); +int PRE(setsockopt)(int sockfd, int level, int optname, + const void *optval, socklen_t optlen); +int PRE(getsockname)(int sockfd, struct sockaddr *addr, socklen_t 
*addrlen); +int PRE(getpeername)(int sockfd, struct sockaddr *addr, socklen_t *addrlen); +int PRE(fcntl)(int fd, int cmd, ... /* arg */ ); +int PRE(ioctl)(int d, unsigned long int request, ...); +int PRE(shutdown)(int sockfd, int how); +int PRE(close)(int fd); + +/* for recv */ +ssize_t PRE(recv)(int sockfd, void *buf, size_t len, int flags); +ssize_t PRE(recvfrom)(int sockfd, void *buf, size_t len, int flags, + struct sockaddr *src_addr, socklen_t *addrlen); +ssize_t PRE(recvmsg)(int sockfd, struct msghdr *msg, int flags); +ssize_t PRE(read)(int fd, void *buf, size_t count); +ssize_t PRE(readv)(int fd, const struct iovec *iov, int iovcnt); + +/* for send */ +ssize_t PRE(send)(int sockfd, const void *buf, size_t len, int flags); +ssize_t PRE(sendto)(int sockfd, const void *buf, size_t len, int flags, + const struct sockaddr *dest_addr, socklen_t addrlen); +ssize_t PRE(sendmsg)(int sockfd, const struct msghdr *msg, int flags); +ssize_t PRE(write)(int fd, const void *buf, size_t count); +ssize_t PRE(writev)(int fd, const struct iovec *iov, int iovcnt); + +/* advanced functions */ +ssize_t PRE(splice)(int fd_in, loff_t *off_in, int fd_out, + loff_t *off_out, size_t len, unsigned int flags); +ssize_t PRE(sendfile)(int out_fd, int in_fd, off_t *offset, size_t count); + +/* poll */ +int PRE(poll)(struct pollfd *fds, nfds_t nfds, int timeout); +int PRE(ppoll)(struct pollfd *fds, nfds_t nfds, + const struct timespec *tmo_p, const sigset_t *sigmask); + +/* select */ +int PRE(select)(int nfds, fd_set *readfds, fd_set *writefds, + fd_set *exceptfds, struct timeval *timeout); +int PRE(pselect)(int nfds, fd_set *readfds, fd_set *writefds, + fd_set *exceptfds, const struct timespec *timeout, + const sigset_t *sigmask); + +/* non-posix APIs */ +int fd_ready(int fd, int events); +void v_get_stats_snmp(unsigned long mibs[]); + +#ifdef __cplusplus +} +#endif + +#endif /* _TLE_GLUE_H_ */ diff --git a/lib/libtle_glue/udp.c b/lib/libtle_glue/udp.c new file mode 100644 index 
0000000..9f199bc
--- /dev/null
+++ b/lib/libtle_glue/udp.c
@@ -0,0 +1,419 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include "sym.h"
+#include "fd.h"
+#include "log.h"
+#include "util.h"
+#include "internal.h"
+#include "sock.h"
+
+/*
+ * No UDP-level socket option is implemented: every argument is ignored
+ * and 0 (success) is returned.
+ */
+static int
+udp_setsockopt(__rte_unused struct sock *sk, __rte_unused int optname,
+	       __rte_unused const void *optval, __rte_unused socklen_t optlen)
+{
+	return 0;
+}
+
+/*
+ * Counterpart of udp_setsockopt(): nothing is written to optval/optlen
+ * and 0 (success) is returned.
+ */
+static int
+udp_getsockopt(__rte_unused struct sock *sk, __rte_unused int optname,
+	       __rte_unused void *optval, __rte_unused socklen_t *optlen)
+{
+	return 0;
+}
+
+/*
+ * Copy the stream's local (peer == 0) or remote (peer != 0) address into
+ * `addr`. The number of bytes copied is get_sockaddr_len(sk->domain), so
+ * `addr` must be large enough for the socket's domain. sa_family is
+ * always taken from the local address.
+ * Returns 0 on success; -1 with errno set to the negated return value of
+ * tle_udp_stream_get_param() otherwise.
+ */
+static int
+udp_getname(struct sock *sk, struct sockaddr *addr, int peer)
+{
+	struct tle_udp_stream_param p;
+	size_t addrlen;
+	int rc;
+
+	rc = tle_udp_stream_get_param(sk->s, &p);
+	if (rc) {
+		errno = -rc;
+		return -1;
+	}
+
+	addrlen = get_sockaddr_len(sk->domain);
+	if (peer)
+		memcpy(addr, &p.remote_addr, addrlen);
+	else
+		memcpy(addr, &p.local_addr, addrlen);
+	addr->sa_family = p.local_addr.ss_family;
+	return 0;
+}
+
+/*
+ * Bind the socket to `addr` through open_bind().
+ * A socket can be bound only once: a second call fails with EINVAL.
+ * On success `ubind` is set and, when `addr` is a wildcard address,
+ * `ubindany` too (udp_connect() uses it to restore the wildcard when the
+ * association is dissolved).
+ * Returns 0 on success, -1 (errno set by open_bind()) on failure.
+ */
+static int
+udp_bind(struct sock *sk, const struct sockaddr *addr)
+{
+	if (sk->ubind) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	sk->s = open_bind(sk, addr, NULL);
+	if (sk->s != NULL) {
+		sk->ubind = 1;
+		if (is_any_addr(addr))
+			sk->ubindany = 1;
+		return 0;
+	}
+
+	return -1;
+}
+
+/*
+ * Associate the socket with remote address `addr`, or dissolve the
+ * association when addr->sa_family is AF_UNSPEC.
+ * - Bound socket: the current local address is kept (reset to the
+ *   wildcard on AF_UNSPEC if the socket was bound to it) and the stream
+ *   is re-targeted through open_bind().
+ * - Unbound socket: AF_UNSPEC closes the stream, if there is one, and
+ *   returns 0; any other family lets open_bind() choose the local side.
+ * Returns 0 on success, -1 on failure.
+ */
+static int
+udp_connect(struct sock *sk, const struct sockaddr *addr)
+{
+	struct sockaddr_storage laddr;
+
+	/* According to linux manual, connectionless sockets may dissolve the
+	 * association by connecting to an address with the sa_family member of
+	 * sockaddr set to AF_UNSPEC (supported on Linux since kernel 2.2).
+	 */
+	if (sk->ubind) {
+		if (udp_getname(sk, (struct sockaddr *)&laddr, 0))
+			return -1;
+		if (addr->sa_family == AF_UNSPEC) {
+			addr = NULL;
+			if (sk->ubindany)
+				set_any_addr((struct sockaddr *)&laddr);
+		}
+		sk->s = open_bind(sk, (const struct sockaddr *)&laddr, addr);
+	} else {
+		if (addr->sa_family == AF_UNSPEC) {
+			/* nothing to close on a socket that never got a stream */
+			if (sk->s != NULL)
+				tle_udp_stream_close(sk->s);
+			sk->s = NULL;
+			return 0;
+		}
+		sk->s = open_bind(sk, NULL, addr);
+	}
+
+	if (sk->s)
+		return 0;
+
+	return -1;
+}
+
+/*
+ * Normalise the destination of a send and make sure the socket has a
+ * stream.
+ * - An IPv4-mapped IPv6 destination is copied into `addr`, converted to
+ *   AF_INET there, and *p_dst_addr is redirected to that copy.
+ * - Without a stream, one is opened towards the destination
+ *   (EDESTADDRREQ when no destination was given).
+ * - With a stream: an IPv6 destination on an AF_INET socket is EINVAL;
+ *   an IPv4 destination on an AF_INET6 socket is accepted only when the
+ *   stream's local IPv6 address is unspecified, in which case the stream
+ *   is switched to TLE_V4; otherwise ENETUNREACH.
+ * Returns 0 on success, -1 with errno set on failure.
+ */
+static int
+udp_addr_prepare(struct sock *sk, const struct sockaddr **p_dst_addr,
+		 struct sockaddr_storage *addr)
+{
+	const struct sockaddr *dst_addr = *p_dst_addr;
+
+	if (dst_addr != NULL &&
+	    dst_addr->sa_family == AF_INET6 &&
+	    IN6_IS_ADDR_V4MAPPED(&((const struct sockaddr_in6 *)dst_addr)->sin6_addr)) {
+		rte_memcpy(addr, dst_addr, sizeof(struct sockaddr_in6));
+		dst_addr = (const struct sockaddr*)(addr);
+		*p_dst_addr = dst_addr;
+		retrans_4mapped6_addr((struct sockaddr_storage*)(addr));
+	}
+
+	if (sk->s == NULL) {
+		if (dst_addr == NULL) {
+			errno = EDESTADDRREQ;
+			return -1;
+		}
+
+		sk->s = open_bind(sk, NULL, dst_addr);
+		if (sk->s == NULL) /* errno is set */
+			return -1;
+	} else if (dst_addr != NULL) {
+		if (dst_addr->sa_family == AF_INET6 && sk->domain == AF_INET) {
+			errno = EINVAL;
+			return -1;
+		}
+		if (dst_addr->sa_family == AF_INET && sk->domain == AF_INET6) {
+			if (IN6_IS_ADDR_UNSPECIFIED(&sk->s->ipv6.addr.dst)) {
+				sk->s->type = TLE_V4;
+				sk->s->ipv4.addr.dst = 0;
+			} else {
+				errno = ENETUNREACH;
+				return -1;
+			}
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Extract the sender's address and port from the headers of `m` into
+ * `addr`. The UDP and IP headers are read at negative offsets from the
+ * mbuf data pointer, using m->l4_len and m->l3_len. The IP version
+ * nibble selects whether a sockaddr_in or a sockaddr_in6 is written, so
+ * `addr` must have room for a sockaddr_in6. `family` is unused.
+ */
+static inline void
+udp_pkt_addr(const struct rte_mbuf *m, struct sockaddr *addr,
+	     __rte_unused uint16_t family)
+{
+	const struct ipv4_hdr *ip4h;
+	const struct ipv6_hdr *ip6h;
+	const struct udp_hdr *udph;
+	struct sockaddr_in *in4;
+	struct sockaddr_in6 *in6;
+	int off = -(m->l4_len + m->l3_len);
+
+	udph = rte_pktmbuf_mtod_offset(m, struct udp_hdr *, -m->l4_len);
+	ip4h = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *, off);
+	if ((ip4h->version_ihl>>4) == 4) {
+		addr->sa_family = AF_INET;
+		in4 = (struct sockaddr_in *)addr;
+		in4->sin_port = udph->src_port;
+		in4->sin_addr.s_addr = ip4h->src_addr;
+	} else {
+		addr->sa_family = AF_INET6;
+		ip6h = (const struct ipv6_hdr*)ip4h;
+		in6 = (struct sockaddr_in6 *)addr;
+		in6->sin6_port = udph->src_port;
+		rte_memcpy(&in6->sin6_addr, ip6h->src_addr,
+			   sizeof(in6->sin6_addr));
+	}
+}
+
+/*
+ * Send `num` mbufs as a single datagram.
+ * The mbufs are chained behind pkt[0] (pkt_len and nb_segs of pkt[0] are
+ * updated) and handed to tle_udp_stream_send() as one packet.
+ * Returns `num` on success and 0 on failure, with errno set by
+ * udp_addr_prepare() or from rte_errno.
+ */
+static ssize_t
+udp_send(struct sock *sk, struct rte_mbuf *pkt[],
+	 uint16_t num, const struct sockaddr *dst_addr)
+{
+	uint16_t i;
+	struct sockaddr_storage addr;
+
+	if (udp_addr_prepare(sk, &dst_addr, &addr) != 0)
+		return 0;
+
+	/* chain them together as *one* message */
+	for (i = 1; i < num; ++i) {
+		pkt[i-1]->next = pkt[i];
+		pkt[0]->pkt_len += pkt[i]->pkt_len;
+	}
+	pkt[0]->nb_segs = num;
+
+	if (tle_udp_stream_send(sk->s, &pkt[0], 1, dst_addr) == 0) {
+		errno = rte_errno;
+		return 0;
+	}
+
+	return num;
+}
+
+/*
+ * recvmsg()-style read of one datagram from stream `s`.
+ * One packet is taken from the stream and copied into the iovec array of
+ * `msg`. When requested through msg_name / msg_control, the sender
+ * address and the timestamp control data are filled in as well.
+ * If the buffers are too small (or there are none) the rest of the
+ * datagram is dropped and MSG_TRUNC is reported in msg_flags; with
+ * MSG_TRUNC set in `flags` the dropped length is added to the return
+ * value.
+ * `msg` may be NULL, in which case the datagram is consumed and
+ * discarded.
+ * Returns the byte count, or -1 with errno = rte_errno when no packet is
+ * available.
+ */
+static ssize_t
+udp_readv(struct tle_stream *s, struct msghdr *msg, int flags)
+{
+	int i;
+	ssize_t sz;
+	uint16_t rc;
+	uint32_t fin = 0;	/* stays 0 when there is no buffer at all */
+	struct iovec iv;
+	struct rte_mbuf *m;
+	const struct iovec *iov = NULL;
+	int iovcnt = 0;
+
+	/* the checks below allow a NULL msg, so do not dereference it blindly */
+	if (msg != NULL) {
+		iov = msg->msg_iov;
+		iovcnt = msg->msg_iovlen;
+	}
+
+	rc = tle_udp_stream_recv(s, &m, 1);
+	if (rc == 0) {
+		errno = rte_errno;
+		return -1;
+	}
+
+	if (!s->option.timestamp)
+		s->timestamp = m->timestamp;
+	if (msg != NULL && msg->msg_control != NULL) {
+		if (s->option.timestamp)
+			tle_set_timestamp(msg, m);
+		else
+			msg->msg_controllen = 0;
+	}
+
+	if (msg != NULL && msg->msg_name != NULL) {
+		udp_pkt_addr(m, (struct sockaddr*)msg->msg_name, 0);
+		if (((struct sockaddr *)msg->msg_name)->sa_family == AF_INET)
+			msg->msg_namelen = sizeof(struct sockaddr_in);
+		else
+			msg->msg_namelen = sizeof(struct sockaddr_in6);
+	}
+
+	/* NOTE(review): _mbus_to_iovec() presumably returns 1 once the mbuf
+	 * has been copied out completely and leaves the unused space of the
+	 * buffer in iv.iov_len -- confirm against its definition.
+	 */
+	for (i = 0, sz = 0; i != iovcnt; i++) {
+		iv = iov[i];
+		sz += iv.iov_len;
+		fin = _mbus_to_iovec(&iv, &m, 1);
+		if (fin == 1) {
+			sz -= iv.iov_len;
+			break;
+		}
+	}
+	if (fin == 0) {
+		if (flags & MSG_TRUNC)
+			sz += m->pkt_len;
+		rte_pktmbuf_free_seg(m);
+		if (msg != NULL)
+			msg->msg_flags |= MSG_TRUNC;
+	}
+	return sz;
+}
+
+/*
+ * writev()-style send of one datagram.
+ * The iovec contents are copied into freshly allocated mbufs, which are
+ * chained and handed to tle_udp_stream_send() as one message. The first
+ * mbuf carries 8 bytes less payload, leaving room for the UDP header.
+ * Returns the number of payload bytes sent, or -1 with errno set:
+ *   EMSGSIZE - payload does not fit the 16-bit UDP length field;
+ *   ENOMEM   - mbuf allocation failed;
+ *   otherwise errno comes from udp_addr_prepare() or rte_errno.
+ */
+static ssize_t
+udp_writev(struct sock *sk, const struct iovec *iov,
+	   int iovcnt, const struct sockaddr *dst_addr)
+{
+	struct rte_mempool *mp = get_mempool_by_socket(0); /* fix me */
+	/* 16-bit UDP length field minus the 8-byte UDP header */
+	const size_t max_payload = UINT16_MAX - 8;
+	struct sockaddr_storage addr;
+	uint32_t slen, left_m, left_b, copy_len, left;
+	uint16_t i, rc, nb_mbufs;
+	char *dst, *src;
+	uint64_t ufo;
+	size_t total;
+	int j;
+
+	if (udp_addr_prepare(sk, &dst_addr, &addr) != 0)
+		return -1;
+
+	for (j = 0, total = 0; j < iovcnt; ++j) {
+		/* compare before adding so that `total` can never wrap;
+		 * this also keeps nb_mbufs within uint16_t below.
+		 */
+		if (iov[j].iov_len > max_payload - total) {
+			errno = EMSGSIZE;
+			return -1;
+		}
+		total += iov[j].iov_len;
+	}
+
+	ufo = tx_offload & DEV_TX_OFFLOAD_UDP_TSO;
+	if (ufo)
+		slen = RTE_MBUF_DEFAULT_DATAROOM;
+	else
+		slen = 1500 - 20; /* mtu - ip_hdr_len */
+
+	nb_mbufs = (total + 8 + slen - 1) / slen;
+	struct rte_mbuf *mbufs[nb_mbufs];
+	if (unlikely(rte_pktmbuf_alloc_bulk(mp, mbufs, nb_mbufs) != 0)) {
+		errno = ENOMEM;
+		return -1;
+	}
+
+	/* iov[0] does not exist when the caller passed no buffer */
+	left_b = (iovcnt > 0) ? iov[0].iov_len : 0;
+	for (i = 0, j = 0; i < nb_mbufs && j < iovcnt; ++i) {
+		/* first frag has udp hdr, its payload is 8 bytes less */
+		if (i == 0)
+			slen -= 8;
+		else if (i == 1)
+			slen += 8;
+		left_m = slen;
+		while (left_m > 0 && j < iovcnt) {
+			copy_len = RTE_MIN(left_m, left_b);
+			dst = rte_pktmbuf_mtod_offset(mbufs[i], char *,
+						      slen - left_m);
+			src = (char *)iov[j].iov_base + iov[j].iov_len - left_b;
+			rte_memcpy(dst, src, copy_len);
+
+			left_m -= copy_len;
+			left_b -= copy_len;
+			if (left_b == 0) {
+				j++;
+				/* do not read one element past the array */
+				if (j < iovcnt)
+					left_b = iov[j].iov_len;
+			}
+		}
+		mbufs[i]->data_len = slen;
+		mbufs[i]->pkt_len = slen;
+	}
+
+	/* last seg */
+	if (nb_mbufs == 1) {
+		mbufs[nb_mbufs - 1]->data_len = total;
+		mbufs[nb_mbufs - 1]->pkt_len = total;
+	} else {
+		mbufs[nb_mbufs - 1]->data_len = total - (nb_mbufs - 1) * slen + 8;
+		mbufs[nb_mbufs - 1]->pkt_len = total - (nb_mbufs - 1) * slen + 8;
+	}
+
+	/* chain as *one* message */
+	for (i = 1; i < nb_mbufs; ++i)
+		mbufs[i-1]->next = mbufs[i];
+	mbufs[0]->nb_segs = nb_mbufs;
+	mbufs[0]->pkt_len = total;
+	nb_mbufs = 1;
+
+	rc = tle_udp_stream_send(sk->s, mbufs, nb_mbufs, dst_addr);
+	/* whatever the stream did not take is still ours to free */
+	for (i = rc, left = 0; i < nb_mbufs; ++i) {
+		left += mbufs[i]->pkt_len;
+		rte_pktmbuf_free(mbufs[i]);
+	}
+
+	if (rc == 0) {
+		errno = rte_errno;
+		return -1;
+	}
+
+	return total - left;
+}
+
+/*
+ * Receive up to `num` packets from stream `s` into `pkt`.
+ * The sender address is written to `addr` only when exactly one packet
+ * was asked for and received.
+ * Returns the number of packets received; when it is 0, errno is set
+ * from rte_errno.
+ */
+static ssize_t
+udp_recv(struct tle_stream *s, struct rte_mbuf *pkt[], uint16_t num,
+	 struct sockaddr *addr)
+{
+	uint16_t rc;
+
+	rc = tle_udp_stream_recv(s, pkt, num);
+	if (addr && num == 1 && rc == 1)
+		udp_pkt_addr(pkt[0], addr, 0);
+
+	if (rc == 0)
+		errno = rte_errno;
+	return rc;
+}
+
+/*
+ * Point the stream's recv/send events at the event objects embedded in
+ * `sk`, via tle_udp_stream_update_cfg(). Every other parameter field is
+ * passed as zero.
+ */
+static void
+udp_update_cfg(struct sock *sk)
+{
+	struct tle_udp_stream_param prm;
+	memset(&prm, 0, sizeof(prm));
+
+	prm.recv_ev = &sk->rxev;
+	prm.send_ev = &sk->txev;
+
+	tle_udp_stream_update_cfg(&sk->s, &prm, 1);
+}
+
+/*
+ * shutdown() handler for UDP sockets.
+ * Fails with ENOTCONN when the socket has no stream; otherwise forwards
+ * to tle_udp_stream_shutdown() and maps a negative return value to
+ * errno.
+ * Returns 0 on success, -1 on failure.
+ */
+static int
+udp_shutdown(struct sock *sk, int how)
+{
+	int rc;
+
+	if (sk->s == NULL) {
+		errno = ENOTCONN;
+		return -1;
+	}
+
+	rc = tle_udp_stream_shutdown(sk->s, how);
+	if (rc < 0) {
+		errno = -rc;
+		return -1;
+	}
+	return 0;
+}
+
+/* Protocol operations table for UDP sockets. */
+struct proto udp_prot = {
+	.name			= "UDP",
+	.setsockopt		= udp_setsockopt,
+	.getsockopt		= udp_getsockopt,
+	.getname		= udp_getname,
+	.bind			= udp_bind,
+	.connect		= udp_connect,
+	.recv			= udp_recv,
+	.send			= udp_send,
+	.readv			= udp_readv,
+	.writev			= udp_writev,
+	.shutdown		= udp_shutdown,
+	.close			= tle_udp_stream_close,
+	.update_cfg		= udp_update_cfg,
+};
diff --git a/lib/libtle_glue/util.c b/lib/libtle_glue/util.c
new file mode 100644
index 0000000..69fc555
--- /dev/null
+++ b/lib/libtle_glue/util.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include
+#include
+#include
+
+#include "util.h"
+
+#define NUMA_NODE_PATH "/sys/devices/system/node"
+
+/*
+ * Find the NUMA node of `lcore_id` by probing sysfs: the first node in
+ * [0, RTE_MAX_NUMA_NODES) for which
+ * NUMA_NODE_PATH/node<N>/cpu<lcore_id> exists is returned.
+ * Falls back to 0 when no node lists the cpu.
+ */
+static unsigned
+eal_cpu_socket_id(unsigned lcore_id)
+{
+	char node_path[PATH_MAX];
+	unsigned node = 0;
+
+	while (node < RTE_MAX_NUMA_NODES) {
+		snprintf(node_path, sizeof(node_path), "%s/node%u/cpu%u",
+			 NUMA_NODE_PATH, node, lcore_id);
+		if (access(node_path, F_OK) == 0)
+			return node;
+		node++;
+	}
+
+	return 0;
+}
+
+/*
+ * Return the NUMA node of the calling thread, taken from the
+ * lowest-numbered cpu in its affinity mask.
+ * Returns 0 when the affinity mask cannot be read.
+ */
+uint32_t
+get_socket_id(void)
+{
+	cpu_set_t affinity;
+	uint32_t cpu = 0;
+
+	CPU_ZERO(&affinity);
+	if (pthread_getaffinity_np(pthread_self(),
+				   sizeof(affinity), &affinity) != 0)
+		return 0;
+
+	/* stop at the first cpu present in the mask */
+	while (cpu < CPU_SETSIZE && !CPU_ISSET(cpu, &affinity))
+		cpu++;
+
+	return eal_cpu_socket_id(cpu);
+}
diff --git a/lib/libtle_glue/util.h b/lib/libtle_glue/util.h
new file mode 100644
index 0000000..ac67d8b
--- /dev/null
+++ b/lib/libtle_glue/util.h
@@ -0,0 +1,377 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _TLE_GLUE_UTIL_H_
+#define _TLE_GLUE_UTIL_H_
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include "../libtle_l4p/tcp_stream.h"
+
+#include "fd.h"
+#include "ctx.h"
+#include "sock.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * strdup() that never returns NULL: panics through rte_panic() when the
+ * copy cannot be allocated. The caller owns (and frees) the result.
+ */
+static inline void *
+xstrdup(const void *old)
+{
+	/* not named `new`: this header is wrapped in extern "C" for C++ */
+	void *copy = strdup(old);
+	if (unlikely(copy == NULL))
+		rte_panic("Failed to strdup");
+	return copy;
+}
+
+/*
+ * malloc() that never returns NULL: a zero size is rounded up to one
+ * byte and an allocation failure panics through rte_panic().
+ */
+static inline void *
+xmalloc(size_t size)
+{
+	void *p = malloc(size ? size : 1);
+	if (p == NULL)
+		rte_panic("Failed to malloc");
+	return p;
+}
+
+/*
+ * vsnprintf() into a freshly allocated, exactly sized string.
+ * A first pass measures the output, a second pass (on a copy of `args`)
+ * writes it. Panics when the format cannot be processed, so the result
+ * is always a valid NUL-terminated string that the caller must free().
+ */
+static inline char *
+xvasprintf(const char *format, va_list args)
+{
+	va_list args2;
+	int needed;
+	char *s;
+
+	va_copy(args2, args);
+	needed = vsnprintf(NULL, 0, format, args);
+	/* vsnprintf() reports errors as a negative value; stored in a
+	 * size_t it would turn into a bogus one-byte allocation.
+	 */
+	if (unlikely(needed < 0))
+		rte_panic("Failed to format string");
+
+	s = xmalloc((size_t)needed + 1);
+
+	vsnprintf(s, (size_t)needed + 1, format, args2);
+	va_end(args2);
+
+	return s;
+}
+
+/*
+ * printf()-style front end of xvasprintf(): returns a newly allocated
+ * formatted string that the caller must free().
+ */
+static inline char *
+xasprintf(const char *format, ...)
+{
+	va_list args;
+	char *s;
+
+	va_start(args, format);
+	s = xvasprintf(format, args);
+	va_end(args);
+
+	return s;
+}
+
+/*
+ * Resize `argv` so that it can hold cur_siz + grow_by pointers.
+ * Panics when the reallocation fails; returns the (possibly moved)
+ * array.
+ */
+static inline char **
+grow_argv(char **argv, size_t cur_siz, size_t grow_by)
+{
+	char **p;
+
+	p = realloc(argv, sizeof(char *) * (cur_siz + grow_by));
+	if (unlikely(p == NULL))
+		rte_panic("Failed to grow argv");
+	return p;
+}
+
+/*
+ * Free the first `argc` strings of `argv_to_release`, then both the
+ * `argv_to_release` and the `argv` arrays themselves.
+ */
+static inline void
+release_argv(int argc, char **argv_to_release, char **argv)
+{
+	int i;
+
+	for (i = 0; i < argc; ++i)
+		free(argv_to_release[i]);
+
+	free(argv_to_release);
+	free(argv);
+}
+
+/* Bind event `ev` to queue `evq` and attach the user pointer `data`. */
+static inline void
+tle_event_attach(struct tle_event *ev, struct tle_evq *evq, const void *data)
+{
+	ev->head = evq;
+	ev->data = data;
+}
+
+/*
+ * Attach the error/rx/tx events of `so` to the matching queues of its
+ * context and move them to TLE_SEV_DOWN. With LOOK_ASIDE_BACKEND defined
+ * only the error event is moved.
+ */
+static inline void
+sock_alloc_events(struct sock *so)
+{
+	tle_event_attach(&so->erev, CTX(so)->ereq, so);
+	tle_event_attach(&so->rxev, CTX(so)->rxeq, so);
+	tle_event_attach(&so->txev, CTX(so)->txeq, so);
+	tle_event_active(&so->erev, TLE_SEV_DOWN);
+#ifndef LOOK_ASIDE_BACKEND
+	tle_event_active(&so->rxev, TLE_SEV_DOWN);
+	tle_event_active(&so->txev, TLE_SEV_DOWN);
+#endif
+}
+
+/* Move the error, rx and tx events of `so` to TLE_SEV_DOWN. */
+static inline void
+sock_active_events(struct sock *so)
+{
+	tle_event_active(&so->erev, TLE_SEV_DOWN);
+	tle_event_active(&so->rxev, TLE_SEV_DOWN);
+	tle_event_active(&so->txev, TLE_SEV_DOWN);
+}
+
+/*
+ * Pick the local IPv6 address used to reach `remote` (which must point
+ * to a sockaddr_in6): the loopback address for a loopback destination,
+ * the context's IPv6 address otherwise.
+ */
+static inline const struct in6_addr*
+select_local_addr_v6(const struct sockaddr *remote, struct glue_ctx *ctx)
+{
+	/* todo: implement route table to decide local address */
+
+	if (IN6_IS_ADDR_LOOPBACK(&((const struct sockaddr_in6 *)remote)
+				 ->sin6_addr))
+		return &in6addr_loopback;
+	else
+		return &ctx->ipv6;
+}
+
+/*
+ * IPv4 counterpart of select_local_addr_v6(); `remote` must point to a
+ * sockaddr_in. The address is returned in network byte order.
+ */
+static inline in_addr_t
+select_local_addr(const struct sockaddr *remote, struct glue_ctx *ctx)
+{
+	/* todo: implement route table to decide local address */
+	in_addr_t remote_addr;
+
+	remote_addr = ((const struct sockaddr_in*)remote)->sin_addr.s_addr;
+	if (remote_addr == htonl(INADDR_LOOPBACK))
+		return htonl(INADDR_LOOPBACK);
+	else
+		return ctx->ipv4;
+}
+
+/*
+ * True when `addr` is the IPv4 or IPv6 wildcard address; false for any
+ * other address and for any other address family.
+ */
+static inline bool
+is_any_addr(const struct sockaddr *addr)
+{
+	const struct sockaddr_in *addr4;
+	const struct sockaddr_in6 *addr6;
+
+	if (addr->sa_family == AF_INET) {
+		addr4 = (const struct sockaddr_in *)addr;
+		if (addr4->sin_addr.s_addr == htonl(INADDR_ANY))
+			return true;
+		else
+			return false;
+	} else if (addr->sa_family == AF_INET6) {
+		addr6 = (const struct sockaddr_in6 *)addr;
+		if (IN6_IS_ADDR_UNSPECIFIED(&addr6->sin6_addr))
+			return true;
+		else
+			return false;
+	} else
+		return false;
+}
+
+/*
+ * Overwrite the address part of `addr` with the wildcard address of its
+ * family; family and port are kept. Other families are left untouched.
+ */
+static inline void
+set_any_addr(struct sockaddr *addr)
+{
+	struct sockaddr_in *addr4;
+	struct sockaddr_in6 *addr6;
+
+	if (addr->sa_family == AF_INET) {
+		addr4 = (struct sockaddr_in *)addr;
+		addr4->sin_addr.s_addr = htonl(INADDR_ANY);
+	} else if (addr->sa_family == AF_INET6) {
+		addr6 = (struct sockaddr_in6 *)addr;
+		addr6->sin6_addr = in6addr_any;
+	}
+}
+
+/* transform an IPv4 address(in struct sockaddr_in) to
+ * an IPv4 mapped IPv6 address(in struct sockaddr_in6), in place.
+ * The storage behind `addr` must be large enough for a sockaddr_in6.
+ * The port is kept; flow info and scope id are cleared. Addresses of any
+ * other family are left untouched.
+ */
+static inline void
+trans_4mapped6_addr(struct sockaddr *addr)
+{
+	struct sockaddr_in6 *addr6;
+	in_addr_t v4;
+
+	if (addr->sa_family != AF_INET)
+		return;
+
+	/* fetch the IPv4 address before any sockaddr_in6 field is written:
+	 * the two structures share the same storage.
+	 */
+	v4 = ((struct sockaddr_in*)addr)->sin_addr.s_addr;
+
+	addr6 = (struct sockaddr_in6*)addr;
+	addr6->sin6_family = AF_INET6;
+	addr6->sin6_flowinfo = 0;
+	addr6->sin6_addr.s6_addr32[0] = 0;
+	addr6->sin6_addr.s6_addr32[1] = 0;
+	/* ::ffff:a.b.c.d -- the marker word is in network byte order */
+	addr6->sin6_addr.s6_addr32[2] = htonl(0x0000ffff);
+	addr6->sin6_addr.s6_addr32[3] = v4;
+	addr6->sin6_scope_id = 0;
+}
+
+/* transform an IPv4 mapped IPv6 address(in struct sockaddr_in6) to
+ * an IPv4 address(in struct sockaddr_in), in place.
+ * Only AF_INET6 addresses that really are IPv4-mapped are converted;
+ * everything else is left untouched. The port is kept.
+ */
+static inline void
+retrans_4mapped6_addr(struct sockaddr_storage * addr)
+{
+	struct in6_addr* addr6;
+
+	if (addr->ss_family != AF_INET6)
+		return;
+
+	addr6 = &((struct sockaddr_in6*)addr)->sin6_addr;
+	if(IN6_IS_ADDR_V4MAPPED(addr6)) {
+		addr->ss_family = AF_INET;
+		((struct sockaddr_in*)addr)->sin_addr.s_addr = addr6->s6_addr32[3];
+	}
+}
+
+/*
+ * Open a stream for `so`; for a UDP socket that already has a stream,
+ * re-target it with tle_udp_stream_set().
+ * `local` and `remote` may each be NULL. The steps are:
+ *  - IPv4-mapped IPv6 addresses are converted to plain IPv4;
+ *  - a wildcard remote address is replaced by the loopback address;
+ *  - a missing side inherits the family of the other side, or the
+ *    socket's domain when both are missing;
+ *  - when the families differ, an unspecified IPv6 local address is
+ *    converted to IPv4; an IPv4 local address yields EINVAL and any other
+ *    IPv6 local address yields ENETUNREACH;
+ *  - a wildcard local address is resolved with select_local_addr*() when
+ *    a remote address is given; an explicit local address must be the
+ *    context's address, loopback or wildcard, else EADDRNOTAVAIL.
+ * Returns the stream, or NULL with errno set (from rte_errno when the
+ * stack itself refused the request).
+ */
+static inline struct tle_stream *
+open_bind(struct sock *so, const struct sockaddr *local,
+	  const struct sockaddr *remote)
+{
+	struct tle_stream *s;
+	struct sockaddr_storage *l, *r;
+	struct sockaddr_in *addr4;
+	struct sockaddr_in6 *addr6;
+	struct tle_tcp_stream_param pt = {0};
+	struct tle_udp_stream_param pu = {0};
+
+	if (IS_TCP(so)) {
+		pt.option = so->option.raw;
+		l = &pt.addr.local;
+		r = &pt.addr.remote;
+		pt.cfg.err_ev = &so->erev;
+		pt.cfg.recv_ev = &so->rxev;
+		pt.cfg.send_ev = &so->txev;
+	} else {
+		pu.option = so->option.raw;
+		l = &pu.local_addr;
+		r = &pu.remote_addr;
+		pu.recv_ev = &so->rxev;
+		pu.send_ev = &so->txev;
+	}
+
+	if (remote) {
+		memcpy(r, remote, get_sockaddr_len(remote->sa_family));
+		retrans_4mapped6_addr(r);
+		if(r->ss_family == AF_INET) {
+			addr4 = (struct sockaddr_in*)r;
+			if (addr4->sin_addr.s_addr == 0)
+				addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+		} else {
+			addr6 = (struct sockaddr_in6*)r;
+			if (IN6_IS_ADDR_UNSPECIFIED(&addr6->sin6_addr))
+				rte_memcpy(&addr6->sin6_addr, &in6addr_loopback,
+					   sizeof(struct in6_addr));
+		}
+	}
+
+	if (local) {
+		memcpy(l, local, get_sockaddr_len(local->sa_family));
+		retrans_4mapped6_addr(l);
+	} else {
+		if (remote)
+			l->ss_family = r->ss_family;
+		else
+			l->ss_family = so->domain;
+	}
+
+	if (!remote)
+		r->ss_family = l->ss_family;
+
+	/* Endpoints of stream have different socket families */
+	if (r->ss_family != l->ss_family) {
+		if (l->ss_family == AF_INET) {
+			errno = EINVAL;
+			return NULL;
+		} else {
+			/* if local addr is unbound, convert into remote family */
+			if (IN6_IS_ADDR_UNSPECIFIED(&((struct sockaddr_in6*)l)->sin6_addr)) {
+				l->ss_family = AF_INET;
+				((struct sockaddr_in*)l)->sin_addr.s_addr = 0;
+			} else {
+				errno = ENETUNREACH;
+				return NULL;
+			}
+		}
+	}
+
+	if (l->ss_family == AF_INET) {
+		addr4 = (struct sockaddr_in*)l;
+		if (addr4->sin_addr.s_addr == htonl(INADDR_ANY) && remote) {
+			addr4->sin_addr.s_addr =
+				select_local_addr((struct sockaddr*)r, CTX(so));
+			if (addr4->sin_addr.s_addr == htonl(INADDR_ANY)) {
+				errno = EADDRNOTAVAIL;
+				return NULL;
+			}
+		}
+		else if (addr4->sin_addr.s_addr != CTX(so)->ipv4 &&
+			 addr4->sin_addr.s_addr != htonl(INADDR_LOOPBACK) &&
+			 addr4->sin_addr.s_addr != htonl(INADDR_ANY)) {
+			errno = EADDRNOTAVAIL;
+			return NULL;
+		}
+	} else {
+		addr6 = (struct sockaddr_in6 *)l;
+		if (IN6_IS_ADDR_UNSPECIFIED(&addr6->sin6_addr) && remote) {
+			memcpy(&addr6->sin6_addr,
+			       select_local_addr_v6((struct sockaddr*)r, CTX(so)),
+			       sizeof(struct in6_addr));
+			if (IN6_IS_ADDR_UNSPECIFIED(&addr6->sin6_addr)) {
+				errno = EADDRNOTAVAIL;
+				return NULL;
+			}
+		}
+		else if (memcmp(&addr6->sin6_addr, &CTX(so)->ipv6,
+				sizeof(struct in6_addr)) != 0 &&
+			 (!IN6_IS_ADDR_LOOPBACK(&addr6->sin6_addr)) &&
+			 (!IN6_IS_ADDR_UNSPECIFIED(&addr6->sin6_addr))) {
+			errno = EADDRNOTAVAIL;
+			return NULL;
+		}
+	}
+
+	if (IS_TCP(so))
+		s = tle_tcp_stream_open(CTX(so)->tcp_ctx, &pt);
+	else {
+		if (so->s == NULL)
+			s = tle_udp_stream_open(CTX(so)->udp_ctx, &pu);
+		else
+			s = tle_udp_stream_set(so->s, CTX(so)->udp_ctx, &pu);
+	}
+
+	if (s == NULL)
+		errno = rte_errno;
+
+	return s;
+}
+
+/*
+ * open_bind() on `local` followed by tle_tcp_stream_listen().
+ * When listen fails the freshly opened stream is closed again.
+ * Returns the listening stream, or NULL with errno set.
+ */
+static inline struct tle_stream *
+open_bind_listen(struct sock *so, const struct sockaddr *local)
+{
+	int rc;
+	struct tle_stream *s = open_bind(so, local, NULL);
+
+	if (s == NULL)
+		return NULL;
+
+	rc = tle_tcp_stream_listen(s);
+	if (rc != 0) {
+		tle_tcp_stream_close(s);
+		/* NOTE(review): assumes tle_tcp_stream_listen() reports the
+		 * reason as a negative errno value -- confirm. errno is set
+		 * after the close so the close cannot clobber it.
+		 */
+		errno = (rc < 0) ? -rc : rc;
+		return NULL;
+	}
+
+	return s;
+}
+
+uint32_t get_socket_id(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*_TLE_GLUE_UTIL_H_ */
diff --git a/lib/libtle_glue/zerocopy.h b/lib/libtle_glue/zerocopy.h
new file mode 100644
index 0000000..a37f8f5
--- /dev/null
+++ b/lib/libtle_glue/zerocopy.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _TLE_GLUE_ZEROCOPY_H_ +#define _TLE_GLUE_ZEROCOPY_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * This API performs a recv operation on the specified socket. It is + * optimized for zero copy: the caller does not need to prepare a + * buffer; instead, it gets a pointer to the received data on success. + * @param sockfd + * the file descriptor for the socket. + * @param buf + * after some payload has been received successfully, a pointer to + * the received buffer is stored in *buf. + * @return + * the number of bytes received, or -1 if an error occurred, or 0 + * if a stream socket peer has performed an orderly shutdown. + * + */ +ssize_t recv_zc(int sockfd, void **buf); + +/** + * This API performs a send operation on the specified socket. It is + * optimized for zero copy: the caller must not free, or even touch, + * the buffer after calling this API; the buffer will be freed after + * an ack from the socket peer. + * @param sockfd + * the file descriptor for the socket. + * @param buf + * The pointer to the payload buffer to be sent. + * @param len + * The length of the payload buffer to be sent. + * @return + * the number of bytes sent, or -1 if an error occurred.
+ */ +ssize_t send_zc(int sockfd, const void *buf, size_t len); + +#ifdef __cplusplus +} +#endif + +#endif /*_TLE_GLUE_ZEROCOPY_H_ */ diff --git a/lib/libtle_l4p/Makefile b/lib/libtle_l4p/Makefile index e1357d1..ee81d4a 100644 --- a/lib/libtle_l4p/Makefile +++ b/lib/libtle_l4p/Makefile @@ -45,6 +45,7 @@ SYMLINK-y-include += tle_ctx.h SYMLINK-y-include += tle_event.h SYMLINK-y-include += tle_tcp.h SYMLINK-y-include += tle_udp.h +SYMLINK-y-include += tle_stats.h # this lib dependencies DEPDIRS-y += lib/libtle_misc diff --git a/lib/libtle_l4p/ctx.c b/lib/libtle_l4p/ctx.c index b8067f0..d6bde48 100644 --- a/lib/libtle_l4p/ctx.c +++ b/lib/libtle_l4p/ctx.c @@ -21,9 +21,14 @@ #include #include "stream.h" +#include "stream_table.h" #include "misc.h" #include +struct tle_mib default_mib; + +RTE_DEFINE_PER_LCORE(struct tle_mib *, mib) = &default_mib; + #define LPORT_START 0x8000 #define LPORT_END MAX_PORT_NUM @@ -103,6 +108,16 @@ tle_ctx_create(const struct tle_ctx_param *ctx_prm) ctx->prm = *ctx_prm; + rc = bhash_init(ctx); + if (rc != 0) { + UDP_LOG(ERR, "create bhash table (ctx=%p, proto=%u) failed " + "with error code: %d;\n", + ctx, ctx_prm->proto, rc); + tle_ctx_destroy(ctx); + rte_errno = -rc; + return NULL; + } + rc = tle_stream_ops[ctx_prm->proto].init_streams(ctx); if (rc != 0) { UDP_LOG(ERR, "init_streams(ctx=%p, proto=%u) failed " @@ -114,9 +129,10 @@ tle_ctx_create(const struct tle_ctx_param *ctx_prm) } for (i = 0; i != RTE_DIM(ctx->use); i++) - tle_pbm_init(ctx->use + i, LPORT_START_BLK); + tle_psm_init(ctx->use + i); - ctx->streams.nb_free = ctx->prm.max_streams; + ctx->streams.nb_free = ctx->prm.min_streams; + ctx->streams.nb_cur = ctx->prm.min_streams; /* Initialization of siphash state is done here to speed up the * fastpath processing. 
@@ -124,6 +140,11 @@ tle_ctx_create(const struct tle_ctx_param *ctx_prm) if (ctx->prm.hash_alg == TLE_SIPHASH) siphash_initialization(&ctx->prm.secret_key, &ctx->prm.secret_key); + + rte_spinlock_init(&ctx->dev_lock); + rte_spinlock_init(&ctx->bhash_lock[TLE_V4]); + rte_spinlock_init(&ctx->bhash_lock[TLE_V6]); + return ctx; } @@ -137,6 +158,8 @@ tle_ctx_destroy(struct tle_ctx *ctx) return; } + bhash_fini(ctx); + for (i = 0; i != RTE_DIM(ctx->dev); i++) tle_del_dev(ctx->dev + i); @@ -150,37 +173,6 @@ tle_ctx_invalidate(struct tle_ctx *ctx) RTE_SET_USED(ctx); } -static void -fill_pbm(struct tle_pbm *pbm, const struct tle_bl_port *blp) -{ - uint32_t i; - - for (i = 0; i != blp->nb_port; i++) - tle_pbm_set(pbm, blp->port[i]); -} - -static int -init_dev_proto(struct tle_dev *dev, uint32_t idx, int32_t socket_id, - const struct tle_bl_port *blp) -{ - size_t sz; - - sz = sizeof(*dev->dp[idx]); - dev->dp[idx] = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE, - socket_id); - - if (dev->dp[idx] == NULL) { - UDP_LOG(ERR, "allocation of %zu bytes on " - "socket %d for %u-th device failed\n", - sz, socket_id, idx); - return ENOMEM; - } - - tle_pbm_init(&dev->dp[idx]->use, LPORT_START_BLK); - fill_pbm(&dev->dp[idx]->use, blp); - return 0; -} - static struct tle_dev * find_free_dev(struct tle_ctx *ctx) { @@ -214,27 +206,8 @@ tle_add_dev(struct tle_ctx *ctx, const struct tle_dev_param *dev_prm) return NULL; rc = 0; - /* device can handle IPv4 traffic */ - if (dev_prm->local_addr4.s_addr != INADDR_ANY) { - rc = init_dev_proto(dev, TLE_V4, ctx->prm.socket_id, - &dev_prm->bl4); - if (rc == 0) - fill_pbm(&ctx->use[TLE_V4], &dev_prm->bl4); - } - - /* device can handle IPv6 traffic */ - if (rc == 0 && memcmp(&dev_prm->local_addr6, &tle_ipv6_any, - sizeof(tle_ipv6_any)) != 0) { - rc = init_dev_proto(dev, TLE_V6, ctx->prm.socket_id, - &dev_prm->bl6); - if (rc == 0) - fill_pbm(&ctx->use[TLE_V6], &dev_prm->bl6); - } - if (rc != 0) { /* cleanup and return an error. 
*/ - rte_free(dev->dp[TLE_V4]); - rte_free(dev->dp[TLE_V6]); rte_errno = rc; return NULL; } @@ -246,16 +219,19 @@ tle_add_dev(struct tle_ctx *ctx, const struct tle_dev_param *dev_prm) if ((dev_prm->tx_offload & DEV_TX_OFFLOAD_UDP_CKSUM) != 0 && ctx->prm.proto == TLE_PROTO_UDP) { - dev->tx.ol_flags[TLE_V4] |= PKT_TX_IPV4 | PKT_TX_UDP_CKSUM; - dev->tx.ol_flags[TLE_V6] |= PKT_TX_IPV6 | PKT_TX_UDP_CKSUM; + dev->tx.ol_flags[TLE_V4] |= PKT_TX_UDP_CKSUM; + dev->tx.ol_flags[TLE_V6] |= PKT_TX_UDP_CKSUM; } else if ((dev_prm->tx_offload & DEV_TX_OFFLOAD_TCP_CKSUM) != 0 && ctx->prm.proto == TLE_PROTO_TCP) { - dev->tx.ol_flags[TLE_V4] |= PKT_TX_IPV4 | PKT_TX_TCP_CKSUM; - dev->tx.ol_flags[TLE_V6] |= PKT_TX_IPV6 | PKT_TX_TCP_CKSUM; + dev->tx.ol_flags[TLE_V4] |= PKT_TX_TCP_CKSUM; + dev->tx.ol_flags[TLE_V6] |= PKT_TX_TCP_CKSUM; } if ((dev_prm->tx_offload & DEV_TX_OFFLOAD_IPV4_CKSUM) != 0) - dev->tx.ol_flags[TLE_V4] |= PKT_TX_IPV4 | PKT_TX_IP_CKSUM; + dev->tx.ol_flags[TLE_V4] |= PKT_TX_IP_CKSUM; + + dev->tx.ol_flags[TLE_V4] |= PKT_TX_IPV4; + dev->tx.ol_flags[TLE_V6] |= PKT_TX_IPV6; dev->prm = *dev_prm; dev->ctx = ctx; @@ -300,220 +276,97 @@ tle_del_dev(struct tle_dev *dev) ctx = dev->ctx; p = dev - ctx->dev; - if (p >= RTE_DIM(ctx->dev) || - (dev->dp[TLE_V4] == NULL && - dev->dp[TLE_V6] == NULL)) + if (p >= RTE_DIM(ctx->dev)) return -EINVAL; /* emtpy TX queues. 
*/ empty_dring(&dev->tx.dr, ctx->prm.proto); - rte_free(dev->dp[TLE_V4]); - rte_free(dev->dp[TLE_V6]); memset(dev, 0, sizeof(*dev)); ctx->nb_dev--; return 0; } -static struct tle_dev * -find_ipv4_dev(struct tle_ctx *ctx, const struct in_addr *addr) -{ - uint32_t i; - - for (i = 0; i != RTE_DIM(ctx->dev); i++) { - if (ctx->dev[i].prm.local_addr4.s_addr == addr->s_addr && - ctx->dev[i].dp[TLE_V4] != NULL) - return ctx->dev + i; - } - - return NULL; -} - -static struct tle_dev * -find_ipv6_dev(struct tle_ctx *ctx, const struct in6_addr *addr) +int +stream_fill_ctx(struct tle_ctx *ctx, struct tle_stream *s, + const struct sockaddr *laddr, const struct sockaddr *raddr) { - uint32_t i; + struct sockaddr_storage addr; + int32_t rc = 0; - for (i = 0; i != RTE_DIM(ctx->dev); i++) { - if (memcmp(&ctx->dev[i].prm.local_addr6, addr, - sizeof(*addr)) == 0 && - ctx->dev[i].dp[TLE_V6] != NULL) - return ctx->dev + i; + if (laddr->sa_family == AF_INET) { + s->type = TLE_V4; + } else if (laddr->sa_family == AF_INET6) { + s->type = TLE_V6; } - return NULL; -} - -static int -stream_fill_dev(struct tle_ctx *ctx, struct tle_stream *s, - const struct sockaddr *addr) -{ - struct tle_dev *dev; - struct tle_pbm *pbm; - const struct sockaddr_in *lin4; - const struct sockaddr_in6 *lin6; - uint32_t i, p, sp, t; - - if (addr->sa_family == AF_INET) { - lin4 = (const struct sockaddr_in *)addr; - t = TLE_V4; - p = lin4->sin_port; - } else if (addr->sa_family == AF_INET6) { - lin6 = (const struct sockaddr_in6 *)addr; - t = TLE_V6; - p = lin6->sin6_port; - } else - return EINVAL; - + uint16_t p = ((const struct sockaddr_in *)laddr)->sin_port; p = ntohs(p); - - /* if local address is not wildcard, find device it belongs to. 
*/ - if (t == TLE_V4 && lin4->sin_addr.s_addr != INADDR_ANY) { - dev = find_ipv4_dev(ctx, &lin4->sin_addr); - if (dev == NULL) - return ENODEV; - } else if (t == TLE_V6 && memcmp(&tle_ipv6_any, &lin6->sin6_addr, - sizeof(tle_ipv6_any)) != 0) { - dev = find_ipv6_dev(ctx, &lin6->sin6_addr); - if (dev == NULL) - return ENODEV; - } else - dev = NULL; - - if (dev != NULL) - pbm = &dev->dp[t]->use; - else - pbm = &ctx->use[t]; - + struct tle_psm *psm = &ctx->use[s->type]; /* try to acquire local port number. */ + rte_spinlock_lock(&ctx->dev_lock); if (p == 0) { - p = tle_pbm_find_range(pbm, pbm->blk, LPORT_END_BLK); - if (p == 0 && pbm->blk > LPORT_START_BLK) - p = tle_pbm_find_range(pbm, LPORT_START_BLK, pbm->blk); - } else if (tle_pbm_check(pbm, p) != 0) - return EEXIST; - - if (p == 0) - return ENFILE; - - /* fill socket's dst port and type */ - - sp = htons(p); - s->type = t; - s->port.dst = sp; - - /* mark port as in-use */ - - tle_pbm_set(&ctx->use[t], p); - if (dev != NULL) { - tle_pbm_set(pbm, p); - dev->dp[t]->streams[sp] = s; - } else { - for (i = 0; i != RTE_DIM(ctx->dev); i++) { - if (ctx->dev[i].dp[t] != NULL) { - tle_pbm_set(&ctx->dev[i].dp[t]->use, p); - ctx->dev[i].dp[t]->streams[sp] = s; - } + if (s->type == TLE_V6 && is_empty_addr(laddr) && !s->option.ipv6only) + p = tle_psm_alloc_dual_port(&ctx->use[TLE_V4], psm); + else + p = tle_psm_alloc_port(psm); + if (p == 0) { + rte_spinlock_unlock(&ctx->dev_lock); + return ENFILE; } + rte_memcpy(&addr, laddr, sizeof(struct sockaddr_storage)); + ((struct sockaddr_in *)&addr)->sin_port = htons(p); + laddr = (const struct sockaddr*)&addr; } - return 0; -} + if (tle_psm_set(psm, p, s->option.reuseport) != 0) { + rte_spinlock_unlock(&ctx->dev_lock); + return EADDRINUSE; + } -static int -stream_clear_dev(struct tle_ctx *ctx, const struct tle_stream *s) -{ - struct tle_dev *dev; - uint32_t i, p, sp, t; - - t = s->type; - sp = s->port.dst; - p = ntohs(sp); - - /* if local address is not wildcard, find device it belongs 
to. */ - if (t == TLE_V4 && s->ipv4.addr.dst != INADDR_ANY) { - dev = find_ipv4_dev(ctx, - (const struct in_addr *)&s->ipv4.addr.dst); - if (dev == NULL) - return ENODEV; - } else if (t == TLE_V6 && memcmp(&tle_ipv6_any, &s->ipv6.addr.dst, - sizeof(tle_ipv6_any)) != 0) { - dev = find_ipv6_dev(ctx, - (const struct in6_addr *)&s->ipv6.addr.dst); - if (dev == NULL) - return ENODEV; - } else - dev = NULL; - - tle_pbm_clear(&ctx->use[t], p); - if (dev != NULL) { - if (dev->dp[t]->streams[sp] == s) { - tle_pbm_clear(&dev->dp[t]->use, p); - dev->dp[t]->streams[sp] = NULL; - } - } else { - for (i = 0; i != RTE_DIM(ctx->dev); i++) { - if (ctx->dev[i].dp[t] != NULL && - ctx->dev[i].dp[t]->streams[sp] == s) { - tle_pbm_clear(&ctx->dev[i].dp[t]->use, p); - ctx->dev[i].dp[t]->streams[sp] = NULL; + if (is_empty_addr(laddr)) { + if (s->type == TLE_V6 && !s->option.ipv6only) { + rc = tle_psm_set(&ctx->use[TLE_V4], p, s->option.reuseport); + if (rc != 0) { + tle_psm_clear(psm, p); + rte_spinlock_unlock(&ctx->dev_lock); + return EADDRINUSE; } } } - return 0; -} - -static void -fill_ipv4_am(const struct sockaddr_in *in, uint32_t *addr, uint32_t *mask) -{ - *addr = in->sin_addr.s_addr; - *mask = (*addr == INADDR_ANY) ? 
INADDR_ANY : INADDR_NONE; -} + if (is_empty_addr(raddr)) + rc = bhash_add_entry(ctx, laddr, s); -static void -fill_ipv6_am(const struct sockaddr_in6 *in, rte_xmm_t *addr, rte_xmm_t *mask) -{ - const struct in6_addr *pm; - - memcpy(addr, &in->sin6_addr, sizeof(*addr)); - if (memcmp(&tle_ipv6_any, addr, sizeof(*addr)) == 0) - pm = &tle_ipv6_any; - else - pm = &tle_ipv6_none; - - memcpy(mask, pm, sizeof(*mask)); -} + if (rc) { + tle_psm_clear(psm, p); + } -int -stream_fill_ctx(struct tle_ctx *ctx, struct tle_stream *s, - const struct sockaddr *laddr, const struct sockaddr *raddr) -{ - const struct sockaddr_in *rin; - int32_t rc; + rte_spinlock_unlock(&ctx->dev_lock); + /* fill socket's dst (src actually) port */ + s->port.dst = htons(p); - /* setup ports and port mask fields (except dst port). */ - rin = (const struct sockaddr_in *)raddr; - s->port.src = rin->sin_port; - s->pmsk.src = (s->port.src == 0) ? 0 : UINT16_MAX; - s->pmsk.dst = UINT16_MAX; + if (rc) + return rc; - /* setup src and dst addresses. */ + /* setup src, dst addresses, and src port. */ if (laddr->sa_family == AF_INET) { fill_ipv4_am((const struct sockaddr_in *)laddr, &s->ipv4.addr.dst, &s->ipv4.mask.dst); fill_ipv4_am((const struct sockaddr_in *)raddr, &s->ipv4.addr.src, &s->ipv4.mask.src); + s->port.src = ((const struct sockaddr_in *)raddr)->sin_port; } else if (laddr->sa_family == AF_INET6) { fill_ipv6_am((const struct sockaddr_in6 *)laddr, &s->ipv6.addr.dst, &s->ipv6.mask.dst); fill_ipv6_am((const struct sockaddr_in6 *)raddr, &s->ipv6.addr.src, &s->ipv6.mask.src); + s->port.src = ((const struct sockaddr_in6 *)raddr)->sin6_port; } - rte_spinlock_lock(&ctx->dev_lock); - rc = stream_fill_dev(ctx, s, laddr); - rte_spinlock_unlock(&ctx->dev_lock); + /* setup port mask fields. */ + s->pmsk.src = (s->port.src == 0) ? 
0 : UINT16_MAX; + s->pmsk.dst = UINT16_MAX; return rc; } @@ -522,11 +375,41 @@ stream_fill_ctx(struct tle_ctx *ctx, struct tle_stream *s, int stream_clear_ctx(struct tle_ctx *ctx, struct tle_stream *s) { - int32_t rc; + bool is_any = false; + struct sockaddr_storage addr; + struct sockaddr_in *addr4; + struct sockaddr_in6 *addr6; + + if (s->type == TLE_V4) { + if (s->ipv4.addr.src == INADDR_ANY) { + is_any = true; + addr4 = (struct sockaddr_in *)&addr; + addr4->sin_addr.s_addr = s->ipv4.addr.dst; + addr4->sin_port = s->port.dst; + addr.ss_family = AF_INET; + bhash_del_entry(ctx, s, (struct sockaddr*)&addr); + } + } else { + if (IN6_IS_ADDR_UNSPECIFIED(&s->ipv6.addr.src)) { + is_any = true; + addr6 = (struct sockaddr_in6 *)&addr; + memcpy(&addr6->sin6_addr, &s->ipv6.addr.dst, + sizeof(tle_ipv6_any)); + addr6->sin6_port = s->port.dst; + addr.ss_family = AF_INET6; + bhash_del_entry(ctx, s, (struct sockaddr*)&addr); + } + } rte_spinlock_lock(&ctx->dev_lock); - rc = stream_clear_dev(ctx, s); + /* strange behaviour to match linux stack */ + if (is_any) { + if (s->type == TLE_V6 && !s->option.ipv6only) + tle_psm_clear(&ctx->use[TLE_V4], ntohs(s->port.dst)); + } + + tle_psm_clear(&ctx->use[s->type], ntohs(s->port.dst)); rte_spinlock_unlock(&ctx->dev_lock); - return rc; + return 0; } diff --git a/lib/libtle_l4p/ctx.h b/lib/libtle_l4p/ctx.h index f18060b..9483976 100644 --- a/lib/libtle_l4p/ctx.h +++ b/lib/libtle_l4p/ctx.h @@ -21,7 +21,7 @@ #include #include -#include "port_bitmap.h" +#include "port_statmap.h" #include "osdep.h" #include "net_misc.h" @@ -29,11 +29,6 @@ extern "C" { #endif -struct tle_dport { - struct tle_pbm use; /* ports in use. */ - struct tle_stream *streams[MAX_PORT_NUM]; /* port to stream. */ -}; - struct tle_dev { struct tle_ctx *ctx; struct { @@ -45,7 +40,6 @@ struct tle_dev { struct tle_dring dr; } tx; struct tle_dev_param prm; /* copy of device parameters. 
*/ - struct tle_dport *dp[TLE_VNUM]; /* device L4 ports */ }; struct tle_ctx { @@ -54,18 +48,23 @@ struct tle_ctx { struct { rte_spinlock_t lock; uint32_t nb_free; /* number of free streams. */ + uint32_t nb_cur; /* number of allocated streams. */ STAILQ_HEAD(, tle_stream) free; void *buf; /* space allocated for streams */ } streams; - rte_spinlock_t dev_lock; + rte_spinlock_t bhash_lock[TLE_VNUM]; + struct rte_hash *bhash[TLE_VNUM]; /* bind and listen hash table */ + uint32_t nb_dev; - struct tle_pbm use[TLE_VNUM]; /* all ports in use. */ + rte_spinlock_t dev_lock; + struct tle_psm use[TLE_VNUM]; /* all ports in use. */ struct tle_dev dev[RTE_MAX_ETHPORTS]; }; struct stream_ops { int (*init_streams)(struct tle_ctx *); + uint32_t (*more_streams)(struct tle_ctx *); void (*fini_streams)(struct tle_ctx *); void (*free_drbs)(struct tle_stream *, struct tle_drb *[], uint32_t); }; @@ -77,6 +76,27 @@ int stream_fill_ctx(struct tle_ctx *ctx, struct tle_stream *s, int stream_clear_ctx(struct tle_ctx *ctx, struct tle_stream *s); +static inline void +fill_ipv4_am(const struct sockaddr_in *in, uint32_t *addr, uint32_t *mask) +{ + *addr = in->sin_addr.s_addr; + *mask = (*addr == INADDR_ANY) ? 
INADDR_ANY : INADDR_NONE; +} + +static inline void +fill_ipv6_am(const struct sockaddr_in6 *in, rte_xmm_t *addr, rte_xmm_t *mask) +{ + const struct in6_addr *pm; + + memcpy(addr, &in->sin6_addr, sizeof(*addr)); + if (IN6_IS_ADDR_UNSPECIFIED(addr)) + pm = &tle_ipv6_any; + else + pm = &tle_ipv6_none; + + memcpy(mask, pm, sizeof(*mask)); +} + #ifdef __cplusplus } #endif diff --git a/lib/libtle_l4p/misc.h b/lib/libtle_l4p/misc.h index 327296f..d39e5a1 100644 --- a/lib/libtle_l4p/misc.h +++ b/lib/libtle_l4p/misc.h @@ -16,12 +16,34 @@ #ifndef _MISC_H_ #define _MISC_H_ +#include #include #ifdef __cplusplus extern "C" { #endif +union typflg { + uint16_t raw; + struct { + uint8_t type; /* TLE_V4/TLE_V6 */ + uint8_t flags; /* TCP header flags */ + }; +}; + +union pkt_info { + rte_xmm_t raw; + struct { + union typflg tf; + uint16_t csf; /* checksum flags */ + union l4_ports port; + union { + union ipv4_addrs addr4; + const union ipv6_addrs *addr6; + }; + }; +}; + static inline int xmm_cmp(const rte_xmm_t *da, const rte_xmm_t *sa) { @@ -286,43 +308,41 @@ _ipv4x_cksum(const void *iph, size_t len) return (cksum == 0xffff) ? cksum : ~cksum; } -/* - * helper function to check csum. - */ static inline int -check_pkt_csum(const struct rte_mbuf *m, uint64_t ol_flags, uint32_t type, - uint32_t proto) +check_pkt_csum(const struct rte_mbuf *m, uint32_t type, uint32_t proto) { const struct ipv4_hdr *l3h4; const struct ipv6_hdr *l3h6; const struct udp_hdr *l4h; - uint64_t fl3, fl4; - uint16_t csum; int32_t ret; - - fl4 = ol_flags & PKT_RX_L4_CKSUM_MASK; - fl3 = (type == TLE_V4) ? 
- (ol_flags & PKT_RX_IP_CKSUM_MASK) : PKT_RX_IP_CKSUM_GOOD; + uint16_t csum; + uint64_t ol_flags = m->ol_flags; /* case 0: both ip and l4 cksum is verified or data is valid */ - if ((fl3 | fl4) == (PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD)) + if ((ol_flags & PKT_RX_IP_CKSUM_GOOD) && + (ol_flags & PKT_RX_L4_CKSUM_GOOD)) return 0; /* case 1: either ip or l4 cksum bad */ - if (fl3 == PKT_RX_IP_CKSUM_BAD || fl4 == PKT_RX_L4_CKSUM_BAD) + if ((ol_flags & PKT_RX_IP_CKSUM_MASK) == PKT_RX_IP_CKSUM_BAD) + return 1; + + if ((ol_flags & PKT_RX_L4_CKSUM_MASK) == PKT_RX_L4_CKSUM_BAD) return 1; /* case 2: either ip or l4 or both cksum is unknown */ + ret = 0; l3h4 = rte_pktmbuf_mtod_offset(m, const struct ipv4_hdr *, m->l2_len); l3h6 = rte_pktmbuf_mtod_offset(m, const struct ipv6_hdr *, m->l2_len); - ret = 0; - if (fl3 == PKT_RX_IP_CKSUM_UNKNOWN && l3h4->hdr_checksum != 0) { + if ((ol_flags & PKT_RX_IP_CKSUM_MASK) == PKT_RX_IP_CKSUM_UNKNOWN && + l3h4->hdr_checksum != 0) { csum = _ipv4x_cksum(l3h4, m->l3_len); ret = (csum != UINT16_MAX); } - if (ret == 0 && fl4 == PKT_RX_L4_CKSUM_UNKNOWN) { + if (ret == 0 && (ol_flags & PKT_RX_L4_CKSUM_MASK) == + PKT_RX_L4_CKSUM_UNKNOWN) { /* * for IPv4 it is allowed to have zero UDP cksum, @@ -376,8 +396,20 @@ rwl_acquire(rte_atomic32_t *p) static inline void rwl_down(rte_atomic32_t *p) { - while (rte_atomic32_cmpset((volatile uint32_t *)p, 0, INT32_MIN) == 0) + while (rte_atomic32_cmpset((volatile uint32_t *)p, 0, INT32_MIN) == 0) + rte_pause(); +} + +static inline int +rwl_try_down(rte_atomic32_t *p) +{ + while (rte_atomic32_cmpset((volatile uint32_t *)p, 0, INT32_MIN) == 0) { + /* Already down */ + if (rte_atomic32_read(p) == INT32_MIN) + return -1; rte_pause(); + } + return 0; } static inline void diff --git a/lib/libtle_l4p/net_misc.h b/lib/libtle_l4p/net_misc.h index 2d8dac2..c1d946b 100644 --- a/lib/libtle_l4p/net_misc.h +++ b/lib/libtle_l4p/net_misc.h @@ -16,6 +16,7 @@ #ifndef _NET_MISC_H_ #define _NET_MISC_H_ +#include #include 
#include #include "osdep.h" @@ -71,6 +72,26 @@ union ip_addrs { union ipv6_addrs v6; }; +static inline bool +is_empty_addr(const struct sockaddr *addr) +{ + bool any = false; + const struct sockaddr_in *in4; + const struct sockaddr_in6 *in6; + + if (addr->sa_family == AF_INET) { + in4 = (const struct sockaddr_in *)addr; + if (in4->sin_addr.s_addr == INADDR_ANY) + any = true; + } else if (addr->sa_family == AF_INET6) { + in6 = (const struct sockaddr_in6 *)addr; + if (IN6_IS_ADDR_UNSPECIFIED(&in6->sin6_addr)) + any = true; + } + + return any; +} + #ifdef __cplusplus } #endif diff --git a/lib/libtle_l4p/port_statmap.h b/lib/libtle_l4p/port_statmap.h new file mode 100644 index 0000000..8bbb0ba --- /dev/null +++ b/lib/libtle_l4p/port_statmap.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2019 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _PORT_STATMAP_H_ +#define _PORT_STATMAP_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_PORT_NUM (UINT16_MAX + 1) +#define ALLOC_PORT_START 0x8000 + +struct tle_psm { + uint32_t nb_used; /* Number of ports already in use. */ + uint32_t next_alloc; /* Next port to try allocate. */ + uint8_t stat[MAX_PORT_NUM]; /* Status of the port: + * 1) the most significant bit indicates + * if SO_REUSEPORT is allowed; + * 2) lowest 7 bits indicate # of streams + * using the port. 
+ */ +}; + +static inline void +tle_psm_init(struct tle_psm *psm) +{ + memset(psm, 0, sizeof(struct tle_psm)); + psm->next_alloc = ALLOC_PORT_START; +} + +static inline int +tle_psm_set(struct tle_psm *psm, uint16_t port, uint8_t reuseport) +{ + if (psm->stat[port] == 0) { + /* port has not been used */ + psm->stat[port]++; + if (reuseport) + psm->stat[port] |= 0x80; + } else { + /* port is used by some socket */ + if (reuseport && (psm->stat[port] & 0x80)) { + /* all sockets set reuseport */ + psm->stat[port]++; + } else + return -1; + } + + return 0; +} + +static inline void +tle_psm_clear(struct tle_psm *psm, uint16_t port) +{ + psm->stat[port]--; + if ((psm->stat[port] & 0x7f) == 0) + psm->stat[port] = 0; +} + + +static inline uint8_t +tle_psm_check(const struct tle_psm *psm, uint16_t port) +{ + return psm->stat[port]; +} + +static inline uint16_t +tle_psm_alloc_port(struct tle_psm *psm) +{ + uint32_t i = psm->next_alloc; + + for (; i < MAX_PORT_NUM; i++) { + if (psm->stat[i] == 0) { + psm->next_alloc = i + 1; + return (uint16_t)i; + } + } + + for (i = ALLOC_PORT_START; i < psm->next_alloc; i++) { + if (psm->stat[i] == 0) { + psm->next_alloc = i + 1; + return (uint16_t)i; + } + } + + return 0; +} + +static inline uint16_t +tle_psm_alloc_dual_port(struct tle_psm *psm4, struct tle_psm *psm6) +{ + uint32_t i = psm6->next_alloc; + + for (; i < MAX_PORT_NUM; i++) { + if (psm6->stat[i] == 0 && psm4->stat[i] == 0) { + psm6->next_alloc = i + 1; + return (uint16_t)i; + } + } + + for (i = ALLOC_PORT_START; i < psm6->next_alloc; i++) { + if (psm6->stat[i] == 0 && psm4->stat[i] == 0) { + psm6->next_alloc = i + 1; + return (uint16_t)i; + } + } + + return 0; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _PORT_STATMAP_H_ */ diff --git a/lib/libtle_l4p/stream.h b/lib/libtle_l4p/stream.h index 49a2809..9f2bbc1 100644 --- a/lib/libtle_l4p/stream.h +++ b/lib/libtle_l4p/stream.h @@ -31,7 +31,11 @@ struct tle_stream { STAILQ_ENTRY(tle_stream) link; struct tle_ctx *ctx; - uint8_t 
type; /* TLE_V4 or TLE_V6 */ + tle_stream_options_t option; + unsigned long timestamp; + uint16_t reuseport_seed; + uint8_t type; /* TLE_V4 or TLE_V6 */ + uint8_t padding; /* Stream address information. */ union l4_ports port; @@ -53,15 +57,25 @@ static inline uint32_t get_streams(struct tle_ctx *ctx, struct tle_stream *s[], uint32_t num) { struct tle_stream *p; - uint32_t i, n; + uint32_t i, n, inc; rte_spinlock_lock(&ctx->streams.lock); - n = RTE_MIN(ctx->streams.nb_free, num); - for (i = 0, p = STAILQ_FIRST(&ctx->streams.free); - i != n; - i++, p = STAILQ_NEXT(p, link)) + n = ctx->streams.nb_free; + if (n < num) { + inc = tle_stream_ops[ctx->prm.proto].more_streams(ctx); + ctx->streams.nb_free += inc; + ctx->streams.nb_cur += inc; + n = ctx->streams.nb_free; + } + n = RTE_MIN(n, num); + + for (i = 0, p = STAILQ_FIRST(&ctx->streams.free); i != n; ) { s[i] = p; + p = STAILQ_NEXT(p, link); + s[i]->link.stqe_next = NULL; + i++; + } if (p == NULL) /* we retrieved all free entries */ @@ -80,9 +94,6 @@ get_stream(struct tle_ctx *ctx) struct tle_stream *s; s = NULL; - if (ctx->streams.nb_free == 0) - return s; - get_streams(ctx, &s, 1); return s; } @@ -120,8 +131,8 @@ drb_nb_elem(const struct tle_ctx *ctx) } static inline int32_t -stream_get_dest(struct tle_stream *s, const void *dst_addr, - struct tle_dest *dst) +stream_get_dest(uint8_t type, struct tle_stream *s, const void *src_addr, + const void *dst_addr, struct tle_dest *dst) { int32_t rc; const struct in_addr *d4; @@ -133,12 +144,13 @@ stream_get_dest(struct tle_stream *s, const void *dst_addr, /* it is here just to keep gcc happy. */ d4 = NULL; + /* it is here just to keep gcc happy. 
*/ d6 = NULL; - if (s->type == TLE_V4) { + if (type == TLE_V4) { d4 = dst_addr; rc = ctx->prm.lookup4(ctx->prm.lookup4_data, d4, dst); - } else if (s->type == TLE_V6) { + } else if (type == TLE_V6) { d6 = dst_addr; rc = ctx->prm.lookup6(ctx->prm.lookup6_data, d6, dst); } else @@ -148,18 +160,25 @@ stream_get_dest(struct tle_stream *s, const void *dst_addr, return -ENOENT; dev = dst->dev; - dst->ol_flags = dev->tx.ol_flags[s->type]; + dst->ol_flags = dev->tx.ol_flags[type]; - if (s->type == TLE_V4) { + if (type == TLE_V4) { struct ipv4_hdr *l3h; l3h = (struct ipv4_hdr *)(dst->hdr + dst->l2_len); - l3h->src_addr = dev->prm.local_addr4.s_addr; + if (((const struct in_addr*)src_addr)->s_addr != INADDR_ANY) + l3h->src_addr = ((const struct in_addr*)src_addr)->s_addr; + else + l3h->src_addr = dev->prm.local_addr4.s_addr; l3h->dst_addr = d4->s_addr; } else { struct ipv6_hdr *l3h; l3h = (struct ipv6_hdr *)(dst->hdr + dst->l2_len); - rte_memcpy(l3h->src_addr, &dev->prm.local_addr6, - sizeof(l3h->src_addr)); + if (!IN6_IS_ADDR_UNSPECIFIED(src_addr)) + rte_memcpy(l3h->src_addr, src_addr, + sizeof(l3h->src_addr)); + else + rte_memcpy(l3h->src_addr, &dev->prm.local_addr6, + sizeof(l3h->src_addr)); rte_memcpy(l3h->dst_addr, d6, sizeof(l3h->dst_addr)); } diff --git a/lib/libtle_l4p/stream_table.c b/lib/libtle_l4p/stream_table.c index 5a89553..e029306 100644 --- a/lib/libtle_l4p/stream_table.c +++ b/lib/libtle_l4p/stream_table.c @@ -13,68 +13,47 @@ * limitations under the License. 
*/ #include -#include #include #include "stream_table.h" void -stbl_fini(struct stbl *st) +bhash_fini(struct tle_ctx *ctx) { uint32_t i; - for (i = 0; i != RTE_DIM(st->ht); i++) { - rte_hash_free(st->ht[i].t); - rte_free(st->ht[i].ent); - } - - memset(st, 0, sizeof(*st)); + for (i = 0; i != RTE_DIM(ctx->bhash); i++) + rte_hash_free(ctx->bhash[i]); } int -stbl_init(struct stbl *st, uint32_t num, int32_t socket) +bhash_init(struct tle_ctx *ctx) { - int32_t rc; - size_t i, sz; - struct rte_hash_parameters hprm; + int rc = 0; + struct rte_hash_parameters hprm = {0}; + bool ipv6 = ctx->prm.lookup6 != NULL; char buf[RTE_HASH_NAMESIZE]; - num = RTE_MAX(5 * num / 4, 0x10U); - - memset(&hprm, 0, sizeof(hprm)); hprm.name = buf; - hprm.entries = num; - hprm.socket_id = socket; - - rc = 0; - - snprintf(buf, sizeof(buf), "stbl4@%p", st); - hprm.key_len = sizeof(struct stbl4_key); - st->ht[TLE_V4].t = rte_hash_create(&hprm); - if (st->ht[TLE_V4].t == NULL) + hprm.entries = 4096; + hprm.extra_flag = RTE_HASH_EXTRA_FLAGS_EXT_TABLE; + hprm.socket_id = ctx->prm.socket_id; + + snprintf(buf, sizeof(buf), "bhash4@%p", ctx); + hprm.key_len = sizeof(struct bhash4_key); + ctx->bhash[TLE_V4] = rte_hash_create(&hprm); + if (ctx->bhash[TLE_V4] == NULL) rc = (rte_errno != 0) ? -rte_errno : -ENOMEM; - if (rc == 0) { - snprintf(buf, sizeof(buf), "stbl6@%p", st); - hprm.key_len = sizeof(struct stbl6_key); - st->ht[TLE_V6].t = rte_hash_create(&hprm); - if (st->ht[TLE_V6].t == NULL) + if (rc == 0 && ipv6) { + snprintf(buf, sizeof(buf), "bhash6@%p", ctx); + hprm.key_len = sizeof(struct bhash6_key); + ctx->bhash[TLE_V6] = rte_hash_create(&hprm); + if (ctx->bhash[TLE_V6] == NULL) { + rte_hash_free(ctx->bhash[TLE_V4]); rc = (rte_errno != 0) ? 
-rte_errno : -ENOMEM; + } } - for (i = 0; i != RTE_DIM(st->ht) && rc == 0; i++) { - - sz = sizeof(*st->ht[i].ent) * num; - st->ht[i].ent = rte_zmalloc_socket(NULL, sz, - RTE_CACHE_LINE_SIZE, socket); - if (st->ht[i].ent == NULL) - rc = -ENOMEM; - else - st->ht[i].nb_ent = num; - } - - if (rc != 0) - stbl_fini(st); - return rc; } diff --git a/lib/libtle_l4p/stream_table.h b/lib/libtle_l4p/stream_table.h index 033c306..ba8d165 100644 --- a/lib/libtle_l4p/stream_table.h +++ b/lib/libtle_l4p/stream_table.h @@ -16,199 +16,415 @@ #ifndef _STREAM_TABLE_H_ #define _STREAM_TABLE_H_ +#include #include -#include "tcp_misc.h" +#include "stream.h" +#include "misc.h" #ifdef __cplusplus extern "C" { #endif +#define HASH_SIZE_32K 32771 +#define HASH_SIZE_64K 65537 +#define HASH_SIZE_128K 131071 + +#define HASH_SIZE HASH_SIZE_64K + struct stbl_entry { void *data; }; -struct shtbl { - uint32_t nb_ent; /* max number of entries in the table. */ - rte_spinlock_t l; /* lock to protect the hash table */ - struct rte_hash *t; - struct stbl_entry *ent; +struct stbl { + rte_spinlock_t l; + uint32_t need_lock; + struct stbl_entry head[HASH_SIZE]; } __rte_cache_aligned; -struct stbl { - struct shtbl ht[TLE_VNUM]; -}; +static inline int +stbl_init(struct stbl *st, uint32_t lock) +{ + st->need_lock = lock; + return 0; +} -struct stbl4_key { - union l4_ports port; - union ipv4_addrs addr; -} __attribute__((__packed__)); +static inline int +stbl_fini(struct stbl *st) +{ + st->need_lock = 0; + return 0; +} -struct stbl6_key { - union l4_ports port; - union ipv6_addrs addr; -} __attribute__((__packed__)); +static inline uint8_t +compare_pkt(const struct tle_stream *s, const union pkt_info *pi) +{ + if (s->type != pi->tf.type) + return -1; -struct stbl_key { - union l4_ports port; - union { - union ipv4_addrs addr4; - union ipv6_addrs addr6; - }; -} __attribute__((__packed__)); + if (s->port.raw != pi->port.raw) + return -1; -extern void stbl_fini(struct stbl *st); + if (s->type == TLE_V4) { + if 
(s->ipv4.addr.raw != pi->addr4.raw) + return -1; + } else { + if (memcmp(&s->ipv6.addr, pi->addr6, sizeof(union ipv6_addrs))) + return -1; + } -extern int stbl_init(struct stbl *st, uint32_t num, int32_t socket); + return 0; +} -static inline void -stbl_pkt_fill_key(struct stbl_key *k, const union pkt_info *pi, uint32_t type) +static inline uint32_t +stbl_hash_stream(const struct tle_stream *s) { - static const struct stbl_key zero = { - .port.raw = 0, - }; - - k->port = pi->port; - if (type == TLE_V4) - k->addr4 = pi->addr4; - else if (type == TLE_V6) - k->addr6 = *pi->addr6; - else - *k = zero; + int i; + unsigned int hash; + + if (s->type == TLE_V4) { + hash = s->ipv4.addr.src ^ s->ipv4.addr.dst + ^ s->port.src ^ s->port.dst; + } else { + hash = s->port.src ^ s->port.dst; + for (i = 0; i < 4; i++) { + hash ^= s->ipv6.addr.src.u32[i]; + hash ^= s->ipv6.addr.dst.u32[i]; + } + } + + return hash % HASH_SIZE; } -static inline void -stbl_lock(struct stbl *st, uint32_t type) +static inline uint32_t +stbl_hash_pkt(const union pkt_info* pi) { - rte_spinlock_lock(&st->ht[type].l); + int i; + unsigned int hash; + + if (pi->tf.type == TLE_V4) { + hash = pi->addr4.src ^ pi->addr4.dst ^ pi->port.src ^ pi->port.dst; + } else { + hash = pi->port.src ^ pi->port.dst; + for (i = 0; i < 4; i++) { + hash ^= pi->addr6->src.u32[i]; + hash ^= pi->addr6->dst.u32[i]; + } + } + + return hash % HASH_SIZE; } -static inline void -stbl_unlock(struct stbl *st, uint32_t type) +static inline struct stbl_entry* +stbl_add_stream(struct stbl *st, struct tle_stream *s) { - rte_spinlock_unlock(&st->ht[type].l); + struct stbl_entry* entry; + + if (st->need_lock) + rte_spinlock_lock(&st->l); + entry = &st->head[stbl_hash_stream(s)]; + s->link.stqe_next = (struct tle_stream*)entry->data; + entry->data = s; + if (st->need_lock) + rte_spinlock_unlock(&st->l); + + return entry; } -static inline struct stbl_entry * -stbl_add_entry(struct stbl *st, const union pkt_info *pi) +static inline struct tle_stream * 
+stbl_find_stream(struct stbl *st, const union pkt_info *pi) { - int32_t rc; - uint32_t type; - struct shtbl *ht; - struct stbl_key k; - - type = pi->tf.type; - stbl_pkt_fill_key(&k, pi, type); - ht = st->ht + type; - - rc = rte_hash_add_key(ht->t, &k); - if ((uint32_t)rc >= ht->nb_ent) - return NULL; - return ht->ent + rc; + struct tle_stream* head; + + if (st->need_lock) + rte_spinlock_lock(&st->l); + head = (struct tle_stream*)st->head[stbl_hash_pkt(pi)].data; + while (head != NULL) { + if (compare_pkt(head, pi) == 0) + break; + + head = head->link.stqe_next; + } + if (st->need_lock) + rte_spinlock_unlock(&st->l); + return head; } -static inline struct stbl_entry * -stbl_add_stream(struct stbl *st, const union pkt_info *pi, const void *s) +static inline void +stbl_del_stream(struct stbl *st, struct stbl_entry *se, + struct tle_stream *s) { - struct stbl_entry *se; + struct tle_stream *prev, *current; - se = stbl_add_entry(st, pi); - if (se != NULL) - se->data = (void *)(uintptr_t)s; - return se; + if (st->need_lock) + rte_spinlock_lock(&st->l); + if (se == NULL) + se = &st->head[stbl_hash_stream(s)]; + prev = NULL; + current = (struct tle_stream*)se->data; + while (current != NULL) { + if (current != s) { + prev = current; + current = current->link.stqe_next; + continue; + } + + if (prev) + prev->link.stqe_next = current->link.stqe_next; + else + se->data = current->link.stqe_next; + break; + } + if (st->need_lock) + rte_spinlock_unlock(&st->l); + + s->link.stqe_next = NULL; } -static inline struct stbl_entry * -stbl_find_entry(struct stbl *st, const union pkt_info *pi) +struct bhash4_key { + uint16_t port; + uint32_t addr; +} __attribute__((__packed__)); + +struct bhash6_key { + uint16_t port; + rte_xmm_t addr; +} __attribute__((__packed__)); + +struct bhash_key { + uint16_t port; + union { + uint32_t addr4; + rte_xmm_t addr6; + }; +} __attribute__((__packed__)); + +void bhash_fini(struct tle_ctx *ctx); + +int bhash_init(struct tle_ctx *ctx); + +static inline 
int +bhash_sockaddr2key(const struct sockaddr *addr, struct bhash_key *key) { - int32_t rc; - uint32_t type; - struct shtbl *ht; - struct stbl_key k; - - type = pi->tf.type; - stbl_pkt_fill_key(&k, pi, type); - ht = st->ht + type; - - rc = rte_hash_lookup(ht->t, &k); - if ((uint32_t)rc >= ht->nb_ent) - return NULL; - return ht->ent + rc; + int t; + const struct sockaddr_in *lin4; + const struct sockaddr_in6 *lin6; + + if (addr->sa_family == AF_INET) { + lin4 = (const struct sockaddr_in *)addr; + key->port = lin4->sin_port; + key->addr4 = lin4->sin_addr.s_addr; + t = TLE_V4; + } else { + lin6 = (const struct sockaddr_in6 *)addr; + memcpy(&key->addr6, &lin6->sin6_addr, sizeof(key->addr6)); + key->port = lin6->sin6_port; + t = TLE_V6; + } + + return t; } -static inline void * -stbl_find_data(struct stbl *st, const union pkt_info *pi) +/* Return 0 on success; + * Return errno on failure. + */ +static inline int +bhash_add_entry(struct tle_ctx *ctx, const struct sockaddr *addr, + struct tle_stream *s) { - struct stbl_entry *ent; - - ent = stbl_find_entry(st, pi); - return (ent == NULL) ? 
NULL : ent->data; + int t; + int rc; + int is_first; + struct bhash_key key; + struct rte_hash *bhash; + struct tle_stream *old, *tmp; + + is_first = 0; + t = bhash_sockaddr2key(addr, &key); + + rte_spinlock_lock(&ctx->bhash_lock[t]); + bhash = ctx->bhash[t]; + rc = rte_hash_lookup_data(bhash, &key, (void **)&old); + if (rc == -ENOENT) { + is_first = 1; + s->link.stqe_next = NULL; /* just to avoid follow */ + rc = rte_hash_add_key_data(bhash, &key, s); + } else if (rc >= 0) { + if (t == TLE_V4 && old->type == TLE_V6) { + /* V6 stream may listen V4 address, assure V4 stream + * is ahead of V6 stream in the list + */ + s->link.stqe_next = old; + rte_hash_add_key_data(bhash, &key, s); + } else { + tmp = old->link.stqe_next; + old->link.stqe_next = s; + s->link.stqe_next = tmp; + } + } + rte_spinlock_unlock(&ctx->bhash_lock[t]); + + /* IPv6 socket with unspecified address could receive IPv4 packets. + * So the stream should also be recorded in IPv4 table. + * Only the first stream need be inserted into V4 list, otherwise + * the V6 list is already following V4 list. + */ + if (t == TLE_V6 && !s->option.ipv6only && is_first && + IN6_IS_ADDR_UNSPECIFIED(&key.addr6)) { + t = TLE_V4; + rte_spinlock_lock(&ctx->bhash_lock[t]); + bhash = ctx->bhash[t]; + rc = rte_hash_lookup_data(bhash, &key, (void **)&old); + if (rc == -ENOENT) + rc = rte_hash_add_key_data(bhash, &key, s); + else if (rc >= 0) { + while(old->link.stqe_next != NULL) + old = old->link.stqe_next; + old->link.stqe_next = s; + s->link.stqe_next = NULL; + } + rte_spinlock_unlock(&ctx->bhash_lock[t]); + } + + return (rc >= 0) ? 
0 : (-rc); } -#include "tcp_stream.h" - static inline void -stbl_stream_fill_key(struct stbl_key *k, const struct tle_stream *s, - uint32_t type) +bhash_del_entry(struct tle_ctx *ctx, struct tle_stream *s, + const struct sockaddr *addr) { - static const struct stbl_key zero = { - .port.raw = 0, - }; + int t; + int rc; + struct bhash_key key; + struct tle_stream *f, *cur, *pre = NULL; + + t = bhash_sockaddr2key(addr, &key); + + rte_spinlock_lock(&ctx->bhash_lock[t]); + rc = rte_hash_lookup_data(ctx->bhash[t], &key, (void **)&f); + if (rc >= 0) { + cur = f; + pre = NULL; + while (cur != s) { + pre = cur; + cur = cur->link.stqe_next; + } + + if (pre == NULL) { + cur = cur->link.stqe_next; + if (cur == NULL) + rte_hash_del_key(ctx->bhash[t], &key); + else /* change data */ + rte_hash_add_key_data(ctx->bhash[t], &key, cur); + } else + pre->link.stqe_next = cur->link.stqe_next; + } + + rte_spinlock_unlock(&ctx->bhash_lock[t]); + + if (rc < 0) + return; + + s->link.stqe_next = NULL; + + /* IPv6 socket with unspecified address could receive IPv4 packets. 
+ * So the stream should also be recorded in IPv4 table*/ + if (t == TLE_V6 && !s->option.ipv6only && pre == NULL && + IN6_IS_ADDR_UNSPECIFIED(&key.addr6)) { + t = TLE_V4; + rte_spinlock_lock(&ctx->bhash_lock[t]); + rc = rte_hash_lookup_data(ctx->bhash[t], &key, (void **)&f); + if (rc >= 0) { + cur = f; + pre = NULL; + while (cur != s) { + pre = cur; + cur = cur->link.stqe_next; + } + + if (pre == NULL) { + cur = cur->link.stqe_next; + if (cur == NULL) + rte_hash_del_key(ctx->bhash[t], &key); + else /* change data */ + rte_hash_add_key_data(ctx->bhash[t], &key, cur); + } else + pre->link.stqe_next = cur->link.stqe_next; + } + + rte_spinlock_unlock(&ctx->bhash_lock[t]); + } - k->port = s->port; - if (type == TLE_V4) - k->addr4 = s->ipv4.addr; - else if (type == TLE_V6) - k->addr6 = s->ipv6.addr; - else - *k = zero; } -static inline struct stbl_entry * -stbl_add_stream_lock(struct stbl *st, const struct tle_tcp_stream *s) +static inline void * +bhash_reuseport_get_stream(struct tle_stream *s) { - uint32_t type; - struct stbl_key k; - struct stbl_entry *se; - struct shtbl *ht; - int32_t rc; - - type = s->s.type; - stbl_stream_fill_key(&k, &s->s, type); - ht = st->ht + type; + int n = 0; + struct tle_stream *e, *all[32]; + + e = s; + while(e && n < 32) { + all[n++] = e; + e = e->link.stqe_next; + } + + /* for each connection, this function will be called twice + * 1st time for the first handshake: SYN + * 2nd time for the third handshake: ACK + */ + return all[(s->reuseport_seed++) % n]; +} - stbl_lock(st, type); - rc = rte_hash_add_key(ht->t, &k); - stbl_unlock(st, type); +static inline void * +bhash_lookup4(struct rte_hash *t, uint32_t addr, uint16_t port, uint8_t reuse) +{ + int rc; + void *s = NULL; + struct bhash_key key = { + .port = port, + .addr4 = addr, + }; - if ((uint32_t)rc >= ht->nb_ent) - return NULL; + rc = rte_hash_lookup_data(t, &key, &s); + if (rc == -ENOENT) { + key.addr4 = INADDR_ANY; + rc = rte_hash_lookup_data(t, &key, &s); + } - se = ht->ent + 
rc; - if (se != NULL) - se->data = (void *)(uintptr_t)s; + if (rc >= 0) { + if (reuse) + return bhash_reuseport_get_stream(s); + else + return s; + } - return se; + return NULL; } -static inline void -stbl_del_stream(struct stbl *st, struct stbl_entry *se, - const struct tle_tcp_stream *s, uint32_t lock) +static inline void * +bhash_lookup6(struct rte_hash *t, rte_xmm_t addr, uint16_t port, uint8_t reuse) { - uint32_t type; - struct stbl_key k; + int rc; + void *s = NULL; + struct bhash_key key = { + .port = port, + .addr6 = addr, + }; - if (se == NULL) - return; + rc = rte_hash_lookup_data(t, &key, &s); + if (rc == -ENOENT) { + memcpy(&key.addr6, &tle_ipv6_any, sizeof(key.addr6)); + rc = rte_hash_lookup_data(t, &key, &s); + } - se->data = NULL; + if (rc >= 0) { + if (reuse) + return bhash_reuseport_get_stream(s); + else + return s; + } - type = s->s.type; - stbl_stream_fill_key(&k, &s->s, type); - if (lock != 0) - stbl_lock(st, type); - rte_hash_del_key(st->ht[type].t, &k); - if (lock != 0) - stbl_unlock(st, type); + return NULL; } #ifdef __cplusplus diff --git a/lib/libtle_l4p/syncookie.h b/lib/libtle_l4p/syncookie.h index 61bfce4..bf01e78 100644 --- a/lib/libtle_l4p/syncookie.h +++ b/lib/libtle_l4p/syncookie.h @@ -182,9 +182,12 @@ sync_fill_tcb(struct tcb *tcb, const union seg_info *si, const union tsopt *to) { uint32_t ack, mss, seq, wscale; + tcb->err = 0; + seq = si->seq; tcb->rcv.nxt = seq; + tcb->rcv.cpy = seq; tcb->rcv.irs = seq - 1; tcb->snd.wu.wl1 = seq; @@ -202,6 +205,7 @@ sync_fill_tcb(struct tcb *tcb, const union seg_info *si, const union tsopt *to) tcb->so.mss = mss; tcb->snd.ts = to->ecr; + tcb->snd.cork_ts = 0; tcb->rcv.ts = to->val; tcb->so.ts.raw = to->raw; diff --git a/lib/libtle_l4p/tcp_ctl.h b/lib/libtle_l4p/tcp_ctl.h index bec1e76..3196470 100644 --- a/lib/libtle_l4p/tcp_ctl.h +++ b/lib/libtle_l4p/tcp_ctl.h @@ -22,6 +22,7 @@ #include "tcp_stream.h" #include "tcp_ofo.h" +#include "tcp_timer.h" #ifdef __cplusplus extern "C" { @@ -97,10 +98,10 
@@ calc_rx_wnd(const struct tle_tcp_stream *s, uint32_t scale) /* peer doesn't support WSCALE option, wnd size is limited to 64K */ if (scale == TCP_WSCALE_NONE) { - wnd = _rte_ring_get_mask(s->rx.q) << TCP_WSCALE_DEFAULT; + wnd = rte_ring_free_count(s->rx.q) << TCP_WSCALE_DEFAULT; return RTE_MIN(wnd, (uint32_t)UINT16_MAX); } else - return _rte_ring_get_mask(s->rx.q) << scale; + return rte_ring_free_count(s->rx.q) << scale; } /* empty stream's send queue */ @@ -144,31 +145,34 @@ static inline void tcp_stream_reset(struct tle_ctx *ctx, struct tle_tcp_stream *s) { struct stbl *st; - uint16_t uop; + uint16_t state; + uint8_t i; st = CTX_TCP_STLB(ctx); - /* reset TX armed */ - rte_atomic32_set(&s->tx.arm, 0); + for (i = 0; i < TIMER_NUM; i++) + timer_stop(s, i); /* reset TCB */ - uop = s->tcb.uop & ~TCP_OP_CLOSE; + state = s->tcb.state; memset(&s->tcb, 0, sizeof(s->tcb)); /* reset cached destination */ memset(&s->tx.dst, 0, sizeof(s->tx.dst)); - if (uop != TCP_OP_ACCEPT) { + /* state could be ESTABLISHED, CLOSED or LISTEN + * stream in CLOSED state has already been cleared by stream_term + * stream in ESTABLISHED state is accepted stream, and doesn't need clear + */ + if (state == TCP_ST_LISTEN) { /* free stream's destination port */ stream_clear_ctx(ctx, &s->s); - if (uop == TCP_OP_LISTEN) - empty_lq(s); + empty_lq(s); } if (s->ste != NULL) { /* remove entry from RX streams table */ - stbl_del_stream(st, s->ste, s, - (s->flags & TLE_CTX_FLAG_ST) == 0); + stbl_del_stream(st, s->ste, &s->s); s->ste = NULL; empty_rq(s); } @@ -184,6 +188,48 @@ tcp_stream_reset(struct tle_ctx *ctx, struct tle_tcp_stream *s) put_stream(ctx, &s->s, TCP_STREAM_TX_FINISHED(s)); } +static inline void +stream_term(struct tle_tcp_stream *s) +{ + struct sdr *dr; + + /* 1) recv a RST packet; 2) keepalive timeout */ + if (s->tcb.state == TCP_ST_ESTABLISHED) { + TCP_DEC_STATS_ATOMIC(TCP_MIB_CURRESTAB); + TCP_INC_STATS(TCP_MIB_ESTABRESETS); + } + + s->tcb.state = TCP_ST_CLOSED; + rte_smp_wmb(); + + /* 
close() was already invoked, schedule final cleanup */ + if ((s->tcb.uop & TCP_OP_CLOSE) != 0) { + if ((s->tcb.uop & TCP_OP_ACCEPT) == 0) { + /* free stream's destination port */ + stream_clear_ctx(s->s.ctx, &s->s); + if ((s->tcb.uop & TCP_OP_LISTEN) != 0) + empty_lq(s); + } + + if (s->ste != NULL) { + /* remove entry from RX streams table */ + stbl_del_stream(CTX_TCP_STLB(s->s.ctx), s->ste, &s->s); + s->ste = NULL; + empty_rq(s); + } + + dr = CTX_TCP_SDR(s->s.ctx); + rte_spinlock_lock(&dr->lock); + STAILQ_INSERT_TAIL(&dr->be, &s->s, link); + rte_spinlock_unlock(&dr->lock); + + /* notify user that stream need to be closed */ + } else if (s->err.ev != NULL) + tle_event_raise(s->err.ev); + else if (s->err.cb.func != NULL) + s->err.cb.func(s->err.cb.data, &s->s); +} + #ifdef __cplusplus } #endif diff --git a/lib/libtle_l4p/tcp_misc.h b/lib/libtle_l4p/tcp_misc.h index 0cef8b2..1f7974e 100644 --- a/lib/libtle_l4p/tcp_misc.h +++ b/lib/libtle_l4p/tcp_misc.h @@ -30,7 +30,7 @@ extern "C" { * of protocol related data. */ -#define TCP_WSCALE_DEFAULT 7 +#define TCP_WSCALE_DEFAULT 10 #define TCP_WSCALE_NONE 0 #define TCP_TX_HDR_MAX (sizeof(struct tcp_hdr) + TCP_TX_OPT_LEN_MAX) @@ -71,27 +71,6 @@ extern "C" { /* TCP flags mask. 
*/ #define TCP_FLAG_MASK UINT8_MAX -union typflg { - uint16_t raw; - struct { - uint8_t type; /* TLE_V4/TLE_V6 */ - uint8_t flags; /* TCP header flags */ - }; -}; - -union pkt_info { - rte_xmm_t raw; - struct { - union typflg tf; - uint16_t csf; /* checksum flags */ - union l4_ports port; - union { - union ipv4_addrs addr4; - const union ipv6_addrs *addr6; - }; - }; -}; - union seg_info { rte_xmm_t raw; struct { @@ -226,7 +205,7 @@ struct dack_info { }; /* get current timestamp in ms */ -static inline uint32_t +static inline uint64_t tcp_get_tms(uint32_t mshift) { uint64_t ts; @@ -344,7 +323,9 @@ fill_syn_opts(void *p, const struct syn_opts *so) opt = (struct tcpopt *)to; } - to[0] = TCP_OPT_KIND_EOL; + to[0] = TCP_OPT_KIND_NOP; + to[1] = TCP_OPT_KIND_NOP; + to[2] = TCP_OPT_KIND_NOP; } /* @@ -390,6 +371,8 @@ get_tms_opts(uintptr_t p, uint32_t len) else if (kind == TCP_OPT_KIND_NOP) i += sizeof(to->kl.kind); else { + if (to->kl.len == 0) + break; i += to->kl.len; if (i <= len && to->kl.raw == TCP_OPT_KL_TMS) { ts.val = rte_be_to_cpu_32(to->ts.val); @@ -449,7 +432,6 @@ get_pkt_info(const struct rte_mbuf *m, union pkt_info *pi, union seg_info *si) ((uintptr_t)tcph + offsetof(struct tcp_hdr, src_port)); pi->tf.flags = tcph->tcp_flags; pi->tf.type = type; - pi->csf = m->ol_flags & (PKT_RX_IP_CKSUM_MASK | PKT_RX_L4_CKSUM_MASK); pi->port.raw = prt->raw; get_seg_info(tcph, si); @@ -462,7 +444,7 @@ tcp_mbuf_seq_free(struct rte_mbuf *mb[], uint32_t num) len = 0; for (i = 0; i != num; i++) { - len += mb[i]->pkt_len; + len += PKT_L4_PLEN(mb[i]); rte_pktmbuf_free(mb[i]); } diff --git a/lib/libtle_l4p/tcp_ofo.c b/lib/libtle_l4p/tcp_ofo.c index 1565445..b31f2b5 100644 --- a/lib/libtle_l4p/tcp_ofo.c +++ b/lib/libtle_l4p/tcp_ofo.c @@ -12,7 +12,6 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include #include #include "tcp_stream.h" @@ -28,12 +27,6 @@ #define OFO_OBJ_MAX (OFODB_OBJ_MAX * OFO_DB_MAX) void -tcp_ofo_free(struct ofo *ofo) -{ - rte_free(ofo); -} - -static void calc_ofo_elems(uint32_t nbufs, uint32_t *nobj, uint32_t *ndb) { uint32_t n, nd, no; @@ -51,35 +44,3 @@ calc_ofo_elems(uint32_t nbufs, uint32_t *nobj, uint32_t *ndb) *nobj = no; *ndb = nd; } - -struct ofo * -tcp_ofo_alloc(uint32_t nbufs, int32_t socket) -{ - uint32_t i, ndb, nobj; - size_t dsz, osz, sz; - struct ofo *ofo; - struct rte_mbuf **obj; - - calc_ofo_elems(nbufs, &nobj, &ndb); - osz = sizeof(*ofo) + sizeof(ofo->db[0]) * ndb; - dsz = sizeof(ofo->db[0].obj[0]) * nobj * ndb; - sz = osz + dsz; - - ofo = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE, socket); - if (ofo == NULL) { - TCP_LOG(ERR, "%s: allocation of %zu bytes on socket %d " - "failed with error code: %d\n", - __func__, sz, socket, rte_errno); - return NULL; - } - - obj = (struct rte_mbuf **)&ofo->db[ndb]; - for (i = 0; i != ndb; i++) { - ofo->db[i].nb_max = nobj; - ofo->db[i].obj = obj + i * nobj; - } - - ofo->nb_max = ndb; - return ofo; -} - diff --git a/lib/libtle_l4p/tcp_ofo.h b/lib/libtle_l4p/tcp_ofo.h index 9d88266..0857f17 100644 --- a/lib/libtle_l4p/tcp_ofo.h +++ b/lib/libtle_l4p/tcp_ofo.h @@ -20,8 +20,6 @@ extern "C" { #endif -#include - struct ofodb { uint32_t nb_elem; uint32_t nb_max; @@ -103,7 +101,7 @@ _ofo_insert_mbuf(struct ofo* ofo, uint32_t pos, union seqlen* sl, db->obj[k + i] = mb[i]; } if (tcp_seq_lt(end, seq)) - rte_pktmbuf_trim(mb[i - 1], seq - end); + _rte_pktmbuf_trim(mb[i - 1], seq - end); db->nb_elem += i; db->sl.len += tcp_seq_min(seq, end) - sl->seq; @@ -157,7 +155,7 @@ _ofo_insert_right(struct ofo *ofo, uint32_t pos, union seqlen *sl, plen = mb[i]->pkt_len; if (n < plen) { /* adjust partially overlapped packet. 
*/ - rte_pktmbuf_adj(mb[i], n); + mb[i] = _rte_pktmbuf_adj(mb[i], n); break; } } @@ -258,7 +256,7 @@ static inline uint32_t _ofodb_enqueue(struct rte_ring *r, const struct ofodb *db, uint32_t *seq) { uint32_t i, n, num, begin, end; - struct rte_mbuf *pkt; + struct rte_mbuf* pkt; n = 0; num = db->nb_elem; @@ -289,11 +287,7 @@ _ofodb_enqueue(struct rte_ring *r, const struct ofodb *db, uint32_t *seq) return num - n; } -struct ofo * -tcp_ofo_alloc(uint32_t nbufs, int32_t socket); - -void -tcp_ofo_free(struct ofo *ofo); +void calc_ofo_elems(uint32_t nbufs, uint32_t *nobj, uint32_t *ndb); #ifdef __cplusplus } diff --git a/lib/libtle_l4p/tcp_rxq.h b/lib/libtle_l4p/tcp_rxq.h index 2351ee6..be092f9 100644 --- a/lib/libtle_l4p/tcp_rxq.h +++ b/lib/libtle_l4p/tcp_rxq.h @@ -17,6 +17,7 @@ #define _TCP_RXQ_H_ #include "tcp_ofo.h" +#include "tcp_ctl.h" #ifdef __cplusplus extern "C" { @@ -74,6 +75,7 @@ rx_ofo_reduce(struct tle_tcp_stream *s) s->tcb.rcv.nxt = seq; _ofo_remove(ofo, 0, i); + return n; } @@ -133,6 +135,8 @@ rx_data_enqueue(struct tle_tcp_stream *s, uint32_t seq, uint32_t len, } n = rte_ring_count(s->rx.q); + /* update receive window with left recv buffer*/ + s->tcb.rcv.wnd = calc_rx_wnd(s, s->tcb.rcv.wscale); if (r != n) { /* raise RX event */ if (s->rx.ev != NULL) diff --git a/lib/libtle_l4p/tcp_rxtx.c b/lib/libtle_l4p/tcp_rxtx.c index a519645..5d7e0d1 100644 --- a/lib/libtle_l4p/tcp_rxtx.c +++ b/lib/libtle_l4p/tcp_rxtx.c @@ -28,8 +28,30 @@ #include "tcp_rxq.h" #include "tcp_txq.h" #include "tcp_tx_seg.h" +#include "tcp_rxtx.h" -#define TCP_MAX_PKT_SEG 0x20 +/* Uncomment below line to debug cwnd */ +// #define DEBUG_CWND + +#ifdef DEBUG_CWND +#define CWND_INFO(msg, value) printf("CWND: %s: %d\n", msg, value) +#else +#define CWND_INFO(msg, value) do {} while (0) +#endif + +#define TCP_MAX_PKT_SEG 0x20 +#define DELAY_ACK_CHECK_INTERVAL 100 + +/* must larger than l2_len(14)+l3_len(20)+l4_len(20)+tms_option(12) */ +#define RESERVE_HEADER_LEN 128 + +/* If we encounter 
exhaustion of recv win, we set this thresh to + * update recv win to the remote. It's not set to 1 or some smaller + * value to avoid too-frequent update. + */ +#define RECV_WIN_NOTIFY_THRESH 64 + +static inline int stream_fill_dest(struct tle_tcp_stream *s); /* * checks if input TCP ports and IP addresses match given stream. @@ -54,11 +76,17 @@ rx_check_stream(const struct tle_tcp_stream *s, const union pkt_info *pi) static inline struct tle_tcp_stream * rx_obtain_listen_stream(const struct tle_dev *dev, const union pkt_info *pi, - uint32_t type) + uint32_t type, uint8_t reuse) { struct tle_tcp_stream *s; - s = (struct tle_tcp_stream *)dev->dp[type]->streams[pi->port.dst]; + if (type == TLE_V4) + s = bhash_lookup4(dev->ctx->bhash[type], + pi->addr4.dst, pi->port.dst, reuse); + else + s = bhash_lookup6(dev->ctx->bhash[type], + pi->addr6->dst, pi->port.dst, reuse); + if (s == NULL || tcp_stream_acquire(s) < 0) return NULL; @@ -77,10 +105,10 @@ rx_obtain_stream(const struct tle_dev *dev, struct stbl *st, { struct tle_tcp_stream *s; - s = stbl_find_data(st, pi); + s = TCP_STREAM(stbl_find_stream(st, pi)); if (s == NULL) { - if (pi->tf.flags == TCP_FLAG_ACK) - return rx_obtain_listen_stream(dev, pi, type); + if (pi->tf.flags & TCP_FLAG_ACK) + return rx_obtain_listen_stream(dev, pi, type, 1); return NULL; } @@ -150,131 +178,6 @@ pkt_info_bulk_syneq(const union pkt_info pi[], uint32_t num) return i; } -static inline void -stream_drb_free(struct tle_tcp_stream *s, struct tle_drb *drbs[], - uint32_t nb_drb) -{ - _rte_ring_enqueue_burst(s->tx.drb.r, (void **)drbs, nb_drb); -} - -static inline uint32_t -stream_drb_alloc(struct tle_tcp_stream *s, struct tle_drb *drbs[], - uint32_t nb_drb) -{ - return _rte_ring_dequeue_burst(s->tx.drb.r, (void **)drbs, nb_drb); -} - -static inline uint32_t -get_ip_pid(struct tle_dev *dev, uint32_t num, uint32_t type, uint32_t st) -{ - uint32_t pid; - rte_atomic32_t *pa; - - pa = &dev->tx.packet_id[type]; - - if (st == 0) { - pid = 
rte_atomic32_add_return(pa, num); - return pid - num; - } else { - pid = rte_atomic32_read(pa); - rte_atomic32_set(pa, pid + num); - return pid; - } -} - -static inline void -fill_tcph(struct tcp_hdr *l4h, const struct tcb *tcb, union l4_ports port, - uint32_t seq, uint8_t hlen, uint8_t flags) -{ - uint16_t wnd; - - l4h->src_port = port.dst; - l4h->dst_port = port.src; - - wnd = (flags & TCP_FLAG_SYN) ? - RTE_MIN(tcb->rcv.wnd, (uint32_t)UINT16_MAX) : - tcb->rcv.wnd >> tcb->rcv.wscale; - - /* ??? use sse shuffle to hton all remaining 16 bytes at once. ??? */ - l4h->sent_seq = rte_cpu_to_be_32(seq); - l4h->recv_ack = rte_cpu_to_be_32(tcb->rcv.nxt); - l4h->data_off = hlen / TCP_DATA_ALIGN << TCP_DATA_OFFSET; - l4h->tcp_flags = flags; - l4h->rx_win = rte_cpu_to_be_16(wnd); - l4h->cksum = 0; - l4h->tcp_urp = 0; - - if (flags & TCP_FLAG_SYN) - fill_syn_opts(l4h + 1, &tcb->so); - else if ((flags & TCP_FLAG_RST) == 0 && tcb->so.ts.raw != 0) - fill_tms_opts(l4h + 1, tcb->snd.ts, tcb->rcv.ts); -} - -static inline int -tcp_fill_mbuf(struct rte_mbuf *m, const struct tle_tcp_stream *s, - const struct tle_dest *dst, uint64_t ol_flags, - union l4_ports port, uint32_t seq, uint32_t flags, - uint32_t pid, uint32_t swcsm) -{ - uint32_t l4, len, plen; - struct tcp_hdr *l4h; - char *l2h; - - len = dst->l2_len + dst->l3_len; - plen = m->pkt_len; - - if (flags & TCP_FLAG_SYN) - l4 = sizeof(*l4h) + TCP_TX_OPT_LEN_MAX; - else if ((flags & TCP_FLAG_RST) == 0 && s->tcb.rcv.ts != 0) - l4 = sizeof(*l4h) + TCP_TX_OPT_LEN_TMS; - else - l4 = sizeof(*l4h); - - /* adjust mbuf to put L2/L3/L4 headers into it. */ - l2h = rte_pktmbuf_prepend(m, len + l4); - if (l2h == NULL) - return -EINVAL; - - /* copy L2/L3 header */ - rte_memcpy(l2h, dst->hdr, len); - - /* setup TCP header & options */ - l4h = (struct tcp_hdr *)(l2h + len); - fill_tcph(l4h, &s->tcb, port, seq, l4, flags); - - /* setup mbuf TX offload related fields. 
*/ - m->tx_offload = _mbuf_tx_offload(dst->l2_len, dst->l3_len, l4, 0, 0, 0); - m->ol_flags |= ol_flags; - - /* update proto specific fields. */ - - if (s->s.type == TLE_V4) { - struct ipv4_hdr *l3h; - l3h = (struct ipv4_hdr *)(l2h + dst->l2_len); - l3h->packet_id = rte_cpu_to_be_16(pid); - l3h->total_length = rte_cpu_to_be_16(plen + dst->l3_len + l4); - - if ((ol_flags & PKT_TX_TCP_CKSUM) != 0) - l4h->cksum = _ipv4x_phdr_cksum(l3h, m->l3_len, - ol_flags); - else if (swcsm != 0) - l4h->cksum = _ipv4_udptcp_mbuf_cksum(m, len, l3h); - - if ((ol_flags & PKT_TX_IP_CKSUM) == 0 && swcsm != 0) - l3h->hdr_checksum = _ipv4x_cksum(l3h, m->l3_len); - } else { - struct ipv6_hdr *l3h; - l3h = (struct ipv6_hdr *)(l2h + dst->l2_len); - l3h->payload_len = rte_cpu_to_be_16(plen + l4); - if ((ol_flags & PKT_TX_TCP_CKSUM) != 0) - l4h->cksum = rte_ipv6_phdr_cksum(l3h, ol_flags); - else if (swcsm != 0) - l4h->cksum = _ipv6_udptcp_mbuf_cksum(m, len, l3h); - } - - return 0; -} - /* * That function supposed to be used only for data packets. * Assumes that L2/L3/L4 headers and mbuf fields already setup properly. @@ -355,6 +258,9 @@ tx_data_pkts(struct tle_tcp_stream *s, struct rte_mbuf *const m[], uint32_t num) i = tle_dring_mp_enqueue(&dev->tx.dr, (const void * const*)m, num, drb, &nb); + if (i > 0) + timer_stop(s, TIMER_DACK); + /* free unused drbs. 
*/ if (nb != 0) stream_drb_free(s, drb + nbm - nb, nb); @@ -362,6 +268,113 @@ tx_data_pkts(struct tle_tcp_stream *s, struct rte_mbuf *const m[], uint32_t num) return i; } +/* + * case 0: pkt is not split yet, (indicate plen > sl->len) + * case 1: pkt is split, but left packet > sl->len + * case 2: pkt is split, but left packet <= sl->len + */ +static inline struct rte_mbuf * +get_indirect_mbuf(struct tle_tcp_stream *s, + struct rte_mbuf *m, uint32_t *p_plen, + union seqlen *sl, uint32_t type, + uint32_t mss) +{ + uint32_t hdr_len = PKT_L234_HLEN(m), plen, left; + struct rte_mbuf *f, *t; + uint16_t i, nb_segs, adj; + void *hdr; + + if (s->tcb.snd.nxt_pkt) { + f = s->tcb.snd.nxt_pkt; + plen = f->data_len - s->tcb.snd.nxt_offset; + if (f == m) /* 1st segment contains net headers */ + plen -= hdr_len; + } else { + f = m; + plen = f->data_len - hdr_len; + } + + TCP_LOG(DEBUG, "m(%p): pkt_len=%u, nb_segs=%u, sl->len = %u\n", + m, m->pkt_len, m->nb_segs, sl->len); + + nb_segs = 1; + if (sl->len < plen) { + /* Segment split needed: sometimes, cwnd will be reset to + * 1 or 2 mss. In this case, we send part of this seg, and + * record which segment we've sent, and the offset of sent + * data in tcb. 
+ */ + left = plen - sl->len; + plen = sl->len; + s->tcb.snd.nxt_pkt = f; + } else { + left = 0; + t = f->next; + while (t && plen + t->data_len <= sl->len) { + plen += t->data_len; + t = t->next; + nb_segs++; + } + s->tcb.snd.nxt_pkt = t; + } + + struct rte_mbuf *pkts[1 + nb_segs]; + if (rte_pktmbuf_alloc_bulk(s->tx.dst.head_mp, pkts, 1 + nb_segs) < 0) + return NULL; + + rte_pktmbuf_attach(pkts[1], f); + + /* remove bytes in the beginning */ + adj = s->tcb.snd.nxt_offset; + if (f == m) + adj += hdr_len; + if (adj) + rte_pktmbuf_adj(pkts[1], adj); + + /* remove bytes in the end */ + if (left > 0) { + rte_pktmbuf_trim(pkts[1], left); + s->tcb.snd.nxt_offset += plen; + } else + s->tcb.snd.nxt_offset = 0; + + /* attach chaining segment if we have */ + for (i = 1, t = f->next; i < nb_segs; ++i) { + rte_pktmbuf_attach(pkts[i+1], t); + pkts[i]->next = pkts[i+1]; + t = t->next; + } + + /* prepare l2/l3/l4 header */ + hdr = rte_pktmbuf_append(pkts[0], hdr_len); + rte_memcpy(hdr, rte_pktmbuf_mtod(m, void *), hdr_len); + pkts[0]->nb_segs = nb_segs + 1; + pkts[0]->pkt_len = plen + hdr_len; + pkts[0]->ol_flags = m->ol_flags; + pkts[0]->tx_offload = m->tx_offload; + if (type == TLE_V4) { + struct ipv4_hdr *l3h; + + l3h = rte_pktmbuf_mtod_offset(pkts[0], + struct ipv4_hdr *, m->l2_len); + l3h->total_length = + rte_cpu_to_be_16(plen + m->l3_len + m->l4_len); + } else { + struct ipv6_hdr *l3h; + + l3h = rte_pktmbuf_mtod_offset(pkts[0], + struct ipv6_hdr *, m->l2_len); + l3h->payload_len = + rte_cpu_to_be_16(plen + m->l4_len); + } + if (plen <= mss) + pkts[0]->ol_flags &= ~PKT_TX_TCP_SEG; + pkts[0]->next = pkts[1]; + + *p_plen = plen; + return pkts[0]; +} + static inline uint32_t tx_data_bulk(struct tle_tcp_stream *s, union seqlen *sl, struct rte_mbuf *mi[], uint32_t num) @@ -371,11 +384,13 @@ tx_data_bulk(struct tle_tcp_stream *s, union seqlen *sl, struct rte_mbuf *mi[], struct rte_mbuf *mb; struct rte_mbuf *mo[MAX_PKT_BURST + TCP_MAX_PKT_SEG]; + /* check stream has drb to send 
pkts */ + if (stream_drb_empty(s)) + return 0; + mss = s->tcb.snd.mss; type = s->s.type; - dev = s->tx.dst.dev; - pid = get_ip_pid(dev, num, type, (s->flags & TLE_CTX_FLAG_ST) != 0); k = 0; tn = 0; @@ -383,26 +398,64 @@ tx_data_bulk(struct tle_tcp_stream *s, union seqlen *sl, struct rte_mbuf *mi[], for (i = 0; i != num && sl->len != 0 && fail == 0; i++) { mb = mi[i]; - sz = RTE_MIN(sl->len, mss); plen = PKT_L4_PLEN(mb); /*fast path, no need to use indirect mbufs. */ - if (plen <= sz) { - + if (s->tcb.snd.nxt_pkt == NULL && plen <= sl->len) { + pid = get_ip_pid(dev, calc_seg_cnt(plen, s->tcb.snd.mss), + type, (s->flags & TLE_CTX_FLAG_ST) != 0); /* update pkt TCP header */ - tcp_update_mbuf(mb, type, &s->tcb, sl->seq, pid + i); + tcp_update_mbuf(mb, type, &s->tcb, sl->seq, pid); /* keep mbuf till ACK is received. */ rte_pktmbuf_refcnt_update(mb, 1); sl->len -= plen; sl->seq += plen; mo[k++] = mb; - /* remaining snd.wnd is less them MSS, send nothing */ - } else if (sz < mss) + if (sl->seq <= s->tcb.snd.rcvr) + TCP_INC_STATS(TCP_MIB_RETRANSSEGS); + /* remaining snd.wnd is less than MSS, send nothing */ + } else if (sl->len < mss) { + break; + /* some data to send already */ + } else if (k != 0 || tn != 0) { break; /* packet indirection needed */ - else - RTE_VERIFY(0); + } else { + struct rte_mbuf *out; + + out = get_indirect_mbuf(s, mb, &plen, sl, type, mss); + if (out == NULL) + return 0; + + pid = get_ip_pid(dev, calc_seg_cnt(plen, s->tcb.snd.mss), + type, (s->flags & TLE_CTX_FLAG_ST) != 0); + /* update pkt TCP header */ + tcp_update_mbuf(out, type, &s->tcb, sl->seq, pid); + + /* no need to bump refcnt !!! 
*/ + + sl->len -= plen; + sl->seq += plen; + + if (tx_data_pkts(s, &out, 1) == 0) { + /* should not happen, we have checked at least one + * drb is available to send this mbuf + */ + rte_pktmbuf_free(out); + return 0; + } + + if (sl->seq <= s->tcb.snd.rcvr) + TCP_INC_STATS(TCP_MIB_RETRANSSEGS); + + if (s->tcb.snd.nxt_pkt) + return 0; + else { + tn = 1; + continue; + } + } if (k >= MAX_PKT_BURST) { n = tx_data_pkts(s, mo, k); @@ -466,14 +519,17 @@ tx_nxt_data(struct tle_tcp_stream *s, uint32_t tms) tcp_txq_set_nxt_head(s, n); } while (n == num); - s->tcb.snd.nxt += sl.seq - (uint32_t)s->tcb.snd.nxt; + if (sl.seq != (uint32_t)s->tcb.snd.nxt) { + s->tcb.snd.nxt += sl.seq - (uint32_t)s->tcb.snd.nxt; + s->tcb.snd.ack = s->tcb.rcv.nxt; + } return tn; } static inline void free_una_data(struct tle_tcp_stream *s, uint32_t len) { - uint32_t i, num, plen; + uint32_t i, num, plen, una_data; struct rte_mbuf **mi; plen = 0; @@ -487,14 +543,18 @@ free_una_data(struct tle_tcp_stream *s, uint32_t len) /* free acked data */ for (i = 0; i != num && plen != len; i++) { - uint32_t next_pkt_len = PKT_L4_PLEN(mi[i]); - if (plen + next_pkt_len > len) { - /* keep SND.UNA at the start of the packet */ - len = plen; + una_data = PKT_L4_PLEN(mi[i]) - s->tcb.snd.una_offset; + + /* partial ack */ + if (plen + una_data > len) { + s->tcb.snd.una_offset += len - plen; + plen = len; break; - } else { - plen += next_pkt_len; } + + /* monolithic ack */ + s->tcb.snd.una_offset = 0; + plen += una_data; rte_pktmbuf_free(mi[i]); } @@ -503,6 +563,7 @@ free_una_data(struct tle_tcp_stream *s, uint32_t len) } while (plen < len); s->tcb.snd.una += len; + s->tcb.snd.waitlen -= len; /* * that could happen in case of retransmit, @@ -519,7 +580,7 @@ calc_smss(uint16_t mss, const struct tle_dest *dst) { uint16_t n; - n = dst->mtu - dst->l2_len - dst->l3_len - TCP_TX_HDR_DACK; + n = dst->mtu - dst->l3_len - sizeof(struct tcp_hdr); mss = RTE_MIN(n, mss); return mss; } @@ -537,71 +598,53 @@ initial_cwnd(uint32_t 
smss, uint32_t icw) return RTE_MIN(10 * smss, RTE_MAX(2 * smss, icw)); } -/* - * queue standalone packet to he particular output device - * It assumes that: - * - L2/L3/L4 headers should be already set. - * - packet fits into one segment. - */ -static inline int -send_pkt(struct tle_tcp_stream *s, struct tle_dev *dev, struct rte_mbuf *m) +void +tle_tcp_stream_kill(struct tle_stream *ts) { - uint32_t n, nb; - struct tle_drb *drb; - - if (stream_drb_alloc(s, &drb, 1) == 0) - return -ENOBUFS; - - /* enqueue pkt for TX. */ - nb = 1; - n = tle_dring_mp_enqueue(&dev->tx.dr, (const void * const*)&m, 1, - &drb, &nb); - - /* free unused drbs. */ - if (nb != 0) - stream_drb_free(s, &drb, 1); - - return (n == 1) ? 0 : -ENOBUFS; -} + struct tle_tcp_stream *s; -static inline int -send_ctrl_pkt(struct tle_tcp_stream *s, struct rte_mbuf *m, uint32_t seq, - uint32_t flags) -{ - const struct tle_dest *dst; - uint32_t pid, type; - int32_t rc; + s = TCP_STREAM(ts); + if (ts == NULL || s->s.type >= TLE_VNUM) + return; - dst = &s->tx.dst; - type = s->s.type; - pid = get_ip_pid(dst->dev, 1, type, (s->flags & TLE_CTX_FLAG_ST) != 0); + if (s->tcb.state > TCP_ST_LISTEN) + send_rst(s, s->tcb.snd.nxt); - rc = tcp_fill_mbuf(m, s, dst, 0, s->s.port, seq, flags, pid, 1); - if (rc == 0) - rc = send_pkt(s, dst->dev, m); + if (s->tcb.state == TCP_ST_ESTABLISHED) + TCP_DEC_STATS_ATOMIC(TCP_MIB_CURRESTAB); - return rc; + s->tcb.state = TCP_ST_CLOSED; + rte_smp_wmb(); + timer_stop(s, TIMER_RTO); } static inline int -send_rst(struct tle_tcp_stream *s, uint32_t seq) +send_ack(struct tle_tcp_stream *s, uint32_t tms, uint32_t flags) { struct rte_mbuf *m; + uint32_t seq; int32_t rc; m = rte_pktmbuf_alloc(s->tx.dst.head_mp); if (m == NULL) return -ENOMEM; - rc = send_ctrl_pkt(s, m, seq, TCP_FLAG_RST); - if (rc != 0) + seq = s->tcb.snd.nxt - ((flags & (TCP_FLAG_FIN | TCP_FLAG_SYN)) != 0); + s->tcb.snd.ts = tms; + + rc = send_ctrl_pkt(s, m, seq, flags); + if (rc != 0) { rte_pktmbuf_free(m); + return rc; + } 
- return rc; + timer_stop(s, TIMER_DACK); + s->tcb.snd.ack = s->tcb.rcv.nxt; + return 0; } static inline int -send_ack(struct tle_tcp_stream *s, uint32_t tms, uint32_t flags) +send_keepalive(struct tle_tcp_stream *s) { struct rte_mbuf *m; uint32_t seq; @@ -611,20 +654,16 @@ send_ack(struct tle_tcp_stream *s, uint32_t tms, uint32_t flags) if (m == NULL) return -ENOMEM; - seq = s->tcb.snd.nxt - ((flags & (TCP_FLAG_FIN | TCP_FLAG_SYN)) != 0); - s->tcb.snd.ts = tms; + seq = s->tcb.snd.una - 1; - rc = send_ctrl_pkt(s, m, seq, flags); + rc = send_ctrl_pkt(s, m, seq, TCP_FLAG_ACK); if (rc != 0) { rte_pktmbuf_free(m); return rc; } - - s->tcb.snd.ack = s->tcb.rcv.nxt; return 0; } - static int sync_ack(struct tle_tcp_stream *s, const union pkt_info *pi, const union seg_info *si, uint32_t ts, struct rte_mbuf *m) @@ -633,19 +672,23 @@ sync_ack(struct tle_tcp_stream *s, const union pkt_info *pi, int32_t rc; uint32_t pid, seq, type; struct tle_dev *dev; - const void *da; + const void *sa, *da; struct tle_dest dst; const struct tcp_hdr *th; - type = s->s.type; + type = pi->tf.type; /* get destination information. 
*/ - if (type == TLE_V4) + if (type == TLE_V4) { da = &pi->addr4.src; - else + sa = &pi->addr4.dst; + } + else { da = &pi->addr6->src; + sa = &pi->addr6->dst; + } - rc = stream_get_dest(&s->s, da, &dst); + rc = stream_get_dest(type, &s->s, sa, da, &dst); if (rc < 0) return rc; @@ -654,11 +697,16 @@ sync_ack(struct tle_tcp_stream *s, const union pkt_info *pi, get_syn_opts(&s->tcb.so, (uintptr_t)(th + 1), m->l4_len - sizeof(*th)); s->tcb.rcv.nxt = si->seq + 1; + s->tcb.rcv.cpy = si->seq + 1; seq = sync_gen_seq(pi, s->tcb.rcv.nxt, ts, s->tcb.so.mss, s->s.ctx->prm.hash_alg, &s->s.ctx->prm.secret_key); - s->tcb.so.ts.ecr = s->tcb.so.ts.val; - s->tcb.so.ts.val = sync_gen_ts(ts, s->tcb.so.wscale); + + if (s->tcb.so.ts.raw) { + s->tcb.so.ts.ecr = s->tcb.so.ts.val; + s->tcb.so.ts.val = sync_gen_ts(ts, s->tcb.so.wscale); + } + s->tcb.so.wscale = (s->tcb.so.wscale == TCP_WSCALE_NONE) ? TCP_WSCALE_NONE : TCP_WSCALE_DEFAULT; s->tcb.so.mss = calc_smss(dst.mtu, &dst); @@ -672,11 +720,13 @@ sync_ack(struct tle_tcp_stream *s, const union pkt_info *pi, dev = dst.dev; pid = get_ip_pid(dev, 1, type, (s->flags & TLE_CTX_FLAG_ST) != 0); - rc = tcp_fill_mbuf(m, s, &dst, 0, pi->port, seq, - TCP_FLAG_SYN | TCP_FLAG_ACK, pid, 1); + rc = tcp_fill_mbuf(m, s, &dst, TCP_OLFLAGS_CKSUM(dst.ol_flags), + pi->port, seq, TCP_FLAG_SYN | TCP_FLAG_ACK, pid, 1); if (rc == 0) rc = send_pkt(s, dev, m); + TCP_INC_STATS(TCP_MIB_PASSIVEOPENS); + return rc; } @@ -800,43 +850,24 @@ restore_syn_opt(union seg_info *si, union tsopt *to, return 0; } -static inline void -stream_term(struct tle_tcp_stream *s) -{ - struct sdr *dr; - - s->tcb.state = TCP_ST_CLOSED; - rte_smp_wmb(); - - timer_stop(s); - - /* close() was already invoked, schedule final cleanup */ - if ((s->tcb.uop & TCP_OP_CLOSE) != 0) { - - dr = CTX_TCP_SDR(s->s.ctx); - STAILQ_INSERT_TAIL(&dr->be, &s->s, link); - - /* notify user that stream need to be closed */ - } else if (s->err.ev != NULL) - tle_event_raise(s->err.ev); - else if (s->err.cb.func != 
NULL) - s->err.cb.func(s->err.cb.data, &s->s); -} - static inline int stream_fill_dest(struct tle_tcp_stream *s) { int32_t rc; uint32_t type; - const void *da; + const void *sa, *da; - type = s->s.type; - if (type == TLE_V4) + type = s->s.type; + if (type == TLE_V4) { + sa = &s->s.ipv4.addr.dst; da = &s->s.ipv4.addr.src; - else + } + else { + sa = &s->s.ipv6.addr.dst; da = &s->s.ipv6.addr.src; + } - rc = stream_get_dest(&s->s, da, &s->tx.dst); + rc = stream_get_dest(type, &s->s, sa, da, &s->tx.dst); return (rc < 0) ? rc : 0; } @@ -851,19 +882,17 @@ accept_prep_stream(struct tle_tcp_stream *ps, struct stbl *st, int32_t rc; uint32_t rtt; - /* some TX still pending for that stream. */ - if (TCP_STREAM_TX_PENDING(cs)) - return -EAGAIN; - /* setup L4 ports and L3 addresses fields. */ cs->s.port.raw = pi->port.raw; cs->s.pmsk.raw = UINT32_MAX; if (pi->tf.type == TLE_V4) { + cs->s.type = TLE_V4; cs->s.ipv4.addr = pi->addr4; cs->s.ipv4.mask.src = INADDR_NONE; cs->s.ipv4.mask.dst = INADDR_NONE; } else if (pi->tf.type == TLE_V6) { + cs->s.type = TLE_V6; cs->s.ipv6.addr = *pi->addr6; rte_memcpy(&cs->s.ipv6.mask.src, &tle_ipv6_none, sizeof(cs->s.ipv6.mask.src)); @@ -887,7 +916,7 @@ accept_prep_stream(struct tle_tcp_stream *ps, struct stbl *st, cs->tcb.snd.rto = TCP_RTO_DEFAULT; /* copy streams type & flags. */ - cs->s.type = ps->s.type; + cs->s.type = pi->tf.type; cs->flags = ps->flags; /* retrive and cache destination information. 
*/ @@ -897,16 +926,23 @@ accept_prep_stream(struct tle_tcp_stream *ps, struct stbl *st, /* update snd.mss with SMSS value */ cs->tcb.snd.mss = calc_smss(cs->tcb.snd.mss, &cs->tx.dst); + if (cs->tcb.so.ts.raw != 0) { + cs->tcb.snd.mss -= TCP_TX_OPT_LEN_TMS; + } /* setup congestion variables */ cs->tcb.snd.cwnd = initial_cwnd(cs->tcb.snd.mss, ps->tcb.snd.cwnd); + CWND_INFO("accept", cs->tcb.snd.cwnd); + cs->tcb.snd.ssthresh = cs->tcb.snd.wnd; cs->tcb.snd.rto_tw = ps->tcb.snd.rto_tw; + cs->tcb.snd.rto_fw = ps->tcb.snd.rto_fw; cs->tcb.state = TCP_ST_ESTABLISHED; + TCP_INC_STATS_ATOMIC(TCP_MIB_CURRESTAB); /* add stream to the table */ - cs->ste = stbl_add_stream(st, pi, cs); + cs->ste = stbl_add_stream(st, &cs->s); if (cs->ste == NULL) return -ENOBUFS; @@ -937,7 +973,7 @@ rx_ack_listen(struct tle_tcp_stream *s, struct stbl *st, *csp = NULL; - if (pi->tf.flags != TCP_FLAG_ACK || rx_check_stream(s, pi) != 0) + if ((pi->tf.flags & TCP_FLAG_ACK) == 0|| rx_check_stream(s, pi) != 0) return -EINVAL; ctx = s->s.ctx; @@ -964,7 +1000,8 @@ rx_ack_listen(struct tle_tcp_stream *s, struct stbl *st, /* cleanup on failure */ tcp_stream_down(cs); - stbl_del_stream(st, cs->ste, cs, 0); + TCP_DEC_STATS_ATOMIC(TCP_MIB_CURRESTAB); + stbl_del_stream(st, cs->ste, &cs->s); cs->ste = NULL; } @@ -982,6 +1019,10 @@ data_pkt_adjust(const struct tcb *tcb, struct rte_mbuf **mb, uint32_t hlen, len = *plen; rte_pktmbuf_adj(*mb, hlen); + /* header is removed, so we clear tx_offload here to make sure + * we can get correct payload length with PKT_L4_PLEN. 
+ */ + (*mb)->tx_offload = 0; if (len == 0) return -ENODATA; /* cut off the start of the packet */ @@ -1018,7 +1059,8 @@ rx_ackdata(struct tle_tcp_stream *s, uint32_t ack) tle_event_raise(s->tx.ev); else if (k == 0 && s->tx.cb.func != NULL) s->tx.cb.func(s->tx.cb.data, &s->s); - } + } else + txs_enqueue(s->s.ctx, s); } return n; @@ -1029,8 +1071,7 @@ stream_timewait(struct tle_tcp_stream *s, uint32_t rto) { if (rto != 0) { s->tcb.state = TCP_ST_TIME_WAIT; - s->tcb.snd.rto = rto; - timer_reset(s); + timer_reset(s, TIMER_RTO, rto); } else stream_term(s); } @@ -1041,20 +1082,30 @@ rx_fin_state(struct tle_tcp_stream *s, struct resp_info *rsp) uint32_t state; int32_t ackfin; + s->tcb.rcv.frs.on = 2; s->tcb.rcv.nxt += 1; ackfin = (s->tcb.snd.una == s->tcb.snd.fss); state = s->tcb.state; if (state == TCP_ST_ESTABLISHED) { + TCP_DEC_STATS_ATOMIC(TCP_MIB_CURRESTAB); s->tcb.state = TCP_ST_CLOSE_WAIT; /* raise err.ev & err.cb */ - if (s->err.ev != NULL) + /* raise error event only when recvbuf is empty, to inform + * that the stream will not receive data any more. + */ + if (rte_ring_count(s->rx.q) == 0 && s->err.ev != NULL) tle_event_raise(s->err.ev); else if (s->err.cb.func != NULL) s->err.cb.func(s->err.cb.data, &s->s); } else if (state == TCP_ST_FIN_WAIT_1 || state == TCP_ST_CLOSING) { rsp->flags |= TCP_FLAG_ACK; + + /* shutdown instead of close happens */ + if (rte_ring_count(s->rx.q) == 0 && s->err.ev != NULL) + tle_event_raise(s->err.ev); + if (ackfin != 0) stream_timewait(s, s->tcb.snd.rto_tw); else @@ -1089,8 +1140,10 @@ rx_fin(struct tle_tcp_stream *s, uint32_t state, ts = rx_tms_opt(&s->tcb, mb); ret = rx_check_seqack(&s->tcb, seq, si->ack, plen, ts); - if (ret != 0) + if (ret != 0) { + rsp->flags |= TCP_FLAG_ACK; return ret; + } if (state < TCP_ST_ESTABLISHED) return -EINVAL; @@ -1108,9 +1161,10 @@ rx_fin(struct tle_tcp_stream *s, uint32_t state, * fast-path: all data & FIN was already sent out * and now is acknowledged. 
*/ - if (s->tcb.snd.fss == s->tcb.snd.nxt && - si->ack == (uint32_t)s->tcb.snd.nxt) { + if (s->tcb.snd.fss >= s->tcb.snd.nxt && + si->ack == (uint32_t)s->tcb.snd.fss) { s->tcb.snd.una = s->tcb.snd.fss; + s->tcb.snd.nxt = s->tcb.snd.una; empty_tq(s); /* conventional ACK processiing */ } else @@ -1148,8 +1202,25 @@ rx_rst(struct tle_tcp_stream *s, uint32_t state, uint32_t flags, else rc = check_seqn(&s->tcb, si->seq, 0); - if (rc == 0) + if (rc == 0) { + /* receive rst, connection is closed abnormal + * and should return errno in later operations. + */ + switch (state) { + case TCP_ST_SYN_SENT: + TCP_INC_STATS(TCP_MIB_ATTEMPTFAILS); + s->tcb.err = ECONNREFUSED; + break; + case TCP_ST_CLOSE_WAIT: + s->tcb.err = EPIPE; + break; + case TCP_ST_CLOSED: + return rc; + default: + s->tcb.err = ECONNRESET; + } stream_term(s); + } return rc; } @@ -1222,6 +1293,7 @@ rto_cwnd_update(struct tcb *tcb) * no more than 1 full-sized segment. */ tcb->snd.cwnd = tcb->snd.mss; + CWND_INFO("update", tcb->snd.cwnd); } static inline void @@ -1330,13 +1402,17 @@ rx_data_ack(struct tle_tcp_stream *s, struct dack_info *tack, ret = rx_check_seqack(&s->tcb, si[j].seq, si[j].ack, plen, ts); - if (ret != 0) - break; - /* account for segment received */ ack_info_update(tack, &si[j], ret != 0, plen, ts); + if (ret != 0) + break; + rte_pktmbuf_adj(mb[j], hlen); + /* header is removed, so we clear tx_offload here to make sure + * we can get correct payload length with PKT_L4_PLEN. 
+ */ + mb[j]->tx_offload = 0; } n = j - i; @@ -1377,6 +1453,7 @@ start_fast_retransmit(struct tle_tcp_stream *s) tcp_txq_rst_nxt_head(s); tcb->snd.nxt = tcb->snd.una; tcb->snd.cwnd = tcb->snd.ssthresh + 3 * tcb->snd.mss; + CWND_INFO("start fast retrans", tcb->snd.cwnd); } static inline void @@ -1389,6 +1466,7 @@ stop_fast_retransmit(struct tle_tcp_stream *s) n = tcb->snd.nxt - tcb->snd.una; tcb->snd.cwnd = RTE_MIN(tcb->snd.ssthresh, RTE_MAX(n, tcb->snd.mss) + tcb->snd.mss); + CWND_INFO("stop fast retrans", tcb->snd.cwnd); tcb->snd.fastack = 0; } @@ -1415,8 +1493,10 @@ in_fast_retransmit(struct tle_tcp_stream *s, uint32_t ack_len, uint32_t ack_num, * during fast recovery, also reset the * retransmit timer. */ - if (tcb->snd.fastack == 1) - timer_reset(s); + if (tcb->snd.fastack == 1) { + timer_reset(s, TIMER_RTO, s->tcb.snd.rto); + s->tcb.snd.nb_retx = 0; + } tcb->snd.fastack += ack_num; return 1; @@ -1456,7 +1536,8 @@ process_ack(struct tle_tcp_stream *s, uint32_t acked, /* remain in normal mode */ } else if (acked != 0) { ack_cwnd_update(&s->tcb, acked, tack); - timer_stop(s); + timer_stop(s, TIMER_RTO); + s->tcb.snd.nb_retx = 0; } /* fast retransmit mode */ @@ -1470,7 +1551,7 @@ process_ack(struct tle_tcp_stream *s, uint32_t acked, } else { /* RFC 5682 3.2.3 full ACK */ stop_fast_retransmit(s); - timer_stop(s); + timer_stop(s, TIMER_RTO); /* if we have another series of dup ACKs */ if (tack->dup3.seg != 0 && @@ -1501,17 +1582,22 @@ rx_ackfin(struct tle_tcp_stream *s) uint32_t state; s->tcb.snd.una = s->tcb.snd.fss; + s->tcb.snd.nxt = s->tcb.snd.una; empty_tq(s); state = s->tcb.state; if (state == TCP_ST_LAST_ACK) stream_term(s); else if (state == TCP_ST_FIN_WAIT_1) { - timer_stop(s); + timer_stop(s, TIMER_RTO); s->tcb.state = TCP_ST_FIN_WAIT_2; - } else if (state == TCP_ST_CLOSING) { + /* if stream is closed, should be released + * before timeout even without fin from peer + */ + if (s->tcb.uop & TCP_OP_CLOSE) + timer_start(s, TIMER_RTO, s->tcb.snd.rto_fw); + } 
else if (state == TCP_ST_CLOSING) stream_timewait(s, s->tcb.snd.rto_tw); - } } static inline void @@ -1532,7 +1618,7 @@ rx_process_ack(struct tle_tcp_stream *s, uint32_t ts, /* restart RTO timer. */ if (s->tcb.snd.nxt != s->tcb.snd.una) - timer_start(s); + timer_start(s, TIMER_RTO, s->tcb.snd.rto); /* update rto, if fresh packet is here then calculate rtt */ if (tack->ts.ecr != 0) @@ -1554,15 +1640,9 @@ rx_synack(struct tle_tcp_stream *s, uint32_t ts, uint32_t state, if (state != TCP_ST_SYN_SENT) return -EINVAL; - /* - * RFC 793 3.9: in the SYN-SENT state - * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send a reset - * - * and discard the segment. - * The connection remains in the same state. - */ + /* invalid SEG.SEQ */ if (si->ack != (uint32_t)s->tcb.snd.nxt) { - send_rst(s, si->ack); + rsp->flags = TCP_FLAG_RST; return 0; } @@ -1574,18 +1654,25 @@ rx_synack(struct tle_tcp_stream *s, uint32_t ts, uint32_t state, s->tcb.snd.una = s->tcb.snd.nxt; s->tcb.snd.mss = calc_smss(so.mss, &s->tx.dst); + if (s->tcb.so.ts.raw != 0) { + s->tcb.snd.mss -= TCP_TX_OPT_LEN_TMS; + } s->tcb.snd.wnd = si->wnd << so.wscale; s->tcb.snd.wu.wl1 = si->seq; s->tcb.snd.wu.wl2 = si->ack; s->tcb.snd.wscale = so.wscale; + s->tcb.snd.cork_ts = 0; /* setup congestion variables */ s->tcb.snd.cwnd = initial_cwnd(s->tcb.snd.mss, s->tcb.snd.cwnd); + CWND_INFO("synack", s->tcb.snd.cwnd); + s->tcb.snd.ssthresh = s->tcb.snd.wnd; s->tcb.rcv.ts = so.ts.val; s->tcb.rcv.irs = si->seq; s->tcb.rcv.nxt = si->seq + 1; + s->tcb.rcv.cpy = si->seq + 1; /* if peer doesn't support WSCALE opt, recalculate RCV.WND */ s->tcb.rcv.wscale = (so.wscale == TCP_WSCALE_NONE) ? 
@@ -1597,9 +1684,14 @@ rx_synack(struct tle_tcp_stream *s, uint32_t ts, uint32_t state, rsp->flags |= TCP_FLAG_ACK; - timer_stop(s); + timer_stop(s, TIMER_RTO); + s->tcb.snd.nb_retx = 0; s->tcb.state = TCP_ST_ESTABLISHED; rte_smp_wmb(); + TCP_INC_STATS_ATOMIC(TCP_MIB_CURRESTAB); + + if (s->s.option.keepalive) + timer_start(s, TIMER_KEEPALIVE, s->s.option.keepidle * MS_PER_S); if (s->tx.ev != NULL) tle_event_raise(s->tx.ev); @@ -1689,8 +1781,8 @@ rx_stream(struct tle_tcp_stream *s, uint32_t ts, * fast-path: all data & FIN was already sent out * and now is acknowledged. */ - if (s->tcb.snd.fss == s->tcb.snd.nxt && - tack.ack == (uint32_t)s->tcb.snd.nxt) + if (s->tcb.snd.fss >= s->tcb.snd.nxt && + tack.ack == (uint32_t)s->tcb.snd.fss) rx_ackfin(s); else rx_process_ack(s, ts, &tack); @@ -1702,27 +1794,44 @@ rx_stream(struct tle_tcp_stream *s, uint32_t ts, * - received segment with INO data and no TX is scheduled * for that stream. */ - if (tack.segs.badseq != 0 || tack.segs.ofo != 0 || - (tack.segs.data != 0 && - rte_atomic32_read(&s->tx.arm) == 0)) + if (tack.segs.badseq != 0 || tack.segs.ofo != 0) rsp.flags |= TCP_FLAG_ACK; + else if (tack.segs.data != 0 && + rte_atomic32_read(&s->tx.arm) == 0 && + (s->s.option.tcpquickack || + s->tcb.rcv.nxt - s->tcb.snd.ack > 8 * s->tcb.so.mss)) { + rsp.flags |= TCP_FLAG_ACK; + if (s->s.option.tcpquickack > 0) + s->s.option.tcpquickack--; + } + else if (tack.segs.data && rsp.flags == 0) + timer_start(s, TIMER_DACK, DELAY_ACK_CHECK_INTERVAL); rx_ofo_fin(s, &rsp); k += num - n; i = num; + if (s->s.option.keepalive) { + s->tcb.snd.nb_keepalive = 0; + timer_reset(s, TIMER_KEEPALIVE, s->s.option.keepidle * MS_PER_S); + } /* unhandled state, drop all packets. */ } else i = 0; /* we have a response packet to send. 
*/ - if (rsp.flags != 0) { + if (rsp.flags == TCP_FLAG_RST) { + send_rst(s, si[i].ack); + stream_term(s); + } else if (rsp.flags != 0) { send_ack(s, ts, rsp.flags); /* start the timer for FIN packet */ - if ((rsp.flags & TCP_FLAG_FIN) != 0) - timer_reset(s); + if ((rsp.flags & TCP_FLAG_FIN) != 0) { + timer_reset(s, TIMER_RTO, s->tcb.snd.rto); + s->tcb.snd.nb_retx = 0; + } } /* unprocessed packets */ @@ -1778,7 +1887,6 @@ rx_postsyn(struct tle_dev *dev, struct stbl *st, uint32_t type, uint32_t ts, state = s->tcb.state; if (state == TCP_ST_LISTEN) { - /* one connection per flow */ cs = NULL; ret = -EINVAL; @@ -1835,6 +1943,74 @@ rx_postsyn(struct tle_dev *dev, struct stbl *st, uint32_t type, uint32_t ts, return num - k; } +static inline void +sync_refuse(struct tle_tcp_stream *s, struct tle_dev *dev, + const union pkt_info *pi, struct rte_mbuf *m) +{ + struct ether_hdr *eth_h; + struct ether_addr eth_addr; + struct ipv4_hdr *ip_h; + uint32_t ip_addr; + struct ipv6_hdr *ipv6_h; + struct in6_addr ipv6_addr; + struct tcp_hdr *th; + uint16_t port; + + /* rst pkt should not contain options for syn */ + rte_pktmbuf_trim(m, m->l4_len - sizeof(*th)); + + eth_h = rte_pktmbuf_mtod(m, struct ether_hdr*); + ether_addr_copy(&eth_h->s_addr, &eth_addr); + ether_addr_copy(&eth_h->d_addr, &eth_h->s_addr); + ether_addr_copy(&eth_addr, &eth_h->d_addr); + + th = rte_pktmbuf_mtod_offset(m, struct tcp_hdr*, + m->l2_len + m->l3_len); + port = th->src_port; + th->src_port = th->dst_port; + th->dst_port = port; + th->tcp_flags = TCP_FLAG_RST | TCP_FLAG_ACK; + th->recv_ack = rte_cpu_to_be_32(rte_be_to_cpu_32(th->sent_seq) + 1); + th->sent_seq = 0; + th->data_off &= 0x0f; + th->data_off |= (sizeof(*th) / 4) << 4; + th->cksum = 0; + + if (pi->tf.type == TLE_V4) { + ip_h = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr*, + m->l2_len); + ip_addr = ip_h->src_addr; + ip_h->src_addr = ip_h->dst_addr; + ip_h->dst_addr = ip_addr; + ip_h->total_length = rte_cpu_to_be_16( + rte_be_to_cpu_16(ip_h->total_length) - + (m->l4_len
- sizeof(*th))); + ip_h->hdr_checksum = 0; + th->cksum = rte_ipv4_udptcp_cksum(ip_h, th); + ip_h->hdr_checksum = rte_ipv4_cksum(ip_h); + } else { + ipv6_h = rte_pktmbuf_mtod_offset(m, struct ipv6_hdr*, + m->l2_len); + rte_memcpy(&ipv6_addr, ipv6_h->src_addr, + sizeof(struct in6_addr)); + rte_memcpy(ipv6_h->src_addr, ipv6_h->dst_addr, + sizeof(struct in6_addr)); + rte_memcpy(ipv6_h->dst_addr, &ipv6_addr, + sizeof(struct in6_addr)); + ipv6_h->payload_len = rte_cpu_to_be_16( + rte_be_to_cpu_16(ipv6_h->payload_len) - + (m->l4_len - sizeof(*th))); + th->cksum = rte_ipv6_udptcp_cksum(ipv6_h, th); + } + + if (m->pkt_len < ETHER_MIN_LEN) + rte_pktmbuf_append(m, ETHER_MIN_LEN - m->pkt_len); + + if (send_pkt(s, dev, m) != 0) + rte_pktmbuf_free(m); + else + TCP_INC_STATS(TCP_MIB_OUTRSTS); +} static inline uint32_t rx_syn(struct tle_dev *dev, uint32_t type, uint32_t ts, @@ -1846,20 +2022,35 @@ rx_syn(struct tle_dev *dev, uint32_t type, uint32_t ts, uint32_t i, k; int32_t ret; - s = rx_obtain_listen_stream(dev, &pi[0], type); + s = rx_obtain_listen_stream(dev, &pi[0], type, 0); if (s == NULL) { - for (i = 0; i != num; i++) { - rc[i] = ENOENT; - rp[i] = mb[i]; + /* no socket listening this syn, send rst to refuse connect */ + s = TCP_STREAM(get_stream(dev->ctx)); + if (s != NULL) { + sync_refuse(s, dev, &pi[0], mb[0]); + put_stream(dev->ctx, &s->s, 0); + i = 1; + } else { + i = 0; } - return 0; + k = 0; + for (; i != num; i++) { + rc[k] = ENOENT; + rp[k] = mb[i]; + k++; + } + return num - k; } k = 0; for (i = 0; i != num; i++) { - + /* check if stream has space to maintain new connection */ + if (rte_ring_free_count(s->rx.q) == 0 || + (s->s.ctx->streams.nb_free == 0 && + s->s.ctx->streams.nb_cur >= s->s.ctx->prm.max_streams - 1)) + ret = -ENOSPC; /* check that this remote is allowed to connect */ - if (rx_check_stream(s, &pi[i]) != 0) + else if (rx_check_stream(s, &pi[i]) != 0) ret = -ENOENT; else /* syncokie: reply with */ @@ -1882,43 +2073,34 @@ tle_tcp_rx_bulk(struct tle_dev 
*dev, struct rte_mbuf *pkt[], { struct stbl *st; struct tle_ctx *ctx; - uint32_t i, j, k, mt, n, t, ts; + uint32_t i, j, k, n, t; + uint64_t ts; union pkt_info pi[num]; union seg_info si[num]; - union { - uint8_t t[TLE_VNUM]; - uint32_t raw; - } stu; + + TCP_ADD_STATS(TCP_MIB_INSEGS, num); ctx = dev->ctx; ts = tcp_get_tms(ctx->cycles_ms_shift); st = CTX_TCP_STLB(ctx); - mt = ((ctx->prm.flags & TLE_CTX_FLAG_ST) == 0); - - stu.raw = 0; /* extract packet info and check the L3/L4 csums */ for (i = 0; i != num; i++) { get_pkt_info(pkt[i], &pi[i], &si[i]); - t = pi[i].tf.type; - pi[i].csf = check_pkt_csum(pkt[i], pi[i].csf, t, IPPROTO_TCP); - stu.t[t] = mt; + pi[i].csf = check_pkt_csum(pkt[i], t, IPPROTO_TCP); } - if (stu.t[TLE_V4] != 0) - stbl_lock(st, TLE_V4); - if (stu.t[TLE_V6] != 0) - stbl_lock(st, TLE_V6); - k = 0; for (i = 0; i != num; i += j) { - t = pi[i].tf.type; /*basic checks for incoming packet */ - if (t >= TLE_VNUM || pi[i].csf != 0 || dev->dp[t] == NULL) { + if (t >= TLE_VNUM || pi[i].csf != 0) { + TCP_INC_STATS(TCP_MIB_INERRS); + if (t < TLE_VNUM) + TCP_INC_STATS(TCP_MIB_CSUMERRORS); rc[k] = EINVAL; rp[k] = pkt[i]; j = 1; @@ -1937,11 +2119,6 @@ tle_tcp_rx_bulk(struct tle_dev *dev, struct rte_mbuf *pkt[], } } - if (stu.t[TLE_V4] != 0) - stbl_unlock(st, TLE_V4); - if (stu.t[TLE_V6] != 0) - stbl_unlock(st, TLE_V6); - return num - k; } @@ -1953,21 +2130,37 @@ tle_tcp_stream_accept(struct tle_stream *ts, struct tle_stream *rs[], struct tle_tcp_stream *s; s = TCP_STREAM(ts); - n = _rte_ring_dequeue_burst(s->rx.q, (void **)rs, num); - if (n == 0) - return 0; - /* - * if we still have packets to read, - * then rearm stream RX event. 
- */ - if (n == num && rte_ring_count(s->rx.q) != 0) { - if (tcp_stream_try_acquire(s) > 0 && s->rx.ev != NULL) - tle_event_raise(s->rx.ev); + if (tcp_stream_try_acquire(s) > 0) { + if (s->tcb.state != TCP_ST_LISTEN) { + tcp_stream_release(s); + rte_errno = EINVAL; + return 0; + } + + n = _rte_ring_dequeue_burst(s->rx.q, (void **)rs, num); + if (n == 0) + { + tcp_stream_release(s); + rte_errno = EAGAIN; + return 0; + } + + /* + * if we still have packets to read, + * then rearm stream RX event. + */ + if (n == num && rte_ring_count(s->rx.q) != 0) { + if (s->rx.ev != NULL) + tle_event_raise(s->rx.ev); + } + tcp_stream_release(s); + return n; + } else { tcp_stream_release(s); + rte_errno = EINVAL; + return 0; } - - return n; } uint16_t @@ -1995,6 +2188,7 @@ tle_tcp_tx_bulk(struct tle_dev *dev, struct rte_mbuf *pkt[], uint16_t num) stream_drb_free(s, drb + i, j - i); } + TCP_ADD_STATS(TCP_MIB_OUTSEGS, n); return n; } @@ -2010,73 +2204,17 @@ stream_fill_pkt_info(const struct tle_tcp_stream *s, union pkt_info *pi) pi->tf.type = s->s.type; } -static int -stream_fill_addr(struct tle_tcp_stream *s, const struct sockaddr *addr) -{ - const struct sockaddr_in *in4; - const struct sockaddr_in6 *in6; - const struct tle_dev_param *prm; - int32_t rc; - - rc = 0; - s->s.pmsk.raw = UINT32_MAX; - - /* setup L4 src ports and src address fields. 
*/ - if (s->s.type == TLE_V4) { - in4 = (const struct sockaddr_in *)addr; - if (in4->sin_addr.s_addr == INADDR_ANY || in4->sin_port == 0) - return -EINVAL; - - s->s.port.src = in4->sin_port; - s->s.ipv4.addr.src = in4->sin_addr.s_addr; - s->s.ipv4.mask.src = INADDR_NONE; - s->s.ipv4.mask.dst = INADDR_NONE; - - } else if (s->s.type == TLE_V6) { - in6 = (const struct sockaddr_in6 *)addr; - if (memcmp(&in6->sin6_addr, &tle_ipv6_any, - sizeof(tle_ipv6_any)) == 0 || - in6->sin6_port == 0) - return -EINVAL; - - s->s.port.src = in6->sin6_port; - rte_memcpy(&s->s.ipv6.addr.src, &in6->sin6_addr, - sizeof(s->s.ipv6.addr.src)); - rte_memcpy(&s->s.ipv6.mask.src, &tle_ipv6_none, - sizeof(s->s.ipv6.mask.src)); - rte_memcpy(&s->s.ipv6.mask.dst, &tle_ipv6_none, - sizeof(s->s.ipv6.mask.dst)); - } - - /* setup the destination device. */ - rc = stream_fill_dest(s); - if (rc != 0) - return rc; - - /* setup L4 dst address from device param */ - prm = &s->tx.dst.dev->prm; - if (s->s.type == TLE_V4) { - if (s->s.ipv4.addr.dst == INADDR_ANY) - s->s.ipv4.addr.dst = prm->local_addr4.s_addr; - } else if (memcmp(&s->s.ipv6.addr.dst, &tle_ipv6_any, - sizeof(tle_ipv6_any)) == 0) - memcpy(&s->s.ipv6.addr.dst, &prm->local_addr6, - sizeof(s->s.ipv6.addr.dst)); - - return rc; -} - static inline int -tx_syn(struct tle_tcp_stream *s, const struct sockaddr *addr) +tx_syn(struct tle_tcp_stream *s) { int32_t rc; - uint32_t tms, seq; + uint32_t seq; + uint64_t tms; union pkt_info pi; struct stbl *st; struct stbl_entry *se; - /* fill stream address */ - rc = stream_fill_addr(s, addr); + rc = stream_fill_dest(s); if (rc != 0) return rc; @@ -2107,7 +2245,7 @@ tx_syn(struct tle_tcp_stream *s, const struct sockaddr *addr) /* add the stream in stream table */ st = CTX_TCP_STLB(s->s.ctx); - se = stbl_add_stream_lock(st, s); + se = stbl_add_stream(st, &s->s); if (se == NULL) return -ENOBUFS; s->ste = se; @@ -2115,6 +2253,7 @@ tx_syn(struct tle_tcp_stream *s, const struct sockaddr *addr) /* put stream into the 
to-send queue */ txs_enqueue(s->s.ctx, s); + TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); return 0; } @@ -2147,7 +2286,7 @@ tle_tcp_stream_connect(struct tle_stream *ts, const struct sockaddr *addr) /* fill stream, prepare and transmit syn pkt */ s->tcb.uop |= TCP_OP_CONNECT; - rc = tx_syn(s, addr); + rc = tx_syn(s); tcp_stream_release(s); /* error happened, do a cleanup */ @@ -2160,13 +2299,29 @@ tle_tcp_stream_connect(struct tle_stream *ts, const struct sockaddr *addr) uint16_t tle_tcp_stream_recv(struct tle_stream *ts, struct rte_mbuf *pkt[], uint16_t num) { - uint32_t n; + uint32_t n, i; + uint32_t free_slots; struct tle_tcp_stream *s; s = TCP_STREAM(ts); + + free_slots = rte_ring_free_count(s->rx.q); + n = _rte_ring_mcs_dequeue_burst(s->rx.q, (void **)pkt, num); - if (n == 0) + if (n == 0) { + if (s->tcb.err != 0) { + rte_errno = s->tcb.err; + } else { + rte_errno = EAGAIN; + } return 0; + } + + for (i = 0; i < n; ++i) + s->tcb.rcv.cpy += rte_pktmbuf_pkt_len(pkt[i]); + + /* update receive window with left recv buffer*/ + s->tcb.rcv.wnd = calc_rx_wnd(s, s->tcb.rcv.wscale); /* * if we still have packets to read, @@ -2176,28 +2331,99 @@ tle_tcp_stream_recv(struct tle_stream *ts, struct rte_mbuf *pkt[], uint16_t num) if (tcp_stream_try_acquire(s) > 0 && s->rx.ev != NULL) tle_event_raise(s->rx.ev); tcp_stream_release(s); + /* if we have received fin, no more data will come, raise err event. 
*/ + } else if (s->tcb.rcv.frs.on == 2) { + if (tcp_stream_try_acquire(s) > 0 && s->err.ev != NULL) + tle_event_raise(s->err.ev); + tcp_stream_release(s); + } + + /* update recv win to the remote */ + if (free_slots < RECV_WIN_NOTIFY_THRESH && + rte_ring_free_count(s->rx.q) >= RECV_WIN_NOTIFY_THRESH) { + s->tcb.snd.update_rcv = true; + txs_enqueue(s->s.ctx, s); } return n; } +uint16_t +tle_tcp_stream_inq(struct tle_stream *ts) +{ + struct tle_tcp_stream *s; + + s = TCP_STREAM(ts); + return s->tcb.rcv.nxt - s->tcb.rcv.cpy; +} + +#define DECONST(type, var) ((type)(uintptr_t)(const void *)(var)) + +ssize_t +tle_tcp_stream_readv(struct tle_stream *ts, const struct iovec *iov, int iovcnt) +{ + struct msghdr msg = {0}; + + msg.msg_iov = DECONST(struct iovec *, iov); /* Recover const later */ + msg.msg_iovlen = iovcnt; + return tle_tcp_stream_recvmsg(ts, &msg); +} + ssize_t -tle_tcp_stream_readv(struct tle_stream *ts, const struct iovec *iov, - int iovcnt) +tle_tcp_stream_recvmsg(struct tle_stream *ts, struct msghdr *msg) { + size_t sz; int32_t i; uint32_t mn, n, tn; - size_t sz; + uint32_t free_slots; struct tle_tcp_stream *s; struct iovec iv; struct rxq_objs mo[2]; + struct sockaddr_in *addr; + struct sockaddr_in6 *addr6; + const struct iovec *iov = msg->msg_iov; + int iovcnt = msg->msg_iovlen; s = TCP_STREAM(ts); + free_slots = rte_ring_free_count(s->rx.q); + /* get group of packets */ mn = tcp_rxq_get_objs(s, mo); - if (mn == 0) - return 0; + if (mn == 0) { + if (s->tcb.err != 0) + rte_errno = s->tcb.err; + else + rte_errno = EAGAIN; + return -1; + } + + if (!ts->option.timestamp) + ts->timestamp = mo[0].mb[0]->timestamp; + + if (msg->msg_control != NULL) { + if (ts->option.timestamp) + tle_set_timestamp(msg, mo[0].mb[0]); + else + msg->msg_controllen = 0; + } + + if (msg->msg_name != NULL) { + if (s->s.type == TLE_V4) { + addr = (struct sockaddr_in*)msg->msg_name; + addr->sin_family = AF_INET; + addr->sin_addr.s_addr = s->s.ipv4.addr.src; + addr->sin_port = 
s->s.port.src; + msg->msg_namelen = sizeof(struct sockaddr_in); + } else { + addr6 = (struct sockaddr_in6*)msg->msg_name; + addr6->sin6_family = AF_INET6; + rte_memcpy(&addr6->sin6_addr, &s->s.ipv6.addr.src, + sizeof(struct sockaddr_in6)); + addr6->sin6_port = s->s.port.src; + msg->msg_namelen = sizeof(struct sockaddr_in6); + } + } sz = 0; n = 0; @@ -2229,6 +2455,8 @@ tle_tcp_stream_readv(struct tle_stream *ts, const struct iovec *iov, } tcp_rxq_consume(s, tn); + /* update receive window with left recv buffer*/ + s->tcb.rcv.wnd = calc_rx_wnd(s, s->tcb.rcv.wscale); /* * if we still have packets to read, @@ -2238,6 +2466,20 @@ tle_tcp_stream_readv(struct tle_stream *ts, const struct iovec *iov, if (tcp_stream_try_acquire(s) > 0 && s->rx.ev != NULL) tle_event_raise(s->rx.ev); tcp_stream_release(s); + /* if we have received fin, no more data will come, raise err event. */ + } else if (s->tcb.rcv.frs.on == 2) { + if (tcp_stream_try_acquire(s) > 0 && s->err.ev != NULL) + tle_event_raise(s->err.ev); + tcp_stream_release(s); + } + + s->tcb.rcv.cpy += sz; + + /* update recv win to the remote */ + if (free_slots < RECV_WIN_NOTIFY_THRESH && + rte_ring_free_count(s->rx.q) >= RECV_WIN_NOTIFY_THRESH) { + s->tcb.snd.update_rcv = true; + txs_enqueue(s->s.ctx, s); } return sz; @@ -2263,48 +2505,35 @@ tx_segments(struct tle_tcp_stream *s, uint64_t ol_flags, if (i == num) { /* queue packets for further transmission. 
*/ rc = _rte_ring_enqueue_bulk(s->tx.q, (void **)segs, num); - if (rc != 0) + if (rc != 0) { + rc = -EAGAIN; free_mbufs(segs, num); + } } return rc; } -uint16_t -tle_tcp_stream_send(struct tle_stream *ts, struct rte_mbuf *pkt[], uint16_t num) +static inline uint16_t +stream_send(struct tle_tcp_stream *s, struct rte_mbuf *pkt[], + uint16_t num, uint16_t mss, uint64_t ol_flags) { - uint32_t i, j, k, mss, n, state; + uint16_t i, j, k; int32_t rc; - uint64_t ol_flags; - struct tle_tcp_stream *s; + uint32_t n, free_slots; struct rte_mbuf *segs[TCP_MAX_PKT_SEG]; - - s = TCP_STREAM(ts); - - /* mark stream as not closable. */ - if (tcp_stream_acquire(s) < 0) { - rte_errno = EAGAIN; - return 0; - } - - state = s->tcb.state; - if (state != TCP_ST_ESTABLISHED && state != TCP_ST_CLOSE_WAIT) { - rte_errno = ENOTCONN; - tcp_stream_release(s); - return 0; - } - - mss = s->tcb.snd.mss; - ol_flags = s->tx.dst.ol_flags; + int32_t pkt_len; k = 0; rc = 0; + pkt_len = 0; while (k != num) { /* prepare and check for TX */ for (i = k; i != num; i++) { if (pkt[i]->pkt_len > mss || pkt[i]->nb_segs > TCP_MAX_PKT_SEG) break; + pkt_len += pkt[i]->pkt_len; rc = tcp_fill_mbuf(pkt[i], s, &s->tx.dst, ol_flags, s->s.port, 0, TCP_FLAG_ACK, 0, 0); if (rc != 0) @@ -2328,6 +2557,7 @@ tle_tcp_stream_send(struct tle_stream *ts, struct rte_mbuf *pkt[], uint16_t num) pkt[j]->l3_len + pkt[j]->l4_len); pkt[j]->ol_flags &= ol_flags; + pkt_len -= pkt[j]->pkt_len; } break; } @@ -2339,8 +2569,10 @@ tle_tcp_stream_send(struct tle_stream *ts, struct rte_mbuf *pkt[], uint16_t num) /* segment large packet and enqueue for sending */ } else if (i != num) { + free_slots = rte_ring_free_count(s->tx.q); + free_slots = RTE_MIN(free_slots, RTE_DIM(segs)); /* segment the packet. 
*/ - rc = tcp_segmentation(pkt[i], segs, RTE_DIM(segs), + rc = tcp_segmentation(pkt[i], segs, free_slots, &s->tx.dst, mss); if (rc < 0) { rte_errno = -rc; @@ -2351,19 +2583,161 @@ tle_tcp_stream_send(struct tle_stream *ts, struct rte_mbuf *pkt[], uint16_t num) if (rc == 0) { /* free the large mbuf */ rte_pktmbuf_free(pkt[i]); + pkt_len += pkt[i]->pkt_len; /* set the mbuf as consumed */ k++; - } else + } else { /* no space left in tx queue */ + RTE_VERIFY(0); + break; + } + } + } + + s->tcb.snd.waitlen += pkt_len; + return k; +} + +static inline uint16_t +stream_send_tso(struct tle_tcp_stream *s, struct rte_mbuf *pkt[], + uint16_t num, uint16_t mss, uint64_t ol_flags) +{ + uint16_t i, k, nb_segs; + int32_t rc, pkt_len; + uint64_t ol_flags1; + struct rte_mbuf *pre_tail; + + k = 0; + rc = 0; + while (k != num) { + /* Make sure there is at least one slot available */ + if (rte_ring_free_count(s->tx.q) == 0) + break; + + /* prepare and check for TX */ + nb_segs = 0; + pkt_len = 0; + pre_tail = NULL; + for (i = k; i != num; i++) { + if (pkt[i]->nb_segs != 1) + rte_panic("chained mbuf: %p\n", pkt[i]); + /* We shall consider cwnd and snd wnd when limit len */ + if (nb_segs + pkt[i]->nb_segs <= TCP_MAX_PKT_SEG && + pkt_len + pkt[i]->pkt_len <= 65535 - RESERVE_HEADER_LEN) { + nb_segs += pkt[i]->nb_segs; + pkt_len += pkt[i]->pkt_len; + if (pre_tail) + pre_tail->next = pkt[i]; + pre_tail = rte_pktmbuf_lastseg(pkt[i]); + } else { + /* enqueue this one now */ + break; + } + } + + if (unlikely(i == k)) { + /* pkt[k] is a too big packet, now we fall back to + * non-tso send; we can optimize it later by + * splitting the mbuf. 
+ */ + if (stream_send(s, &pkt[k], 1, mss, ol_flags) == 1) { + k++; + continue; + } else break; } + + pkt[k]->nb_segs = nb_segs; + pkt[k]->pkt_len = pkt_len; + + ol_flags1 = ol_flags; + if (pkt_len > mss) + ol_flags1 |= PKT_TX_TCP_SEG; + + rc = tcp_fill_mbuf(pkt[k], s, &s->tx.dst, ol_flags1, + s->s.port, 0, TCP_FLAG_ACK, 0, 0); + if (rc != 0) /* hard to recover */ + rte_panic("failed to fill mbuf: %p\n", pkt[k]); + + /* correct mss */ + pkt[k]->tso_segsz = mss; + + s->tcb.snd.waitlen += pkt_len; + /* We already make sure there is at least one slot */ + if (_rte_ring_enqueue_burst(s->tx.q, (void **)pkt + k, 1) < 1) + RTE_VERIFY(0); + + k = i; + } + + return k; +} + +uint16_t +tle_tcp_stream_send(struct tle_stream *ts, struct rte_mbuf *pkt[], uint16_t num) +{ + uint16_t k, mss, state; + uint64_t ol_flags; + struct tle_tcp_stream *s; + + s = TCP_STREAM(ts); + + if (s->tcb.err != 0) { + rte_errno = s->tcb.err; + return 0; + } + + /* mark stream as not closable. */ + if (tcp_stream_acquire(s) < 0) { + rte_errno = EAGAIN; + return 0; } + state = s->tcb.state; + switch (state) { + case TCP_ST_ESTABLISHED: + case TCP_ST_CLOSE_WAIT: + break; + case TCP_ST_FIN_WAIT_1: + case TCP_ST_FIN_WAIT_2: + case TCP_ST_CLOSING: + case TCP_ST_LAST_ACK: + rte_errno = EPIPE; + tcp_stream_release(s); + return 0; + default: + rte_errno = ENOTCONN; + tcp_stream_release(s); + return 0; + } + + mss = s->tcb.snd.mss; + + ol_flags = s->tx.dst.ol_flags; + + /* Some reference number on the case: + * " - tap - - " + * ~2Gbps with tso disabled; + * ~16Gbps with tso enabled. 
+ */ + if (rte_ring_free_count(s->tx.q) == 0) { + /* Block send may try without waiting for tx event (raised by acked + * data), so here we will still put this stream for further process + */ + txs_enqueue(s->s.ctx, s); + rte_errno = EAGAIN; + k = 0; + } else if (s->tx.dst.dev->prm.tx_offload & DEV_TX_OFFLOAD_TCP_TSO) + k = stream_send_tso(s, pkt, num, mss, ol_flags); + else + k = stream_send(s, pkt, num, mss, ol_flags); + /* notify BE about more data to send */ if (k != 0) txs_enqueue(s->s.ctx, s); + /* if possible, re-arm stream write event. */ - if (rte_ring_free_count(s->tx.q) != 0 && s->tx.ev != NULL) + if (rte_ring_free_count(s->tx.q) && s->tx.ev != NULL && k == num) tle_event_raise(s->tx.ev); tcp_stream_release(s); @@ -2382,9 +2756,15 @@ tle_tcp_stream_writev(struct tle_stream *ts, struct rte_mempool *mp, struct tle_tcp_stream *s; struct iovec iv; struct rte_mbuf *mb[2 * MAX_PKT_BURST]; + uint16_t mss; s = TCP_STREAM(ts); + if (s->tcb.err != 0) { + rte_errno = s->tcb.err; + return -1; + } + /* mark stream as not closable. 
*/ if (tcp_stream_acquire(s) < 0) { rte_errno = EAGAIN; @@ -2392,7 +2772,18 @@ tle_tcp_stream_writev(struct tle_stream *ts, struct rte_mempool *mp, } state = s->tcb.state; - if (state != TCP_ST_ESTABLISHED && state != TCP_ST_CLOSE_WAIT) { + switch (state) { + case TCP_ST_ESTABLISHED: + case TCP_ST_CLOSE_WAIT: + break; + case TCP_ST_FIN_WAIT_1: + case TCP_ST_FIN_WAIT_2: + case TCP_ST_CLOSING: + case TCP_ST_LAST_ACK: + rte_errno = EPIPE; + tcp_stream_release(s); + return -1; + default: rte_errno = ENOTCONN; tcp_stream_release(s); return -1; @@ -2403,11 +2794,24 @@ tle_tcp_stream_writev(struct tle_stream *ts, struct rte_mempool *mp, for (i = 0; i != iovcnt; i++) tsz += iov[i].iov_len; + if (tsz == 0) { + tcp_stream_release(s); + return 0; + } + slen = rte_pktmbuf_data_room_size(mp); - slen = RTE_MIN(slen, s->tcb.snd.mss); + mss = s->tcb.snd.mss; + + slen = RTE_MIN(slen, mss); num = (tsz + slen - 1) / slen; n = rte_ring_free_count(s->tx.q); + + if (n == 0) { + tcp_stream_release(s); + return 0; + } + num = RTE_MIN(num, n); n = RTE_MIN(num, RTE_DIM(mb)); @@ -2451,7 +2855,6 @@ tle_tcp_stream_writev(struct tle_stream *ts, struct rte_mempool *mp, k = 0; if (k != j) { - /* free pkts that were not enqueued */ free_mbufs(mb + k, j - k); @@ -2466,14 +2869,16 @@ tle_tcp_stream_writev(struct tle_stream *ts, struct rte_mempool *mp, } } - if (k != 0) { - + if (k != 0) { /* notify BE about more data to send */ txs_enqueue(s->s.ctx, s); /* if possible, re-arm stream write event. 
*/ if (rte_ring_free_count(s->tx.q) != 0 && s->tx.ev != NULL) tle_event_raise(s->tx.ev); + } else { + rte_errno = EAGAIN; + sz = -1; } tcp_stream_release(s); @@ -2485,7 +2890,7 @@ static inline void tx_data_fin(struct tle_tcp_stream *s, uint32_t tms, uint32_t state) { /* try to send some data */ - tx_nxt_data(s, tms); + uint32_t tn = tx_nxt_data(s, tms); /* we also have to send a FIN */ if (state != TCP_ST_ESTABLISHED && @@ -2495,6 +2900,13 @@ tx_data_fin(struct tle_tcp_stream *s, uint32_t tms, uint32_t state) s->tcb.snd.fss = ++s->tcb.snd.nxt; send_ack(s, tms, TCP_FLAG_FIN | TCP_FLAG_ACK); } + + if (s->tcb.snd.update_rcv) { + if (tn == 0) + send_ack(s, tms, TCP_FLAG_ACK); /* update recv window */ + + s->tcb.snd.update_rcv = false; + } } static inline void @@ -2507,7 +2919,7 @@ tx_stream(struct tle_tcp_stream *s, uint32_t tms) if (state == TCP_ST_SYN_SENT) { /* send the SYN, start the rto timer */ send_ack(s, tms, TCP_FLAG_SYN); - timer_start(s); + timer_start(s, TIMER_RTO, s->tcb.snd.rto); } else if (state >= TCP_ST_ESTABLISHED && state <= TCP_ST_LAST_ACK) { @@ -2515,7 +2927,7 @@ tx_stream(struct tle_tcp_stream *s, uint32_t tms) /* start RTO timer. */ if (s->tcb.snd.nxt != s->tcb.snd.una) - timer_start(s); + timer_start(s, TIMER_RTO, s->tcb.snd.rto); } } @@ -2544,7 +2956,6 @@ rto_stream(struct tle_tcp_stream *s, uint32_t tms) if (s->tcb.snd.nb_retx < s->tcb.snd.nb_retm) { if (state >= TCP_ST_ESTABLISHED && state <= TCP_ST_LAST_ACK) { - /* update SND.CWD and SND.SSTHRESH */ rto_cwnd_update(&s->tcb); @@ -2570,50 +2981,131 @@ rto_stream(struct tle_tcp_stream *s, uint32_t tms) * than one SYN or SYN/ACK retransmissions or true loss * detection has been made. 
*/ - if (s->tcb.snd.nb_retx != 0) + if (s->tcb.snd.nb_retx != 0) { s->tcb.snd.cwnd = s->tcb.snd.mss; + CWND_INFO("synsent", s->tcb.snd.cwnd); + } send_ack(s, tms, TCP_FLAG_SYN); - - } else if (state == TCP_ST_TIME_WAIT) { - stream_term(s); + TCP_INC_STATS(TCP_MIB_RETRANSSEGS); } /* RFC6298:5.5 back off the timer */ s->tcb.snd.rto = rto_roundup(2 * s->tcb.snd.rto); s->tcb.snd.nb_retx++; - timer_restart(s); + timer_restart(s, TIMER_RTO, s->tcb.snd.rto); } else { - send_rst(s, s->tcb.snd.nxt); + if (state == TCP_ST_SYN_SENT) { + if (stream_fill_dest(s) != 0 || + is_broadcast_ether_addr((struct ether_addr *)s->tx.dst.hdr)) + s->tcb.err = EHOSTUNREACH; + else + /* TODO: do we send rst on this */ + s->tcb.err = ENOTCONN; + } else + send_rst(s, s->tcb.snd.una); stream_term(s); } } +static inline void +set_keepalive_timer(struct tle_tcp_stream *s) +{ + if (s->s.option.keepalive) { + if (s->tcb.state == TCP_ST_ESTABLISHED) { + if (s->tcb.snd.nb_keepalive == 0) + timer_reset(s, TIMER_KEEPALIVE, + s->s.option.keepidle * MS_PER_S); + else + timer_reset(s, TIMER_KEEPALIVE, + s->s.option.keepintvl * MS_PER_S); + } + } else { + timer_stop(s, TIMER_KEEPALIVE); + s->tcb.snd.nb_keepalive = 0; + } +} + int tle_tcp_process(struct tle_ctx *ctx, uint32_t num) { - uint32_t i, k, tms; + uint8_t type; + uint32_t i, k; + uint64_t tms; struct sdr *dr; struct tle_timer_wheel *tw; struct tle_stream *p; struct tle_tcp_stream *s, *rs[num]; - /* process streams with RTO exipred */ + tms = tcp_get_tms(ctx->cycles_ms_shift); + /* process streams with RTO exipred */ tw = CTX_TCP_TMWHL(ctx); - tms = tcp_get_tms(ctx->cycles_ms_shift); tle_timer_expire(tw, tms); k = tle_timer_get_expired_bulk(tw, (void **)rs, RTE_DIM(rs)); for (i = 0; i != k; i++) { - - s = rs[i]; - s->timer.handle = NULL; - if (tcp_stream_try_acquire(s) > 0) - rto_stream(s, tms); - tcp_stream_release(s); + s = timer_stream(rs[i]); + type = timer_type(rs[i]); + s->timer.handle[type] = NULL; + + switch (type) { + case TIMER_RTO: + /* FE 
cannot change stream into below states, + * that's why we don't put it into lock + */ + if (s->tcb.state == TCP_ST_TIME_WAIT || + s->tcb.state == TCP_ST_FIN_WAIT_2) { + tcp_stream_down(s); + stream_term(s); + tcp_stream_up(s); + } else if (tcp_stream_acquire(s) > 0) { + /* + * stream may be closed in frontend concurrently. + * if stream has already been closed, it need not + * to retransmit anymore. + */ + if (s->tcb.state != TCP_ST_CLOSED) + rto_stream(s, tms); + tcp_stream_release(s); + } + /* Fail to aquire lock? FE is shutdown or close this + * stream, either FIN or RST needs to be sent, which + * means it's in tsq, will be processed later. + */ + break; + case TIMER_DACK: + if (rte_atomic32_read(&s->tx.arm) == 0 && + s->tcb.rcv.nxt != s->tcb.snd.ack && + tcp_stream_acquire(s) > 0) { + s->s.option.tcpquickack = 8; + send_ack(s, tms, TCP_FLAG_ACK); + tcp_stream_release(s); + } + break; + case TIMER_KEEPALIVE: + if (s->tcb.snd.nb_keepalive < s->s.option.keepcnt) { + if (tcp_stream_try_acquire(s) > 0 && + s->tcb.state == TCP_ST_ESTABLISHED) { + send_keepalive(s); + s->tcb.snd.nb_keepalive++; + timer_start(s, TIMER_KEEPALIVE, + s->s.option.keepintvl * MS_PER_S); + } + tcp_stream_release(s); + } else { + tcp_stream_down(s); + send_rst(s, s->tcb.snd.nxt); + s->tcb.err = ETIMEDOUT; + stream_term(s); + tcp_stream_up(s); + } + break; + default: + rte_panic("Invalid timer type: %d\n", type); + } } /* process streams from to-send queue */ @@ -2621,20 +3113,63 @@ tle_tcp_process(struct tle_ctx *ctx, uint32_t num) k = txs_dequeue_bulk(ctx, rs, RTE_DIM(rs)); for (i = 0; i != k; i++) { - s = rs[i]; - rte_atomic32_set(&s->tx.arm, 0); - if (tcp_stream_try_acquire(s) > 0) + if (s->tcb.uop & TCP_OP_RESET) { + /* already put into death row in close() */ + send_rst(s, s->tcb.snd.nxt); + continue; + } + + if (tcp_stream_acquire(s) > 0) { + if (s->tcb.uop & TCP_OP_KEEPALIVE) { + s->tcb.uop &= ~TCP_OP_KEEPALIVE; + set_keepalive_timer(s); + } + + if (s->tcb.state == TCP_ST_FIN_WAIT_2 
&& + s->tcb.uop & TCP_OP_CLOSE) { + /* This could happen after: + * 1) shutdown; + * 2) FIN sent; + * 3) ack received; + * 4) close; + */ + timer_start(s, TIMER_RTO, s->tcb.snd.rto_fw); + tcp_stream_release(s); + continue; + } + + if (s->tcb.state == TCP_ST_ESTABLISHED && + s->s.option.tcpcork) { + if (s->tcb.snd.cork_ts == 0) + s->tcb.snd.cork_ts = (uint32_t)tms; + + if (s->tcb.snd.waitlen < s->tcb.snd.mss && + (uint32_t)tms - s->tcb.snd.cork_ts < 200) { + txs_enqueue(s->s.ctx, s); + tcp_stream_release(s); + continue; + } + + s->tcb.snd.cork_ts = 0; + } + tx_stream(s, tms); - else + tcp_stream_release(s); + continue; + } + + if (s->tcb.state != TCP_ST_CLOSED) txs_enqueue(s->s.ctx, s); - tcp_stream_release(s); + + /* TCP_ST_CLOSED? See close with TCP_ST_CLOSED state */ } /* collect streams to close from the death row */ dr = CTX_TCP_SDR(ctx); + rte_spinlock_lock(&dr->lock); for (k = 0, p = STAILQ_FIRST(&dr->be); k != num && p != NULL; k++, p = STAILQ_NEXT(p, link)) @@ -2645,9 +3180,21 @@ tle_tcp_process(struct tle_ctx *ctx, uint32_t num) else STAILQ_FIRST(&dr->be) = p; + /* if stream still in tsq, wait one more round */ + for (i = 0; i != k; i++) { + if (rte_atomic32_read(&rs[i]->tx.arm) > 0) { + STAILQ_INSERT_TAIL(&dr->be, &rs[i]->s, link); + rs[i] = NULL; + } + } + + rte_spinlock_unlock(&dr->lock); + /* cleanup closed streams */ for (i = 0; i != k; i++) { s = rs[i]; + if (s == NULL) + continue; tcp_stream_down(s); tcp_stream_reset(ctx, s); } diff --git a/lib/libtle_l4p/tcp_rxtx.h b/lib/libtle_l4p/tcp_rxtx.h new file mode 100644 index 0000000..e7f8e3e --- /dev/null +++ b/lib/libtle_l4p/tcp_rxtx.h @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2016-2017 Intel Corporation. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
/*
 * Number of TCP segments needed to carry plen bytes of payload with the
 * given maximum segment size.
 *
 * Always returns at least 1 (an empty or short payload still occupies one
 * segment). The rounding-up is done with division and remainder rather than
 * (plen + mss - 1) / mss, because that sum wraps around in 32 bits for
 * large plen and would yield a too small (even zero) count.
 *
 * mss must be non-zero whenever plen > mss (i.e. whenever plen != 0 for
 * mss == 0), otherwise the division is undefined.
 */
static inline uint32_t
calc_seg_cnt(uint32_t plen, uint32_t mss)
{
	if (plen <= mss)
		return 1;

	return plen / mss + (plen % mss != 0);
}
*/ + l4h->sent_seq = rte_cpu_to_be_32(seq); + l4h->recv_ack = rte_cpu_to_be_32(tcb->rcv.nxt); + l4h->data_off = hlen / TCP_DATA_ALIGN << TCP_DATA_OFFSET; + l4h->tcp_flags = flags; + l4h->rx_win = rte_cpu_to_be_16(wnd); + l4h->cksum = 0; + l4h->tcp_urp = 0; + + if (flags & TCP_FLAG_SYN) + fill_syn_opts(l4h + 1, &tcb->so); + else if ((flags & TCP_FLAG_RST) == 0 && tcb->so.ts.raw != 0) + fill_tms_opts(l4h + 1, tcb->snd.ts, tcb->rcv.ts); +} + +static inline int +tcp_fill_mbuf(struct rte_mbuf *m, const struct tle_tcp_stream *s, + const struct tle_dest *dst, uint64_t ol_flags, + union l4_ports port, uint32_t seq, uint32_t flags, + uint32_t pid, uint32_t swcsm) +{ + uint32_t l4, len, plen; + struct tcp_hdr *l4h; + char *l2h, *l3; + + len = dst->l2_len + dst->l3_len; + plen = m->pkt_len; + + if (flags & TCP_FLAG_SYN) { + /* basic length */ + l4 = sizeof(*l4h) + TCP_OPT_LEN_MSS; + + /* add wscale space and nop */ + if (s->tcb.so.wscale) { + l4 += TCP_OPT_LEN_WSC + TCP_OPT_LEN_NOP; + } + + /* add timestamp space and nop */ + if (s->tcb.so.ts.raw) { + l4 += TCP_TX_OPT_LEN_TMS; + } + } else if ((flags & TCP_FLAG_RST) == 0 && s->tcb.rcv.ts != 0) { + l4 = sizeof(*l4h) + TCP_TX_OPT_LEN_TMS; + } else { + l4 = sizeof(*l4h); + } + + /* adjust mbuf to put L2/L3/L4 headers into it. */ + l2h = rte_pktmbuf_prepend(m, len + l4); + if (l2h == NULL) + return -EINVAL; + + /* copy L2/L3 header */ + rte_memcpy(l2h, dst->hdr, len); + + /* setup TCP header & options */ + l4h = (struct tcp_hdr *)(l2h + len); + fill_tcph(l4h, &s->tcb, port, seq, l4, flags); + + /* setup mbuf TX offload related fields. */ + m->tx_offload = _mbuf_tx_offload(dst->l2_len, dst->l3_len, l4, 0, 0, 0); + m->ol_flags |= ol_flags; + + /* update proto specific fields. 
*/ + + l3 = l2h + dst->l2_len; + if (((struct ipv4_hdr*)l3)->version_ihl>>4 == 4) { + struct ipv4_hdr *l3h; + l3h = (struct ipv4_hdr *)l3; + l3h->packet_id = rte_cpu_to_be_16(pid); + l3h->total_length = rte_cpu_to_be_16(plen + dst->l3_len + l4); + + if ((ol_flags & PKT_TX_TCP_CKSUM) != 0) + l4h->cksum = _ipv4x_phdr_cksum(l3h, m->l3_len, + ol_flags); + else if (swcsm != 0) + l4h->cksum = _ipv4_udptcp_mbuf_cksum(m, len, l3h); + + if ((ol_flags & PKT_TX_IP_CKSUM) == 0 && swcsm != 0) + l3h->hdr_checksum = _ipv4x_cksum(l3h, m->l3_len); + } else { + struct ipv6_hdr *l3h; + l3h = (struct ipv6_hdr *)l3; + l3h->payload_len = rte_cpu_to_be_16(plen + l4); + if ((ol_flags & PKT_TX_TCP_CKSUM) != 0) + l4h->cksum = rte_ipv6_phdr_cksum(l3h, ol_flags); + else if (swcsm != 0) + l4h->cksum = _ipv6_udptcp_mbuf_cksum(m, len, l3h); + } + + return 0; +} + +static inline int +stream_drb_empty(struct tle_tcp_stream *s) +{ + return rte_ring_empty(s->tx.drb.r); +} + +static inline void +stream_drb_free(struct tle_tcp_stream *s, struct tle_drb *drbs[], + uint32_t nb_drb) +{ + _rte_ring_enqueue_burst(s->tx.drb.r, (void **)drbs, nb_drb); +} + +static inline uint32_t +stream_drb_alloc(struct tle_tcp_stream *s, struct tle_drb *drbs[], + uint32_t nb_drb) +{ + return _rte_ring_dequeue_burst(s->tx.drb.r, (void **)drbs, nb_drb); +} + +/* + * queue standalone packet to he particular output device + * It assumes that: + * - L2/L3/L4 headers should be already set. + * - packet fits into one segment. + */ +static inline int +send_pkt(struct tle_tcp_stream *s, struct tle_dev *dev, struct rte_mbuf *m) +{ + uint32_t n, nb; + struct tle_drb *drb; + + if (stream_drb_alloc(s, &drb, 1) == 0) + return -ENOBUFS; + + /* enqueue pkt for TX. */ + nb = 1; + n = tle_dring_mp_enqueue(&dev->tx.dr, (const void * const*)&m, 1, + &drb, &nb); + + /* free unused drbs. */ + if (nb != 0) + stream_drb_free(s, &drb, 1); + + return (n == 1) ? 
/*
 * Keep only the IP/TCP checksum-offload request bits of a TX ol_flags value.
 * The argument is parenthesized so that an expression such as (a | b) is
 * masked as a whole.
 */
#define TCP_OLFLAGS_CKSUM(flags) \
	((flags) & (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM))
may be not necessary, as we assume + * all streams have been closed and are free. + */ + while (ctx->streams.nb_free--) { + s = STAILQ_FIRST(&ctx->streams.free); + STAILQ_FIRST(&ctx->streams.free) = STAILQ_NEXT(s, link); + fini_stream(TCP_STREAM(s)); + } /* free the timer wheel */ tle_timer_free(ts->tmr); @@ -94,61 +99,100 @@ alloc_ring(uint32_t n, uint32_t flags, int32_t socket) return r; } +/* stream memory layout: + * [tle_tcp_stream] [rx.q] [rx.ofo] [tx.q] [tx.drb.r] + */ static int -init_stream(struct tle_ctx *ctx, struct tle_tcp_stream *s) +add_stream(struct tle_ctx *ctx) { - size_t bsz, rsz, sz; - uint32_t f, i, k, n, nb; + size_t sz_s, sz_rxq, sz_ofo, sz_txq, sz_drb_r, sz; + /* for rx.q */ + uint32_t n_rxq; + /* for rx.ofo */ + struct ofo *ofo; + struct rte_mbuf **obj; + uint32_t ndb, nobj; + size_t dsz, osz; + /* for tx.q */ + uint32_t n_txq; + /* for tx.drb.r */ + size_t bsz, rsz; struct tle_drb *drb; - char name[RTE_RING_NAMESIZE]; - - f = ((ctx->prm.flags & TLE_CTX_FLAG_ST) == 0) ? 0 : - (RING_F_SP_ENQ | RING_F_SC_DEQ); - - /* init RX part. */ - - n = RTE_MAX(ctx->prm.max_stream_rbufs, 1U); - s->rx.q = alloc_ring(n, f | RING_F_SP_ENQ, ctx->prm.socket_id); - if (s->rx.q == NULL) - return -ENOMEM; - - s->rx.ofo = tcp_ofo_alloc(n, ctx->prm.socket_id); - if (s->rx.ofo == NULL) - return -ENOMEM; - - /* init TX part. 
*/ + uint32_t k, nb, n_drb; - n = RTE_MAX(ctx->prm.max_stream_sbufs, 1U); - s->tx.q = alloc_ring(n, f | RING_F_SC_DEQ, ctx->prm.socket_id); - if (s->tx.q == NULL) - return -ENOMEM; + uint32_t f, i; + char name[RTE_RING_NAMESIZE]; + struct tle_tcp_stream *s; + // stream + sz_s = RTE_ALIGN_CEIL(sizeof(*s), RTE_CACHE_LINE_SIZE); + + // rx.q + n_rxq = RTE_MAX(ctx->prm.max_stream_rbufs, 1U); + n_rxq = rte_align32pow2(n_rxq); + sz_rxq = rte_ring_get_memsize(n_rxq); + sz_rxq = RTE_ALIGN_CEIL(sz_rxq, RTE_CACHE_LINE_SIZE); + + // rx.ofo + calc_ofo_elems(n_rxq, &nobj, &ndb); + osz = sizeof(*ofo) + sizeof(ofo->db[0]) * ndb; + dsz = sizeof(ofo->db[0].obj[0]) * nobj * ndb; + sz_ofo = osz + dsz; + sz_ofo = RTE_ALIGN_CEIL(sz_ofo, RTE_CACHE_LINE_SIZE); + + // tx.q + n_txq = RTE_MAX(ctx->prm.max_stream_sbufs, 1U); + n_txq = rte_align32pow2(n_txq); + sz_txq = rte_ring_get_memsize(n_txq); + sz_txq = RTE_ALIGN_CEIL(sz_txq, RTE_CACHE_LINE_SIZE); + + // tx.drb.r nb = drb_nb_elem(ctx); k = calc_stream_drb_num(ctx, nb); - n = rte_align32pow2(k); - - /* size of the drbs ring */ - rsz = rte_ring_get_memsize(n); + n_drb = rte_align32pow2(k); + rsz = rte_ring_get_memsize(n_drb); /* size of the drbs ring */ rsz = RTE_ALIGN_CEIL(rsz, RTE_CACHE_LINE_SIZE); + bsz = tle_drb_calc_size(nb); /* size of the drb. */ + sz_drb_r = rsz + bsz * k; /* total stream drbs size. */ + sz_drb_r = RTE_ALIGN_CEIL(sz_drb_r, RTE_CACHE_LINE_SIZE); - /* size of the drb. */ - bsz = tle_drb_calc_size(nb); - - /* total stream drbs size. 
*/ - sz = rsz + bsz * k; - - s->tx.drb.r = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE, - ctx->prm.socket_id); - if (s->tx.drb.r == NULL) { - TCP_LOG(ERR, "%s(%p): allocation of %zu bytes on socket %d " + sz = sz_s + sz_rxq + sz_ofo + sz_txq + sz_drb_r; + s = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE, + ctx->prm.socket_id); + if (s == NULL) { + TCP_LOG(ERR, "%s: allocation of %zu bytes on socket %d " "failed with error code: %d\n", - __func__, s, sz, ctx->prm.socket_id, rte_errno); + __func__, sz, ctx->prm.socket_id, rte_errno); return -ENOMEM; } - snprintf(name, sizeof(name), "%p@%zu", s, sz); - rte_ring_init(s->tx.drb.r, name, n, f); + s->rx.q = (struct rte_ring *)((uintptr_t)s + sz_s); + s->rx.ofo = (struct ofo *)((uintptr_t)s->rx.q + sz_rxq); + ofo = s->rx.ofo; + s->tx.q = (struct rte_ring *)((uintptr_t)s->rx.ofo + sz_ofo); + s->tx.drb.r = (struct rte_ring *)((uintptr_t)s->tx.q + sz_txq); + // ring flags + f = ((ctx->prm.flags & TLE_CTX_FLAG_ST) == 0) ? 0 : + (RING_F_SP_ENQ | RING_F_SC_DEQ); + + /* init RX part. */ + snprintf(name, sizeof(name), "%p@%zu", s->rx.q, sz_rxq); + rte_ring_init(s->rx.q, name, n_rxq, f); + + obj = (struct rte_mbuf **)&ofo->db[ndb]; + for (i = 0; i != ndb; i++) { + ofo->db[i].nb_max = nobj; + ofo->db[i].obj = obj + i * nobj; + } + ofo->nb_max = ndb; + + /* init TX part. */ + snprintf(name, sizeof(name), "%p@%zu", s->tx.q, sz_txq); + rte_ring_init(s->tx.q, name, n_txq, f); + + snprintf(name, sizeof(name), "%p@%zu", s->tx.drb.r, sz_drb_r); + rte_ring_init(s->tx.drb.r, name, n_drb, f); for (i = 0; i != k; i++) { drb = (struct tle_drb *)((uintptr_t)s->tx.drb.r + rsz + bsz * i); @@ -200,7 +244,7 @@ tcp_init_streams(struct tle_ctx *ctx) f = ((ctx->prm.flags & TLE_CTX_FLAG_ST) == 0) ? 
0 : (RING_F_SP_ENQ | RING_F_SC_DEQ); - sz = sizeof(*ts) + sizeof(ts->s[0]) * ctx->prm.max_streams; + sz = sizeof(*ts); ts = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE, ctx->prm.socket_id); if (ts == NULL) { @@ -210,6 +254,7 @@ tcp_init_streams(struct tle_ctx *ctx) return -ENOMEM; } + rte_spinlock_init(&ts->dr.lock); STAILQ_INIT(&ts->dr.fe); STAILQ_INIT(&ts->dr.be); @@ -228,12 +273,11 @@ tcp_init_streams(struct tle_ctx *ctx) if (ts->tsq == NULL) rc = -ENOMEM; else - rc = stbl_init(&ts->st, ctx->prm.max_streams, - ctx->prm.socket_id); + rc = stbl_init(&ts->st, (ctx->prm.flags & TLE_CTX_FLAG_ST) == 0); } - for (i = 0; rc == 0 && i != ctx->prm.max_streams; i++) - rc = init_stream(ctx, &ts->s[i]); + for (i = 0; rc == 0 && i != ctx->prm.min_streams; i++) + rc = add_stream(ctx); if (rc != 0) { TCP_LOG(ERR, "initalisation of %u-th stream failed", i); @@ -243,11 +287,30 @@ tcp_init_streams(struct tle_ctx *ctx) return rc; } -static void __attribute__((constructor)) +/* + * Note this function is not thread-safe, and we did not lock here as we + * have the assumption that this ctx is dedicated to one thread. + */ +static uint32_t +tcp_more_streams(struct tle_ctx *ctx) +{ + uint32_t i, nb; + uint32_t nb_max = ctx->prm.max_streams - 1; + uint32_t nb_cur = ctx->streams.nb_cur; + + nb = RTE_MIN(ctx->prm.delta_streams, nb_max - nb_cur); + for (i = 0; i < nb; i++) + if (add_stream(ctx) != 0) + break; + return i; +} + +static void __attribute__((constructor(101))) tcp_stream_setup(void) { static const struct stream_ops tcp_ops = { .init_streams = tcp_init_streams, + .more_streams = tcp_more_streams, .fini_streams = tcp_fini_streams, .free_drbs = tcp_free_drbs, }; @@ -305,16 +368,12 @@ tle_tcp_stream_open(struct tle_ctx *ctx, s = (struct tle_tcp_stream *)get_stream(ctx); if (s == NULL) { - rte_errno = ENFILE; - return NULL; - - /* some TX still pending for that stream. 
*/ - } else if (TCP_STREAM_TX_PENDING(s)) { - put_stream(ctx, &s->s, 0); rte_errno = EAGAIN; return NULL; } + s->s.option.raw = prm->option; + /* setup L4 ports and L3 addresses fields. */ rc = stream_fill_ctx(ctx, &s->s, (const struct sockaddr *)&prm->addr.local, @@ -336,12 +395,14 @@ tle_tcp_stream_open(struct tle_ctx *ctx, /* store other params */ s->flags = ctx->prm.flags; + s->tcb.err = 0; s->tcb.snd.nb_retm = (prm->cfg.nb_retries != 0) ? prm->cfg.nb_retries : TLE_TCP_DEFAULT_RETRIES; s->tcb.snd.cwnd = (ctx->prm.icw == 0) ? TCP_INITIAL_CWND_MAX : ctx->prm.icw; s->tcb.snd.rto_tw = (ctx->prm.timewait == TLE_TCP_TIMEWAIT_DEFAULT) ? TCP_RTO_2MSL : ctx->prm.timewait; + s->tcb.snd.rto_fw = TLE_TCP_FINWAIT_TIMEOUT; tcp_stream_up(s); return &s->s; @@ -354,9 +415,16 @@ static inline int stream_close(struct tle_ctx *ctx, struct tle_tcp_stream *s) { uint16_t uop; - uint32_t state; static const struct tle_stream_cb zcb; + /* Put uop operation into this wlock; or it may cause this stream + * to be put into death ring twice, for example: + * 1) FE sets OP_CLOSE; + * 2) BE stream_term sets state as TCP_ST_CLOSED, and put in queue; + * 3) FE down the stream, and calls stream_term again. + */ + tcp_stream_down(s); + /* check was close() already invoked */ uop = s->tcb.uop; if ((uop & TCP_OP_CLOSE) != 0) @@ -366,47 +434,66 @@ stream_close(struct tle_ctx *ctx, struct tle_tcp_stream *s) if (rte_atomic16_cmpset(&s->tcb.uop, uop, uop | TCP_OP_CLOSE) == 0) return -EDEADLK; - /* mark stream as unavaialbe for RX/TX. 
*/ - tcp_stream_down(s); - /* reset events/callbacks */ - s->rx.ev = NULL; s->tx.ev = NULL; + s->rx.ev = NULL; s->err.ev = NULL; s->rx.cb = zcb; s->tx.cb = zcb; s->err.cb = zcb; - state = s->tcb.state; - - /* CLOSED, LISTEN, SYN_SENT - we can close the stream straighway */ - if (state <= TCP_ST_SYN_SENT) { + switch (s->tcb.state) { + case TCP_ST_LISTEN: + /* close the stream straightway */ tcp_stream_reset(ctx, s); return 0; - } - - /* generate FIN and proceed with normal connection termination */ - if (state == TCP_ST_ESTABLISHED || state == TCP_ST_CLOSE_WAIT) { - - /* change state */ - s->tcb.state = (state == TCP_ST_ESTABLISHED) ? - TCP_ST_FIN_WAIT_1 : TCP_ST_LAST_ACK; - - /* mark stream as writable/readable again */ + case TCP_ST_CLOSED: + /* it could be put into this state if a RST packet is + * received, but this stream could be still in tsq trying + * to send something. + */ + /* fallthrough */ + case TCP_ST_SYN_SENT: + /* timer on and could be in tsq (SYN retrans) */ + stream_term(s); + /* fallthrough */ + case TCP_ST_FIN_WAIT_1: + /* fallthrough */ + case TCP_ST_CLOSING: + /* fallthrough */ + case TCP_ST_TIME_WAIT: + /* fallthrough */ + case TCP_ST_LAST_ACK: tcp_stream_up(s); - - /* queue stream into to-send queue */ - txs_enqueue(ctx, s); return 0; + case TCP_ST_ESTABLISHED: + /* fallthrough */ + case TCP_ST_CLOSE_WAIT: + if (s->tcb.state == TCP_ST_ESTABLISHED) { + s->tcb.state = TCP_ST_FIN_WAIT_1; + TCP_DEC_STATS_ATOMIC(TCP_MIB_CURRESTAB); + } else + s->tcb.state = TCP_ST_LAST_ACK; + + if (!rte_ring_empty(s->rx.q)) { + TCP_INC_STATS(TCP_MIB_ESTABRESETS); + s->tcb.uop |= TCP_OP_RESET; + stream_term(s); + } + break; + case TCP_ST_FIN_WAIT_2: + /* Can reach this state if shutdown was called, but the timer + * shall be set after this close. + */ + break; + default: + rte_panic("Invalid state when close: %d\n", s->tcb.state); } - /* - * accroding to the state, close() was already invoked, - * should never that point. 
- */ - RTE_ASSERT(0); - return -EINVAL; + tcp_stream_up(s); + txs_enqueue(ctx, s); + return 0; } uint32_t @@ -452,6 +539,64 @@ tle_tcp_stream_close(struct tle_stream *ts) return stream_close(ctx, s); } +int +tle_tcp_stream_shutdown(struct tle_stream *ts, int how) +{ + int ret; + bool wakeup; + uint32_t state; + struct tle_tcp_stream *s; + + s = TCP_STREAM(ts); + if (ts == NULL || s->s.type >= TLE_VNUM) + return -EINVAL; + + /* Refer to linux/net/ipv4/tcp.c:tcp_shutdown() */ + if (how == SHUT_RD) + return 0; + + tcp_stream_down(s); + + state = s->tcb.state; + + switch (state) { + case TCP_ST_LISTEN: + /* fallthrough */ + case TCP_ST_SYN_SENT: + s->tcb.state = TCP_ST_CLOSED; + wakeup = true; + ret = 0; + break; + case TCP_ST_ESTABLISHED: + /* fallthrough */ + case TCP_ST_CLOSE_WAIT: + if (state == TCP_ST_ESTABLISHED) { + TCP_DEC_STATS_ATOMIC(TCP_MIB_CURRESTAB); + s->tcb.state = TCP_ST_FIN_WAIT_1; + } else + s->tcb.state = TCP_ST_LAST_ACK; + txs_enqueue(ts->ctx, s); + wakeup = true; + ret = 0; + break; + default: + wakeup = false; + rte_errno = ENOTCONN; + ret = -1; + } + + if (wakeup) { + /* Notify other threads which may wait on the event */ + if (s->tx.ev) + tle_event_raise(s->tx.ev); + if (how == SHUT_RDWR && s->err.ev) + tle_event_raise(s->err.ev); + } + + tcp_stream_up(s); + return ret; +} + int tle_tcp_stream_get_addr(const struct tle_stream *ts, struct tle_tcp_stream_addr *addr) @@ -617,3 +762,73 @@ tle_tcp_stream_get_mss(const struct tle_stream * ts) s = TCP_STREAM(ts); return s->tcb.snd.mss; } + +int +tle_tcp_stream_get_info(const struct tle_stream * ts, void *info, socklen_t *optlen) +{ + struct tle_tcp_stream *s; + struct tcp_info i; + + if (ts == NULL) + return -EINVAL; + + s = TCP_STREAM(ts); + + memset(&i, 0, sizeof(struct tcp_info)); + + /* transform from tldk state into linux kernel state */ + switch (s->tcb.state) { + case TCP_ST_CLOSED: + i.tcpi_state = TCP_CLOSE; + break; + case TCP_ST_LISTEN: + i.tcpi_state = TCP_LISTEN; + break; + case 
TCP_ST_SYN_SENT: + i.tcpi_state = TCP_SYN_SENT; + break; + case TCP_ST_SYN_RCVD: + i.tcpi_state = TCP_SYN_RECV; + break; + case TCP_ST_ESTABLISHED: + i.tcpi_state = TCP_ESTABLISHED; + break; + case TCP_ST_FIN_WAIT_1: + i.tcpi_state = TCP_FIN_WAIT1; + break; + case TCP_ST_FIN_WAIT_2: + i.tcpi_state = TCP_FIN_WAIT2; + break; + case TCP_ST_CLOSE_WAIT: + i.tcpi_state = TCP_CLOSE_WAIT; + break; + case TCP_ST_CLOSING: + i.tcpi_state = TCP_CLOSING; + break; + case TCP_ST_LAST_ACK: + i.tcpi_state = TCP_LAST_ACK; + break; + case TCP_ST_TIME_WAIT: + i.tcpi_state = TCP_TIME_WAIT; + break; + } + + /* fix me, total retrans? */ + i.tcpi_total_retrans = s->tcb.snd.nb_retx; + + if (*optlen > sizeof(struct tcp_info)) + *optlen = sizeof(struct tcp_info); + rte_memcpy(info, &i, *optlen); + return 0; +} + +void +tle_tcp_stream_set_keepalive(struct tle_stream *ts) +{ + struct tle_tcp_stream *s; + + s = TCP_STREAM(ts); + + s->tcb.uop |= TCP_OP_KEEPALIVE; + txs_enqueue(ts->ctx, s); +} diff --git a/lib/libtle_l4p/tcp_stream.h b/lib/libtle_l4p/tcp_stream.h index 4629fe6..1202574 100644 --- a/lib/libtle_l4p/tcp_stream.h +++ b/lib/libtle_l4p/tcp_stream.h @@ -17,6 +17,8 @@ #define _TCP_STREAM_H_ #include +#include + #include #include #include @@ -45,23 +47,28 @@ enum { }; enum { - TCP_OP_LISTEN = 0x1, - TCP_OP_ACCEPT = 0x2, - TCP_OP_CONNECT = 0x4, - TCP_OP_CLOSE = 0x8, + TCP_OP_LISTEN = 0x1, + TCP_OP_ACCEPT = 0x2, + TCP_OP_CONNECT = 0x4, + TCP_OP_CLOSE = 0x8, + TCP_OP_RESET = 0x10, + TCP_OP_KEEPALIVE = 0x20 }; struct tcb { + int err; volatile uint16_t state; volatile uint16_t uop; /* operations by user performed */ struct { uint32_t nxt; + uint32_t cpy; /* head of yet unread data */ uint32_t irs; /* initial received sequence */ uint32_t wnd; uint32_t ts; struct { uint32_t seq; - uint32_t on; + uint32_t on; /* on == 1: received an out-of-order fin + * on == 2: received an in order fin */ } frs; uint32_t srtt; /* smoothed round trip time (scaled by >> 3) */ uint32_t rttvar; /* rtt variance */ 
/*
 * Per-stream timer types. A type indexes tle_tcp_stream.timer.handle[] and
 * is also OR-ed into the low bits of the stream pointer handed to the timer
 * wheel (see timer_start()/timer_stream()/timer_type() in tcp_timer.h).
 */
enum {
	TIMER_RTO = 0,        /* retransmission timer */
	TIMER_DACK = 1,       /* delayed ACK timer */
	TIMER_KEEPALIVE = 2,  /* keepalive probe timer */
	TIMER_NUM = 3,        /* number of timer types in use */
	TIMER_MAX_NUM = 8,    /* room reserved for the type in the pointer tag */
	TIMER_MASK = TIMER_MAX_NUM - 1  /* low pointer bits carrying the type */
};
*/ #define TCP_RTO_MAX 60000U /* RFC 6298 (2.5) */ -#define TCP_RTO_MIN 1000U /* RFC 6298 (2.4) */ +#define TCP_RTO_MIN 200U /* Linux/include/net/tcp.h: TCP_RTO_MIN */ #define TCP_RTO_2MSL (2 * TCP_RTO_MAX) -#define TCP_RTO_DEFAULT TCP_RTO_MIN /* RFC 6298 (2.1)*/ +#define TCP_RTO_DEFAULT 1000U /* RFC 6298 (2.1)*/ #define TCP_RTO_GRANULARITY 100U +static inline struct tle_tcp_stream * +timer_stream(struct tle_tcp_stream *s) +{ + return (struct tle_tcp_stream *)((unsigned long)s & (~(unsigned long)TIMER_MASK)); +} + +static inline uint8_t +timer_type(struct tle_tcp_stream *s) +{ + return (uint8_t)((unsigned long)s & (unsigned long)TIMER_MASK); +} static inline void -timer_stop(struct tle_tcp_stream *s) +timer_stop(struct tle_tcp_stream *s, uint8_t type) { struct tle_timer_wheel *tw; - if (s->timer.handle != NULL) { + if (s->timer.handle[type] != NULL) { tw = CTX_TCP_TMWHL(s->s.ctx); - tle_timer_stop(tw, s->timer.handle); - s->timer.handle = NULL; + tle_timer_stop(tw, s->timer.handle[type]); + s->timer.handle[type] = NULL; } } static inline void -timer_start(struct tle_tcp_stream *s) +timer_start(struct tle_tcp_stream *s, uint8_t type, uint32_t timeout) { struct tle_timer_wheel *tw; - if (s->timer.handle == NULL) { + if (s->timer.handle[type] == NULL) { tw = CTX_TCP_TMWHL(s->s.ctx); - s->timer.handle = tle_timer_start(tw, s, s->tcb.snd.rto); - s->tcb.snd.nb_retx = 0; + s->timer.handle[type] = tle_timer_start(tw, (void*)((unsigned long)s | type), timeout); } } static inline void -timer_restart(struct tle_tcp_stream *s) +timer_restart(struct tle_tcp_stream *s, uint8_t type, uint32_t timeout) { struct tle_timer_wheel *tw; tw = CTX_TCP_TMWHL(s->s.ctx); - s->timer.handle = tle_timer_start(tw, s, s->tcb.snd.rto); + s->timer.handle[type] = tle_timer_start(tw, (void*)((unsigned long)s | type), timeout); } @@ -71,10 +81,10 @@ timer_restart(struct tle_tcp_stream *s) * reset number of retransmissions and restart RTO timer. 
/*
 * Re-arm a stream timer: cancel the pending instance of the given type
 * (if any) and start it again with the supplied timeout.
 */
static inline void
timer_reset(struct tle_tcp_stream *s, uint8_t type, uint32_t timeout)
{
	timer_stop(s, type);
	timer_start(s, type, timeout);
}
*/ diff --git a/lib/libtle_l4p/tcp_txq.h b/lib/libtle_l4p/tcp_txq.h index 78f1d56..303b8fd 100644 --- a/lib/libtle_l4p/tcp_txq.h +++ b/lib/libtle_l4p/tcp_txq.h @@ -68,9 +68,28 @@ tcp_txq_set_nxt_head(struct tle_tcp_stream *s, uint32_t num) static inline void tcp_txq_rst_nxt_head(struct tle_tcp_stream *s) { - struct rte_ring *r; + struct rte_ring *r = s->tx.q; + struct rte_mbuf *m; + uint32_t offset, data_len; + + if (s->tcb.snd.nxt_pkt != NULL) { + s->tcb.snd.nxt_offset = 0; + s->tcb.snd.nxt_pkt = NULL; + } + + offset = s->tcb.snd.una_offset; + if (offset) { + m = (struct rte_mbuf *)(_rte_ring_get_data(r)[r->cons.tail & r->mask]); + data_len = m->data_len - PKT_L234_HLEN(m); + while (offset >= data_len) { + offset -= data_len; + m = m->next; + data_len = m->data_len; + } + s->tcb.snd.nxt_pkt = m; + s->tcb.snd.nxt_offset = offset; + } - r = s->tx.q; r->cons.head = r->cons.tail; } @@ -110,9 +129,13 @@ static inline uint32_t txs_dequeue_bulk(struct tle_ctx *ctx, struct tle_tcp_stream *s[], uint32_t num) { struct rte_ring *r; + uint32_t n, i; r = CTX_TCP_TSQ(ctx); - return _rte_ring_dequeue_burst(r, (void **)s, num); + n = _rte_ring_dequeue_burst(r, (void **)s, num); + for (i = 0; i < n; i++) + rte_atomic32_clear(&s[i]->tx.arm); + return n; } #ifdef __cplusplus diff --git a/lib/libtle_l4p/tle_ctx.h b/lib/libtle_l4p/tle_ctx.h index de78a6b..f0efd51 100644 --- a/lib/libtle_l4p/tle_ctx.h +++ b/lib/libtle_l4p/tle_ctx.h @@ -54,6 +54,43 @@ extern "C" { struct tle_ctx; struct tle_dev; +typedef union tle_stream_options { + struct { + uint32_t reuseaddr: 1; + uint32_t reuseport: 1; + uint32_t keepalive: 1; + uint32_t ipv6only: 1; + uint32_t oobinline: 1; + uint32_t tcpcork: 1; + uint32_t tcpnodelay: 1; + uint32_t mulloop: 1; + uint32_t timestamp: 1; + uint32_t reserve: 3; + uint32_t tcpquickack: 4; + uint32_t multtl: 8; + uint32_t keepcnt: 8; + uint16_t keepidle; + uint16_t keepintvl; + }; + uint64_t raw; +} tle_stream_options_t; + +static inline void +tle_set_timestamp(struct 
msghdr *msg, struct rte_mbuf *m) +{ + struct timeval *tv; + struct cmsghdr *cmsg; + + cmsg = CMSG_FIRSTHDR(msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SO_TIMESTAMP; + cmsg->cmsg_len = CMSG_LEN(sizeof(struct timeval)); + msg->msg_controllen = cmsg->cmsg_len; + tv = (struct timeval*)CMSG_DATA(cmsg); + tv->tv_sec = m->timestamp >> 20; + tv->tv_usec = m->timestamp & 0xFFFFFUL; +} + /** * Blocked L4 ports info. */ @@ -112,6 +149,8 @@ struct tle_ctx_param { int32_t socket_id; /**< socket ID to allocate memory for. */ uint32_t proto; /**< L4 proto to handle. */ uint32_t max_streams; /**< max number of streams in context. */ + uint32_t min_streams; /**< min number of streams at init. */ + uint32_t delta_streams; /**< delta of streams of each allocation. */ uint32_t max_stream_rbufs; /**< max recv mbufs per stream. */ uint32_t max_stream_sbufs; /**< max send mbufs per stream. */ uint32_t send_bulk_size; /**< expected # of packets per send call. */ @@ -145,6 +184,8 @@ struct tle_ctx_param { */ #define TLE_TCP_TIMEWAIT_DEFAULT UINT32_MAX +#define TLE_TCP_FINWAIT_TIMEOUT 60000 + /** * create L4 processing context. * @param ctx_prm diff --git a/lib/libtle_l4p/tle_event.h b/lib/libtle_l4p/tle_event.h index d730345..dd7a997 100644 --- a/lib/libtle_l4p/tle_event.h +++ b/lib/libtle_l4p/tle_event.h @@ -43,7 +43,7 @@ struct tle_event { struct tle_evq *head; const void *data; enum tle_ev_state state; -} __rte_cache_aligned; +}; struct tle_evq { rte_spinlock_t lock; diff --git a/lib/libtle_l4p/tle_stats.h b/lib/libtle_l4p/tle_stats.h new file mode 100644 index 0000000..3588c6d --- /dev/null +++ b/lib/libtle_l4p/tle_stats.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TLE_STATS_H +#define TLE_STATS_H + +#include +#include + +/* tcp mib definitions */ +/* + * RFC 1213: MIB-II TCP group + * RFC 2012 (updates 1213): SNMPv2-MIB-TCP + */ +enum +{ + TCP_MIB_RTOALGORITHM, /* RtoAlgorithm */ + TCP_MIB_RTOMIN, /* RtoMin */ + TCP_MIB_RTOMAX, /* RtoMax */ + TCP_MIB_MAXCONN, /* MaxConn */ + TCP_MIB_ACTIVEOPENS, /* ActiveOpens */ + TCP_MIB_PASSIVEOPENS, /* PassiveOpens */ + TCP_MIB_ATTEMPTFAILS, /* AttemptFails */ + TCP_MIB_ESTABRESETS, /* EstabResets */ + TCP_MIB_CURRESTAB, /* CurrEstab */ + TCP_MIB_INSEGS, /* InSegs */ + TCP_MIB_OUTSEGS, /* OutSegs */ + TCP_MIB_RETRANSSEGS, /* RetransSegs */ + TCP_MIB_INERRS, /* InErrs */ + TCP_MIB_OUTRSTS, /* OutRsts */ + TCP_MIB_CSUMERRORS, /* InCsumErrors */ + TCP_MIB_MAX +}; + +/* udp mib definitions */ +/* + * RFC 1213: MIB-II UDP group + * RFC 2013 (updates 1213): SNMPv2-MIB-UDP + */ +enum +{ + UDP_MIB_INDATAGRAMS, /* InDatagrams */ + UDP_MIB_NOPORTS, /* NoPorts */ + UDP_MIB_INERRORS, /* InErrors */ + UDP_MIB_OUTDATAGRAMS, /* OutDatagrams */ + UDP_MIB_RCVBUFERRORS, /* RcvbufErrors */ + UDP_MIB_SNDBUFERRORS, /* SndbufErrors */ + UDP_MIB_CSUMERRORS, /* InCsumErrors */ + UDP_MIB_IGNOREDMULTI, /* IgnoredMulti */ + UDP_MIB_MAX +}; + +struct tcp_mib { + unsigned long mibs[TCP_MIB_MAX]; +}; + +struct udp_mib { + unsigned long mibs[UDP_MIB_MAX]; +}; + +struct tle_mib { + struct tcp_mib tcp; + struct udp_mib udp; +} __rte_cache_aligned; + +extern struct tle_mib default_mib; + +RTE_DECLARE_PER_LCORE(struct tle_mib *, mib); + +#define PERCPU_MIB RTE_PER_LCORE(mib) + +#define 
SNMP_INC_STATS(mib, field) (mib).mibs[field]++ +#define SNMP_DEC_STATS(mib, field) (mib).mibs[field]-- +#define SNMP_ADD_STATS(mib, field, n) (mib).mibs[field] += n +#define SNMP_ADD_STATS_ATOMIC(mib, field, n) \ + rte_atomic64_add((rte_atomic64_t *)(&(mib).mibs[field]), n) + +#define TCP_INC_STATS(field) SNMP_INC_STATS(PERCPU_MIB->tcp, field) +#define TCP_DEC_STATS(field) SNMP_DEC_STATS(PERCPU_MIB->tcp, field) +#define TCP_ADD_STATS(field, n) SNMP_ADD_STATS(PERCPU_MIB->tcp, field, n) +#define TCP_INC_STATS_ATOMIC(field) SNMP_ADD_STATS_ATOMIC(PERCPU_MIB->tcp, field, 1) +#define TCP_DEC_STATS_ATOMIC(field) SNMP_ADD_STATS_ATOMIC(PERCPU_MIB->tcp, field, (-1)) + +#define UDP_INC_STATS(field) SNMP_INC_STATS(PERCPU_MIB->udp, field) +#define UDP_ADD_STATS(field, n) SNMP_ADD_STATS(PERCPU_MIB->udp, field, n) +#define UDP_ADD_STATS_ATOMIC(field, n) \ + SNMP_ADD_STATS_ATOMIC(PERCPU_MIB->udp, field, n) + +#endif /* TLE_STATS_H */ diff --git a/lib/libtle_l4p/tle_tcp.h b/lib/libtle_l4p/tle_tcp.h index b0cbda6..93e853e 100644 --- a/lib/libtle_l4p/tle_tcp.h +++ b/lib/libtle_l4p/tle_tcp.h @@ -49,6 +49,7 @@ struct tle_tcp_stream_cfg { struct tle_tcp_stream_param { struct tle_tcp_stream_addr addr; struct tle_tcp_stream_cfg cfg; + uint64_t option; }; /** @@ -85,6 +86,25 @@ tle_tcp_stream_open(struct tle_ctx *ctx, */ int tle_tcp_stream_close(struct tle_stream *s); +/** + * shutdown an open stream in SHUT_WR way. + * similar to tle_tcp_stream_close(), except: + * - rx still works + * - er still works + * @param s + * Pointer to the stream to close. + * @return + * zero on successful completion. + * - -EINVAL - invalid parameter passed to function + * - -EDEADLK - close was already invoked on that stream + */ +int tle_tcp_stream_shutdown(struct tle_stream *s, int how); + +/** + * Send rst on this stream. + */ +void tle_tcp_stream_kill(struct tle_stream *s); + /** * close a group of open streams. 
* if the stream is in connected state, then: @@ -267,6 +287,15 @@ uint32_t tle_tcp_stream_update_cfg(struct tle_stream *ts[], uint16_t tle_tcp_stream_recv(struct tle_stream *s, struct rte_mbuf *pkt[], uint16_t num); +/** + * Get the number of received bytes held in the recv window. + * @param s + * TCP stream to query. + * @return + * number of bytes of data inside the recv window. + */ +uint16_t tle_tcp_stream_inq(struct tle_stream *s); + /** * Reads iovcnt buffers from the for given TCP stream. * Note that the stream has to be in connected state. @@ -289,6 +318,19 @@ uint16_t tle_tcp_stream_recv(struct tle_stream *s, struct rte_mbuf *pkt[], ssize_t tle_tcp_stream_readv(struct tle_stream *ts, const struct iovec *iov, int iovcnt); +/** + * Like tle_tcp_stream_readv, but with more information returned in msghdr. + * Note that the stream has to be in connected state. + * @param ts + * TCP stream to receive data from. + * @param msg + * If not NULL, generate control message into msg_control field of msg. + * @return + * On success, number of bytes read in the stream receive buffer. + * In case of error, returns -1 and error code will be set in rte_errno. + */ +ssize_t tle_tcp_stream_recvmsg(struct tle_stream *ts, struct msghdr *msg); + /** * Consume and queue up to *num* packets, that will be sent eventually * by tle_tcp_tx_bulk(). @@ -420,6 +462,24 @@ uint16_t tle_tcp_tx_bulk(struct tle_dev *dev, struct rte_mbuf *pkt[], */ int tle_tcp_process(struct tle_ctx *ctx, uint32_t num); +/** + * Get tcp info of a tcp stream. + * This function is not multi-thread safe. + * @param ts + * TCP stream to get info from. + * @param info + * Pointer to store info. + * @param optlen + * Pointer to length of info. + * @return + * zero on successful completion. + * - ENOTCONN - stream is not connected yet. 
+ */ +int +tle_tcp_stream_get_info(const struct tle_stream * ts, void *info, socklen_t *optlen); + +void tle_tcp_stream_set_keepalive(struct tle_stream *ts); + #ifdef __cplusplus } #endif diff --git a/lib/libtle_l4p/tle_udp.h b/lib/libtle_l4p/tle_udp.h index d3a8fe9..640ed64 100644 --- a/lib/libtle_l4p/tle_udp.h +++ b/lib/libtle_l4p/tle_udp.h @@ -35,6 +35,7 @@ struct tle_udp_stream_param { struct tle_event *send_ev; /**< send event to use. */ struct tle_stream_cb send_cb; /**< send callback to use. */ + uint64_t option; }; /** @@ -54,6 +55,36 @@ struct tle_stream * tle_udp_stream_open(struct tle_ctx *ctx, const struct tle_udp_stream_param *prm); +/** + * reconfigure an existing stream within the given UDP context with new parameters. + * @param ts + * stream to set with the new parameters + * @param ctx + * UDP context to set the stream within. + * @param prm + * Parameters used to set the stream. + * @return + * Pointer to UDP stream structure that can be used in future UDP API calls, + * or NULL on error (the stream *ts* is then closed), with error code set in rte_errno. + * Possible rte_errno errors include: + * - EINVAL - invalid parameter passed to function + * - EEXIST - the stream could not be added to the streams table + */ +struct tle_stream * +tle_udp_stream_set(struct tle_stream *ts, struct tle_ctx *ctx, + const struct tle_udp_stream_param *prm); + +/** + * shut down an open stream; *how* is one of SHUT_RD, SHUT_WR, SHUT_RDWR. + * + * @param s + * Pointer to the stream to shut down. + * @return + * zero on successful completion. + * - -EINVAL - invalid *how* value passed to function + */ +int tle_udp_stream_shutdown(struct tle_stream *s, int how); + /** * close an open stream. * All packets still remaining in stream receive buffer will be freed. 
@@ -180,6 +211,24 @@ uint16_t tle_udp_stream_recv(struct tle_stream *s, struct rte_mbuf *pkt[], uint16_t tle_udp_stream_send(struct tle_stream *s, struct rte_mbuf *pkt[], uint16_t num, const struct sockaddr *dst_addr); +/** + * updates notification configuration (associated events and callbacks) + * for the given streams. + * @param ts + * An array of pointers to the streams to update. + * @param prm + * An array of parameters supplying the new events/callbacks for the streams. + * @param num + * Number of elements in the *ts* and *prm* arrays. + * @return + * number of streams successfully updated. + * In case of error, error code set in rte_errno. + * Possible rte_errno errors include: + * - EINVAL - invalid parameter passed to function + */ +uint32_t tle_udp_stream_update_cfg(struct tle_stream *ts[], + struct tle_udp_stream_param prm[], uint32_t num); + #ifdef __cplusplus } #endif diff --git a/lib/libtle_l4p/udp_rxtx.c b/lib/libtle_l4p/udp_rxtx.c index 84a13ea..e9539b9 100644 --- a/lib/libtle_l4p/udp_rxtx.c +++ b/lib/libtle_l4p/udp_rxtx.c @@ -13,7 +13,6 @@ * limitations under the License. 
*/ -#include #include #include #include @@ -24,14 +23,11 @@ #include "misc.h" static inline struct tle_udp_stream * -rx_stream_obtain(struct tle_dev *dev, uint32_t type, uint32_t port) +rx_stream_obtain_by_tuples(struct stbl *st, const union pkt_info *pi) { struct tle_udp_stream *s; - if (type >= TLE_VNUM || dev->dp[type] == NULL) - return NULL; - - s = (struct tle_udp_stream *)dev->dp[type]->streams[port]; + s = UDP_STREAM(stbl_find_stream(st, pi)); if (s == NULL) return NULL; @@ -41,6 +37,24 @@ rx_stream_obtain(struct tle_dev *dev, uint32_t type, uint32_t port) return s; } +static inline struct tle_udp_stream * +rx_stream_obtain(struct tle_dev *dev, uint32_t type, const union pkt_info *pi) +{ + struct tle_udp_stream *s; + + if (type == TLE_V4) + s = bhash_lookup4(dev->ctx->bhash[type], + pi->addr4.dst, pi->port.dst, 1); + else + s = bhash_lookup6(dev->ctx->bhash[type], + pi->addr6->dst, pi->port.dst, 1); + + if (s == NULL || rwl_acquire(&s->rx.use) < 0) + return NULL; + + return s; +} + static inline uint16_t get_pkt_type(const struct rte_mbuf *m) { @@ -57,8 +71,9 @@ get_pkt_type(const struct rte_mbuf *m) } static inline union l4_ports -pkt_info(struct rte_mbuf *m, union l4_ports *ports, union ipv4_addrs *addr4, - union ipv6_addrs **addr6) +pkt_info_udp(struct rte_mbuf *m, union l4_ports *ports, + union ipv4_addrs *addr4, union ipv6_addrs **addr6, + union pkt_info *pi) { uint32_t len; union l4_ports ret, *up; @@ -71,15 +86,20 @@ pkt_info(struct rte_mbuf *m, union l4_ports *ports, union ipv4_addrs *addr4, pa4 = rte_pktmbuf_mtod_offset(m, union ipv4_addrs *, len + offsetof(struct ipv4_hdr, src_addr)); addr4->raw = pa4->raw; + pi->addr4.raw = pa4->raw; + pi->tf.type = TLE_V4; } else if (ret.src == TLE_V6) { *addr6 = rte_pktmbuf_mtod_offset(m, union ipv6_addrs *, len + offsetof(struct ipv6_hdr, src_addr)); + pi->addr6 = *addr6; + pi->tf.type = TLE_V6; } len += m->l3_len; up = rte_pktmbuf_mtod_offset(m, union l4_ports *, len + offsetof(struct udp_hdr, src_port)); 
ports->raw = up->raw; + pi->port.raw = up->raw; ret.dst = ports->dst; return ret; } @@ -96,6 +116,11 @@ rx_stream(struct tle_udp_stream *s, void *mb[], struct rte_mbuf *rp[], r = _rte_ring_enqueue_burst(s->rx.q, mb, num); + if (unlikely(r != num)) { + UDP_ADD_STATS(UDP_MIB_RCVBUFERRORS, num - r); + UDP_ADD_STATS(UDP_MIB_INERRORS, num - r); + } + /* if RX queue was empty invoke user RX notification callback. */ if (s->rx.cb.func != NULL && r != 0 && rte_ring_count(s->rx.q) == r) s->rx.cb.func(s->rx.cb.data, &s->s); @@ -164,28 +189,64 @@ rx_stream4(struct tle_udp_stream *s, struct rte_mbuf *pkt[], return rx_stream(s, mb, rp + k, rc + k, n); } +/* + * Consider 2 UDP pkt_info *equal* if their: + * - types (IPv4/IPv6) + * - TCP src and dst ports + * - IP src and dst addresses + * are equal. + */ +static inline int +udp_pkt_info_bulk_eq(const union pkt_info pi[], uint32_t num) +{ + uint32_t i; + + i = 1; + + if (pi[0].tf.type == TLE_V4) { + while (i != num && pi[i].tf.type == TLE_V4 && + pi[0].port.raw == pi[i].port.raw && + pi[0].addr4.raw == pi[i].addr4.raw) + i++; + } else if (pi[0].tf.type == TLE_V6) { + while (i != num && pi[i].tf.type == TLE_V6 && + pi[0].port.raw == pi[i].port.raw && + ymm_cmp(&pi[0].addr6->raw, + &pi[i].addr6->raw) == 0) + i++; + } + + return i; +} + uint16_t tle_udp_rx_bulk(struct tle_dev *dev, struct rte_mbuf *pkt[], struct rte_mbuf *rp[], int32_t rc[], uint16_t num) { + struct stbl *st; struct tle_udp_stream *s; - uint32_t i, j, k, n, p, t; + uint32_t i, j, k, n, t; union l4_ports tp[num], port[num]; union ipv4_addrs a4[num]; union ipv6_addrs *pa6[num]; + union pkt_info pi[num]; + + st = CTX_UDP_STLB(dev->ctx); for (i = 0; i != num; i++) - tp[i] = pkt_info(pkt[i], &port[i], &a4[i], &pa6[i]); + tp[i] = pkt_info_udp(pkt[i], &port[i], &a4[i], + &pa6[i], &pi[i]); k = 0; for (i = 0; i != num; i = j) { - for (j = i + 1; j != num && tp[j].raw == tp[i].raw; j++) - ; + j = i + udp_pkt_info_bulk_eq(pi + i, num - i); t = tp[i].src; - p = tp[i].dst; - s = 
rx_stream_obtain(dev, t, p); + + s = rx_stream_obtain_by_tuples(st, &pi[i]); + if (s == NULL) + s = rx_stream_obtain(dev, t, &pi[i]); if (s != NULL) { if (t == TLE_V4) @@ -202,6 +263,7 @@ tle_udp_rx_bulk(struct tle_dev *dev, struct rte_mbuf *pkt[], rwl_release(&s->rx.use); } else { + UDP_ADD_STATS(UDP_MIB_NOPORTS, j - i); for (; i != j; i++) { rc[k] = ENOENT; rp[k] = pkt[i]; @@ -262,6 +324,8 @@ tle_udp_tx_bulk(struct tle_dev *dev, struct rte_mbuf *pkt[], uint16_t num) stream_drb_release(s, drb + i, j - i); } + UDP_ADD_STATS(UDP_MIB_OUTDATAGRAMS, n); + return n; } @@ -272,24 +336,18 @@ tle_udp_tx_bulk(struct tle_dev *dev, struct rte_mbuf *pkt[], uint16_t num) static inline uint32_t recv_pkt_process(struct rte_mbuf *m[], uint32_t num, uint32_t type) { - uint32_t i, k; - uint64_t flg[num], ofl[num]; - - for (i = 0; i != num; i++) { - flg[i] = m[i]->ol_flags; - ofl[i] = m[i]->tx_offload; - } + uint32_t i, k, offset; - k = 0; - for (i = 0; i != num; i++) { - - /* drop packets with invalid cksum(s). 
*/ - if (check_pkt_csum(m[i], flg[i], type, IPPROTO_UDP) != 0) { + for (i = 0, k = 0; i != num; i++) { + if (check_pkt_csum(m[i], type, IPPROTO_UDP) != 0) { + UDP_INC_STATS(UDP_MIB_CSUMERRORS); rte_pktmbuf_free(m[i]); m[i] = NULL; k++; - } else - rte_pktmbuf_adj(m[i], _tx_offload_l4_offset(ofl[i])); + } else { + offset = _tx_offload_l4_offset(m[i]->tx_offload); + rte_pktmbuf_adj(m[i], offset); + } } return k; @@ -302,9 +360,25 @@ tle_udp_stream_recv(struct tle_stream *us, struct rte_mbuf *pkt[], uint16_t num) struct tle_udp_stream *s; s = UDP_STREAM(us); + n = 0; + +again: n = _rte_ring_mc_dequeue_burst(s->rx.q, (void **)pkt, num); - if (n == 0) + if (n == 0) { + if (rwl_try_acquire(&s->rx.use) > 0) + rte_errno = EAGAIN; + else + rte_errno = ESHUTDOWN; + rwl_release(&s->rx.use); return 0; + } + + k = recv_pkt_process(pkt, n, s->s.type); + if (unlikely(k)) + UDP_ADD_STATS_ATOMIC(UDP_MIB_CSUMERRORS, k); + n = compress_pkt_list(pkt, n, k); + if (n == 0) + goto again; /* * if we still have packets to read, @@ -316,8 +390,8 @@ tle_udp_stream_recv(struct tle_stream *us, struct rte_mbuf *pkt[], uint16_t num) rwl_release(&s->rx.use); } - k = recv_pkt_process(pkt, n, s->s.type); - return compress_pkt_list(pkt, n, k); + UDP_ADD_STATS_ATOMIC(UDP_MIB_INDATAGRAMS, n); + return n; } static inline int @@ -413,7 +487,7 @@ fragment(struct rte_mbuf *pkt, struct rte_mbuf *frag[], uint32_t num, /* Remove the Ethernet header from the input packet */ rte_pktmbuf_adj(pkt, dst->l2_len); - mtu = dst->mtu - dst->l2_len; + mtu = dst->mtu; /* fragment packet */ if (type == TLE_V4) @@ -475,13 +549,22 @@ queue_pkt_out(struct tle_udp_stream *s, struct tle_dev *dev, nb += nbc; /* no free drbs, can't send anything */ - if (nb == 0) + if (unlikely(nb == 0)) { + if (all_or_nothing) + UDP_ADD_STATS_ATOMIC(UDP_MIB_SNDBUFERRORS, 1); + else + UDP_ADD_STATS_ATOMIC(UDP_MIB_SNDBUFERRORS, nb_pkt); return 0; + } /* not enough free drbs, reduce number of packets to send. 
*/ else if (nb != nbm) { - if (all_or_nothing) + if (all_or_nothing) { + UDP_ADD_STATS_ATOMIC(UDP_MIB_SNDBUFERRORS, 1); return 0; + } + + UDP_ADD_STATS_ATOMIC(UDP_MIB_SNDBUFERRORS, nb_pkt - nb * bsz); nb_pkt = nb * bsz; } @@ -509,12 +592,18 @@ tle_udp_stream_send(struct tle_stream *us, struct rte_mbuf *pkt[], const struct sockaddr_in *d4; const struct sockaddr_in6 *d6; struct tle_udp_stream *s; - const void *da; + const void *sa, *da; union udph udph; struct tle_dest dst; struct tle_drb *drb[num]; + uint8_t ufo; s = UDP_STREAM(us); + if (rwl_acquire(&s->tx.use) < 0) { + rte_errno = EPIPE; /* tx is shutdown */ + return 0; + } + type = s->s.type; /* start filling UDP header. */ @@ -523,7 +612,10 @@ tle_udp_stream_send(struct tle_stream *us, struct rte_mbuf *pkt[], /* figure out what destination addr/port to use. */ if (dst_addr != NULL) { - if (dst_addr->sa_family != s->prm.remote_addr.ss_family) { + if (dst_addr->sa_family != s->prm.remote_addr.ss_family && + (s->prm.remote_addr.ss_family == AF_INET || + !IN6_IS_ADDR_UNSPECIFIED(&s->s.ipv6.addr.dst))) { + rwl_release(&s->tx.use); rte_errno = EINVAL; return 0; } @@ -531,21 +623,28 @@ tle_udp_stream_send(struct tle_stream *us, struct rte_mbuf *pkt[], d4 = (const struct sockaddr_in *)dst_addr; da = &d4->sin_addr; udph.ports.dst = d4->sin_port; + sa = &s->s.ipv4.addr.dst; } else { d6 = (const struct sockaddr_in6 *)dst_addr; da = &d6->sin6_addr; udph.ports.dst = d6->sin6_port; + sa = &s->s.ipv6.addr.dst; } } else { udph.ports.dst = s->s.port.src; - if (type == TLE_V4) + if (type == TLE_V4) { da = &s->s.ipv4.addr.src; - else + sa = &s->s.ipv4.addr.dst; + } + else { da = &s->s.ipv6.addr.src; + sa = &s->s.ipv6.addr.dst; + } } - di = stream_get_dest(&s->s, da, &dst); + di = stream_get_dest(type, &s->s, sa, da, &dst); if (di < 0) { + rwl_release(&s->tx.use); rte_errno = -di; return 0; } @@ -553,12 +652,7 @@ tle_udp_stream_send(struct tle_stream *us, struct rte_mbuf *pkt[], pid = 
rte_atomic32_add_return(&dst.dev->tx.packet_id[type], num) - num; mtu = dst.mtu - dst.l2_len - dst.l3_len; - /* mark stream as not closable. */ - if (rwl_acquire(&s->tx.use) < 0) { - rte_errno = EAGAIN; - return 0; - } - + ufo = dst.dev->prm.tx_offload & DEV_TX_OFFLOAD_UDP_TSO; nb = 0; for (i = 0, k = 0; k != num; k = i) { @@ -568,7 +662,7 @@ tle_udp_stream_send(struct tle_stream *us, struct rte_mbuf *pkt[], ol_flags = dst.dev->tx.ol_flags[type]; while (i != num && frg == 0) { - frg = pkt[i]->pkt_len > mtu; + frg = (!ufo) && pkt[i]->pkt_len > mtu; if (frg != 0) ol_flags &= ~PKT_TX_UDP_CKSUM; rc = udp_fill_mbuf(pkt[i], type, ol_flags, pid + i, diff --git a/lib/libtle_l4p/udp_stream.c b/lib/libtle_l4p/udp_stream.c index 29f5a40..0cd5c27 100644 --- a/lib/libtle_l4p/udp_stream.c +++ b/lib/libtle_l4p/udp_stream.c @@ -43,74 +43,87 @@ fini_stream(struct tle_udp_stream *s) static void udp_fini_streams(struct tle_ctx *ctx) { - uint32_t i; - struct tle_udp_stream *s; + struct udp_streams *us; + struct tle_stream *s; + + us = CTX_UDP_STREAMS(ctx); + if (us != NULL) { + stbl_fini(&us->st); + + while (ctx->streams.nb_free--) { + s = STAILQ_FIRST(&ctx->streams.free); + STAILQ_FIRST(&ctx->streams.free) = STAILQ_NEXT(s, link); + fini_stream(UDP_STREAM(s)); + } - s = ctx->streams.buf; - if (s != NULL) { - for (i = 0; i != ctx->prm.max_streams; i++) - fini_stream(s + i); } - rte_free(s); + rte_free(us); ctx->streams.buf = NULL; STAILQ_INIT(&ctx->streams.free); } +/* stream memory layout: + * [tle_udp_stream] [rx.q] [tx.drb.r] + */ static int -init_stream(struct tle_ctx *ctx, struct tle_udp_stream *s) +add_stream(struct tle_ctx *ctx) { - size_t bsz, rsz, sz; - uint32_t i, k, n, nb; + size_t sz_s, sz_rxq, sz_drb_r, sz; + /* for rx.q */ + uint32_t n_rxq; + /* for tx.drb.r */ + size_t bsz, rsz; struct tle_drb *drb; - char name[RTE_RING_NAMESIZE]; + uint32_t k, nb, n_drb; - /* init RX part. 
*/ - - n = RTE_MAX(ctx->prm.max_stream_rbufs, 1U); - n = rte_align32pow2(n); - sz = rte_ring_get_memsize(n); - - s->rx.q = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE, - ctx->prm.socket_id); - if (s->rx.q == NULL) { - UDP_LOG(ERR, "%s(%p): allocation of %zu bytes on socket %d " - "failed with error code: %d\n", - __func__, s, sz, ctx->prm.socket_id, rte_errno); - return -ENOMEM; - } + uint32_t i, f; + char name[RTE_RING_NAMESIZE]; + struct tle_udp_stream *s; - snprintf(name, sizeof(name), "%p@%zu", s, sz); - rte_ring_init(s->rx.q, name, n, RING_F_SP_ENQ); + // stream + sz_s = RTE_ALIGN_CEIL(sizeof(*s), RTE_CACHE_LINE_SIZE); - /* init TX part. */ + // rx.q + n_rxq = RTE_MAX(ctx->prm.max_stream_rbufs, 1U); + n_rxq = rte_align32pow2(n_rxq); + sz_rxq = rte_ring_get_memsize(n_rxq); + sz_rxq = RTE_ALIGN_CEIL(sz_rxq, RTE_CACHE_LINE_SIZE); + // tx.drb.r nb = drb_nb_elem(ctx); k = calc_stream_drb_num(ctx, nb); - n = rte_align32pow2(k); - - /* size of the drbs ring */ - rsz = rte_ring_get_memsize(n); + n_drb = rte_align32pow2(k); + rsz = rte_ring_get_memsize(n_drb); /* size of the drbs ring */ rsz = RTE_ALIGN_CEIL(rsz, RTE_CACHE_LINE_SIZE); + bsz = tle_drb_calc_size(nb); /* size of the drb. */ + sz_drb_r = rsz + bsz * k; /* total stream drbs size. */ + sz_drb_r = RTE_ALIGN_CEIL(sz_drb_r, RTE_CACHE_LINE_SIZE); - /* size of the drb. */ - bsz = tle_drb_calc_size(nb); - - /* total stream drbs size. 
*/ - sz = rsz + bsz * k; - - s->tx.drb.r = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE, - ctx->prm.socket_id); - if (s->tx.drb.r == NULL) { - UDP_LOG(ERR, "%s(%p): allocation of %zu bytes on socket %d " + sz = sz_s + sz_rxq + sz_drb_r; + s = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE, + ctx->prm.socket_id); + if (s == NULL) { + UDP_LOG(ERR, "%s: allocation of %zu bytes on socket %d " "failed with error code: %d\n", - __func__, s, sz, ctx->prm.socket_id, rte_errno); + __func__, sz, ctx->prm.socket_id, rte_errno); return -ENOMEM; } - snprintf(name, sizeof(name), "%p@%zu", s, sz); - rte_ring_init(s->tx.drb.r, name, n, 0); + s->rx.q = (struct rte_ring *)((uintptr_t)s + sz_s); + s->tx.drb.r = (struct rte_ring *)((uintptr_t)s->rx.q + sz_rxq); + + // ring flags + f = ((ctx->prm.flags & TLE_CTX_FLAG_ST) == 0) ? 0 : + (RING_F_SP_ENQ | RING_F_SC_DEQ); + + /* init RX part. */ + snprintf(name, sizeof(name), "%p@%zu", s->rx.q, sz_rxq); + rte_ring_init(s->rx.q, name, n_rxq, f); + /* init TX part. 
*/ + snprintf(name, sizeof(name), "%p@%zu", s->tx.drb.r, sz_drb_r); + rte_ring_init(s->tx.drb.r, name, n_drb, f); for (i = 0; i != k; i++) { drb = (struct tle_drb *)((uintptr_t)s->tx.drb.r + rsz + bsz * i); @@ -146,38 +159,59 @@ udp_init_streams(struct tle_ctx *ctx) size_t sz; uint32_t i; int32_t rc; - struct tle_udp_stream *s; + struct udp_streams *us; - sz = sizeof(*s) * ctx->prm.max_streams; - s = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE, + sz = sizeof(*us); + us = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE, ctx->prm.socket_id); - if (s == NULL) { + if (us == NULL) { UDP_LOG(ERR, "allocation of %zu bytes on socket %d " "for %u udp_streams failed\n", sz, ctx->prm.socket_id, ctx->prm.max_streams); return -ENOMEM; } - ctx->streams.buf = s; + ctx->streams.buf = us; STAILQ_INIT(&ctx->streams.free); - for (i = 0; i != ctx->prm.max_streams; i++) { - rc = init_stream(ctx, s + i); - if (rc != 0) { - UDP_LOG(ERR, "initalisation of %u-th stream failed", i); - udp_fini_streams(ctx); - return rc; - } + rc = stbl_init(&us->st, (ctx->prm.flags & TLE_CTX_FLAG_ST) == 0); + if (rc < 0) { + UDP_LOG(ERR, "failed to init UDP stbl: rc = %dl\n", rc); + return rc; } - return 0; + for (i = 0; rc == 0 && i != ctx->prm.min_streams; i++) + rc = add_stream(ctx); + + if (rc != 0) { + UDP_LOG(ERR, "initalisation of %u-th stream failed", i); + udp_fini_streams(ctx); + } + + return rc; } -static void __attribute__((constructor)) +static uint32_t +udp_more_streams(struct tle_ctx *ctx) +{ + uint32_t i, nb; + uint32_t nb_max = ctx->prm.max_streams; + uint32_t nb_cur = ctx->streams.nb_cur; + + nb = RTE_MIN(ctx->prm.delta_streams, nb_max - nb_cur); + for (i = 0; i < nb; i++) + if (add_stream(ctx) != 0) + break; + + return i; +} + +static void __attribute__((constructor(101))) udp_stream_setup(void) { static const struct stream_ops udp_ops = { .init_streams = udp_init_streams, + .more_streams = udp_more_streams, .fini_streams = udp_fini_streams, .free_drbs = udp_free_drbs, }; @@ -188,8 
+222,8 @@ udp_stream_setup(void) static inline void stream_down(struct tle_udp_stream *s) { - rwl_down(&s->rx.use); - rwl_down(&s->tx.use); + rwl_try_down(&s->rx.use); + rwl_try_down(&s->tx.use); } static inline void @@ -223,6 +257,59 @@ check_stream_prm(const struct tle_ctx *ctx, return 0; } +struct tle_stream * +tle_udp_stream_set(struct tle_stream *ts, struct tle_ctx *ctx, + const struct tle_udp_stream_param *prm) +{ + struct tle_udp_stream *s; + int32_t rc; + + if (ctx == NULL || prm == NULL || check_stream_prm(ctx, prm) != 0) { + tle_udp_stream_close(ts); + rte_errno = EINVAL; + return NULL; + } + + s = UDP_STREAM(ts); + + /* free stream's destination port */ + rc = stream_clear_ctx(ctx, &s->s); + + if (s->ste) { + stbl_del_stream(CTX_UDP_STLB(ctx), s->ste, ts); + s->ste = NULL; + } + + /* copy input parameters. */ + s->prm = *prm; + s->s.option.raw = prm->option; + + /* setup L4 ports and L3 addresses fields. */ + rc = stream_fill_ctx(ctx, &s->s, + (const struct sockaddr *)&prm->local_addr, + (const struct sockaddr *)&prm->remote_addr); + + if (rc != 0) + goto error; + + /* add stream to the table for non-listen type stream */ + if (!is_empty_addr((const struct sockaddr *)&prm->remote_addr)) { + s->ste = stbl_add_stream(CTX_UDP_STLB(ctx), &s->s); + if (s->ste == NULL) { + rc = EEXIST; + goto error; + } + } + + return &s->s; + +error: + tle_udp_stream_close(ts); + rte_errno = rc; + return NULL; + +} + struct tle_stream * tle_udp_stream_open(struct tle_ctx *ctx, const struct tle_udp_stream_param *prm) @@ -237,42 +324,80 @@ tle_udp_stream_open(struct tle_ctx *ctx, s = (struct tle_udp_stream *)get_stream(ctx); if (s == NULL) { - rte_errno = ENFILE; - return NULL; - - /* some TX still pending for that stream. */ - } else if (UDP_STREAM_TX_PENDING(s)) { - put_stream(ctx, &s->s, 0); rte_errno = EAGAIN; return NULL; } /* copy input parameters. */ s->prm = *prm; + s->s.option.raw = prm->option; /* setup L4 ports and L3 addresses fields. 
*/ rc = stream_fill_ctx(ctx, &s->s, (const struct sockaddr *)&prm->local_addr, (const struct sockaddr *)&prm->remote_addr); - if (rc != 0) { - put_stream(ctx, &s->s, 1); - s = NULL; - rte_errno = rc; - } else { - /* setup stream notification menchanism */ - s->rx.ev = prm->recv_ev; - s->rx.cb = prm->recv_cb; - s->tx.ev = prm->send_ev; - s->tx.cb = prm->send_cb; - - /* mark stream as avaialbe for RX/TX */ - if (s->tx.ev != NULL) - tle_event_raise(s->tx.ev); - stream_up(s); + if (rc != 0) + goto error; + + /* add stream to the table for non-listen type stream */ + if (!is_empty_addr((const struct sockaddr *)&prm->remote_addr)) { + s->ste = stbl_add_stream(CTX_UDP_STLB(ctx), &s->s); + if (s->ste == NULL) { + rc = EEXIST; + goto error; + } } + /* setup stream notification menchanism */ + s->rx.ev = prm->recv_ev; + s->rx.cb = prm->recv_cb; + s->tx.ev = prm->send_ev; + s->tx.cb = prm->send_cb; + + /* mark stream as avaialbe for RX/TX */ + if (s->tx.ev != NULL) + tle_event_raise(s->tx.ev); + stream_up(s); + return &s->s; + +error: + put_stream(ctx, &s->s, 1); + rte_errno = rc; + return NULL; +} + +int +tle_udp_stream_shutdown(struct tle_stream *us, int how) +{ + bool shut_rd = false; + bool shut_wr = false; + struct tle_udp_stream *s = UDP_STREAM(us); + + switch (how) { + case SHUT_RD: + shut_rd = true; + rwl_down(&s->rx.use); + break; + case SHUT_WR: + shut_wr = true; + rwl_down(&s->tx.use); + break; + case SHUT_RDWR: + shut_rd = true; + shut_wr = true; + stream_down(s); + break; + default: + return -EINVAL; + } + + if (shut_rd && s->rx.ev != NULL) + tle_event_raise(s->rx.ev); + if (shut_wr && s->tx.ev != NULL) + tle_event_raise(s->tx.ev); + return 0; } int @@ -312,6 +437,11 @@ tle_udp_stream_close(struct tle_stream *us) /* empty stream's RX queue */ empty_mbuf_ring(s->rx.q); + if (s->ste) { + stbl_del_stream(CTX_UDP_STLB(ctx), s->ste, us); + s->ste = NULL; + } + /* * mark the stream as free again. 
* if there still are pkts queued for TX, @@ -344,3 +474,56 @@ tle_udp_stream_get_param(const struct tle_stream *us, return 0; } + +/* + * helper function, updates stream config + */ +static inline int +stream_update_cfg(struct tle_stream *us, struct tle_udp_stream_param *prm) +{ + struct tle_udp_stream *s; + + s = UDP_STREAM(us); + + /* setup stream notification menchanism */ + s->rx.ev = prm->recv_ev; + s->rx.cb = prm->recv_cb; + s->tx.ev = prm->send_ev; + s->tx.cb = prm->send_cb; + + rte_smp_wmb(); + + /* invoke async notifications, if any */ + if (rte_ring_count(s->rx.q) != 0) { + if (s->rx.ev != NULL) + tle_event_raise(s->rx.ev); + else if (s->rx.cb.func != NULL) + s->rx.cb.func(s->rx.cb.data, &s->s); + } + + /* always ok to write */ + if (s->tx.ev != NULL) + tle_event_raise(s->tx.ev); + else if (s->tx.cb.func != NULL) + s->tx.cb.func(s->tx.cb.data, &s->s); + + return 0; +} + +uint32_t +tle_udp_stream_update_cfg(struct tle_stream *us[], + struct tle_udp_stream_param prm[], uint32_t num) +{ + int32_t rc; + uint32_t i; + + for (i = 0; i != num; i++) { + rc = stream_update_cfg(us[i], &prm[i]); + if (rc != 0) { + rte_errno = -rc; + break; + } + } + + return i; +} diff --git a/lib/libtle_l4p/udp_stream.h b/lib/libtle_l4p/udp_stream.h index a950e56..55a66f8 100644 --- a/lib/libtle_l4p/udp_stream.h +++ b/lib/libtle_l4p/udp_stream.h @@ -24,6 +24,7 @@ #include "osdep.h" #include "ctx.h" #include "stream.h" +#include "stream_table.h" #ifdef __cplusplus extern "C" { @@ -41,6 +42,7 @@ union udph { struct tle_udp_stream { struct tle_stream s; + struct stbl_entry *ste; /* entry in streams table. 
*/ struct { struct rte_ring *q; @@ -63,6 +65,13 @@ struct tle_udp_stream { struct tle_udp_stream_param prm; } __rte_cache_aligned; +struct udp_streams { + struct stbl st; +}; + +#define CTX_UDP_STREAMS(ctx) ((struct udp_streams *)(ctx)->streams.buf) +#define CTX_UDP_STLB(ctx) (&CTX_UDP_STREAMS(ctx)->st) + #define UDP_STREAM(p) \ ((struct tle_udp_stream *)((uintptr_t)(p) - offsetof(struct tle_udp_stream, s))) diff --git a/lib/libtle_timer/timer.c b/lib/libtle_timer/timer.c index 8b89fd6..a0169ef 100644 --- a/lib/libtle_timer/timer.c +++ b/lib/libtle_timer/timer.c @@ -134,6 +134,30 @@ put_timer(struct tle_timer_list *list, struct tle_timer_elmt *e) list->num++; } +static inline struct tle_timer_elmt * +get_free_timer(struct tle_timer_wheel *tw) +{ + unsigned i, n; + struct tle_timer_elmt *e; + + e = LIST_FIRST(&tw->free.head); + if (e == NULL) { + n = 128; + n = RTE_MIN(n, tw->prm.max_timer - tw->free.num); + for (i = 0; i < n; i++) { + e = rte_zmalloc_socket(NULL, sizeof(*e), + sizeof(e), tw->prm.socket_id); + if (e != NULL) + put_timer(&tw->free, e); + else + rte_panic("Failed to allocate timer"); + } + } + + e = get_timer(&tw->free); + return e; +} + static inline void rem_timer(struct tle_timer_list *list, struct tle_timer_elmt *e) { @@ -149,8 +173,6 @@ tle_timer_create(struct tle_timer_wheel_args *prm, uint64_t now) uint32_t i, j; size_t sz; struct tle_timer_wheel *tw; - struct tle_timer_elmt *e; - struct tle_timer_elmt *timers; if (prm == NULL) { rte_errno = -EINVAL; @@ -169,7 +191,7 @@ tle_timer_create(struct tle_timer_wheel_args *prm, uint64_t now) return NULL; } - sz = sizeof(*tw) + prm->max_timer * sizeof(struct tle_timer_elmt); + sz = sizeof(*tw); /* allocate memory */ tw = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE, @@ -182,17 +204,11 @@ tle_timer_create(struct tle_timer_wheel_args *prm, uint64_t now) tw->last_run_time = now; tw->prm = *prm; - timers = (struct tle_timer_elmt *)(tw + 1); /* initialize the lists */ LIST_INIT(&tw->free.head); 
LIST_INIT(&tw->expired.head); - for (i = 0; i < prm->max_timer; i++) { - e = timers + i; - put_timer(&tw->free, e); - } - for (i = 0; i < TW_N_RINGS; i++) for (j = 0; j < TW_SLOTS_PER_RING; j++) LIST_INIT(&tw->w[i][j].head); @@ -223,11 +239,6 @@ tle_timer_start(struct tle_timer_wheel *tw, void *obj, uint64_t interval) return NULL; } - if (tw->free.num == 0) { - rte_errno = ENOMEM; - return NULL; - } - nb_tick = interval / tw->prm.tick_size; fast_ring_index = nb_tick & TW_RING_MASK; @@ -248,7 +259,7 @@ tle_timer_start(struct tle_timer_wheel *tw, void *obj, uint64_t interval) slow_ring_index %= TW_SLOTS_PER_RING; ts = &tw->w[TW_RING_SLOW][slow_ring_index]; - e = get_timer(&tw->free); + e = get_free_timer(tw); e->obj = obj; e->fast_index = fast_ring_index; put_timer(ts, e); @@ -260,7 +271,7 @@ tle_timer_start(struct tle_timer_wheel *tw, void *obj, uint64_t interval) /* Timer expires less than 51.2 seconds from now */ ts = &tw->w[TW_RING_FAST][fast_ring_index]; - e = get_timer(&tw->free); + e = get_free_timer(tw); e->obj = obj; put_timer(ts, e); diff --git a/mk/tle.app.mk b/mk/tle.app.mk index 602b870..14a5c23 100644 --- a/mk/tle.app.mk +++ b/mk/tle.app.mk @@ -13,6 +13,10 @@ EXTLIB_BUILD := y +ifneq ($(HACK_CC),) +CC = $(HACK_CC) +endif + # we must create the output dir first and recall the same Makefile # from this directory ifeq ($(NOT_FIRST_CALL),) diff --git a/mk/tle.lib.mk b/mk/tle.lib.mk index 7455585..302cb60 100644 --- a/mk/tle.lib.mk +++ b/mk/tle.lib.mk @@ -13,6 +13,10 @@ EXTLIB_BUILD := y +ifneq ($(HACK_CC),) +CC = $(HACK_CC) +endif + # we must create the output dir first and recall the same Makefile # from this directory ifeq ($(NOT_FIRST_CALL),) diff --git a/test/Makefile b/test/Makefile index c5cf270..46ac27d 100644 --- a/test/Makefile +++ b/test/Makefile @@ -25,4 +25,8 @@ DIRS-y += dring DIRS-y += gtest DIRS-y += timer +ifeq ($(PACKETDRILL),y) +DIRS-y += packetdrill +endif + include $(TLDK_ROOT)/mk/tle.subdir.mk diff --git a/test/gtest/Makefile 
b/test/gtest/Makefile index e980c23..2f7b800 100644 --- a/test/gtest/Makefile +++ b/test/gtest/Makefile @@ -95,6 +95,7 @@ CXXFLAGS += $(EXTRA_CFLAGS) LDFLAGS += -lstdc++ LDFLAGS += -L$(GMOCK_DIR) -L$(GMOCK_DIR)/../lib -lgmock +LDLIBS += -L$(GMOCK_DIR)/gtest -L$(GMOCK_DIR)/../lib -lgtest LDLIBS += -whole-archive -ltle_l4p -ltle_dring include $(TLDK_ROOT)/mk/tle.cpp-obj.mk diff --git a/test/gtest/test_tle_ctx.cpp b/test/gtest/test_tle_ctx.cpp index b9808ee..bbb36e0 100644 --- a/test/gtest/test_tle_ctx.cpp +++ b/test/gtest/test_tle_ctx.cpp @@ -32,6 +32,7 @@ TEST(ctx_create, create_invalid_socket) memset(&prm, 0, sizeof(prm)); prm.socket_id = SOCKET_ID_ANY; prm.max_streams = 0x10; + prm.min_streams = 0x10; prm.max_stream_rbufs = 0x100; prm.max_stream_sbufs = 0x100; diff --git a/test/gtest/test_tle_tcp_stream.cpp b/test/gtest/test_tle_tcp_stream.cpp index b861049..1538f0b 100644 --- a/test/gtest/test_tle_tcp_stream.cpp +++ b/test/gtest/test_tle_tcp_stream.cpp @@ -86,7 +86,7 @@ TEST_F(test_tle_tcp_stream, tcp_stream_test_open_duplicate_ipv4) stream_dup = tle_tcp_stream_open(ctx, (const struct tle_tcp_stream_param*)&stream_prm); ASSERT_EQ(stream_dup, nullptr); - ASSERT_EQ(rte_errno, EEXIST); + ASSERT_EQ(rte_errno, EADDRINUSE); ret = tle_tcp_stream_close(stream); ASSERT_EQ(ret, 0); @@ -103,7 +103,7 @@ TEST_F(test_tle_tcp_stream, tcp_stream_test_open_duplicate_ipv6) stream_dup = tle_tcp_stream_open(ctx, (const struct tle_tcp_stream_param*)&stream_prm6); ASSERT_EQ(stream_dup, nullptr); - ASSERT_EQ(rte_errno, EEXIST); + ASSERT_EQ(rte_errno, EADDRINUSE); ret = tle_tcp_stream_close(stream6); ASSERT_EQ(ret, 0); diff --git a/test/gtest/test_tle_tcp_stream.h b/test/gtest/test_tle_tcp_stream.h index 2caf2b5..80f0bea 100644 --- a/test/gtest/test_tle_tcp_stream.h +++ b/test/gtest/test_tle_tcp_stream.h @@ -32,6 +32,8 @@ #include "test_common.h" #define MAX_STREAMS 0x100 +#define MIN_STREAMS 0x10 +#define DELTA_STREAMS 0x20 #define MAX_STREAM_RBUFS 0x100 #define MAX_STREAM_SBUFS 0x100 
#define RX_NO_OFFLOAD 0x0 @@ -41,6 +43,8 @@ static struct tle_ctx_param ctx_prm_tmpl = { .socket_id = SOCKET_ID_ANY, .proto = TLE_PROTO_TCP, .max_streams = MAX_STREAMS, + .min_streams = MIN_STREAMS, + .delta_streams = DELTA_STREAMS, .max_stream_rbufs = MAX_STREAM_RBUFS, .max_stream_sbufs = MAX_STREAM_SBUFS, }; diff --git a/test/gtest/test_tle_udp_destroy.cpp b/test/gtest/test_tle_udp_destroy.cpp index 2f26dd8..49306b5 100644 --- a/test/gtest/test_tle_udp_destroy.cpp +++ b/test/gtest/test_tle_udp_destroy.cpp @@ -24,6 +24,7 @@ TEST(udp_destroy_null, udp_destroy_null) TEST_F(udp_destroy, udp_destroy_positive) { int rc; + rte_errno = 0; tle_ctx_destroy(ctx); ASSERT_EQ(rte_errno, 0); } diff --git a/test/gtest/test_tle_udp_stream_gen.cpp b/test/gtest/test_tle_udp_stream_gen.cpp index 0f60b09..1007e4d 100644 --- a/test/gtest/test_tle_udp_stream_gen.cpp +++ b/test/gtest/test_tle_udp_stream_gen.cpp @@ -123,14 +123,13 @@ TEST_P(tle_rx_test, test) /* Receive packets until we reach end on pcap file*/ do { nb_rx = rte_eth_rx_burst(portid, 0, m, BURST_SIZE); - for(auto &d: tp.devs) { - memset(rc, 0, sizeof(int) * BURST_SIZE); - nb_rx_bulk = tle_udp_rx_bulk(d.ptr, m, rp, rc, nb_rx); - d.act_pkts_bulk_rx += nb_rx_bulk; - for(j = 0; j < BURST_SIZE; j++) { - if(rc[j] == ENOENT) - d.act_pkts_enoent += 1; - } + auto &d = tp.devs[0]; + memset(rc, 0, sizeof(int) * BURST_SIZE); + nb_rx_bulk = tle_udp_rx_bulk(d.ptr, m, rp, rc, nb_rx); + d.act_pkts_bulk_rx += nb_rx_bulk; + for(j = 0; j < BURST_SIZE; j++) { + if(rc[j] == ENOENT) + d.act_pkts_enoent += 1; } for(auto &s: tp.streams) { @@ -139,14 +138,12 @@ TEST_P(tle_rx_test, test) } } while (nb_rx > 0); - /* * Verify results - number of rx packets per dev and stream. 
*/ - for(auto &d: tp.devs) { - EXPECT_EQ(d.act_pkts_bulk_rx, d.exp_pkts_bulk_rx); - EXPECT_EQ(d.act_pkts_enoent, d.exp_pkts_enoent); - } + auto &d = tp.devs[0]; + EXPECT_EQ(d.act_pkts_bulk_rx, d.exp_pkts_bulk_rx); + EXPECT_EQ(d.act_pkts_enoent, d.exp_pkts_enoent); for(auto &s: tp.streams) { EXPECT_EQ(s.act_pkts_rx, s.exp_pkts_rx); @@ -257,9 +254,9 @@ test_str * 3 dev, 3 stream per dev, only correct pkts */ "Mixed IPv4+IPv6; Multiple devs with multiple correct streams", { - {"10.0.0.1", "2001::1000",RX_NO_OFFLOAD, TX_NO_OFFLOAD, 300, 0, 600}, - {"20.0.0.1", "2002::1000", RX_NO_OFFLOAD, TX_NO_OFFLOAD, 300, 0, 600}, - {"30.0.0.1", "2003::1000", RX_NO_OFFLOAD, TX_NO_OFFLOAD, 300, 0, 600}, + {"10.0.0.1", "2001::1000",RX_NO_OFFLOAD, TX_NO_OFFLOAD, 900, 0, 0}, + {"20.0.0.1", "2002::1000", RX_NO_OFFLOAD, TX_NO_OFFLOAD, 900, 0, 0}, + {"30.0.0.1", "2003::1000", RX_NO_OFFLOAD, TX_NO_OFFLOAD, 900, 0, 0}, }, { {AF_INET, 10001, 10011, "10.0.0.1", "10.0.0.2", 100, 0}, @@ -268,20 +265,20 @@ test_str {AF_INET, 20001, 20011, "20.0.0.1", "20.0.0.2", 100, 0}, {AF_INET6, 20002, 20012, "2002::1000", "2002::3000", 100, 0}, {AF_INET6, 20003, 20013, "2002::1000", "2002::4000", 100, 0}, - {AF_INET, 20001, 20011, "30.0.0.1", "30.0.0.2", 100, 0}, - {AF_INET6, 20002, 20012, "2003::1000", "2003::3000", 100, 0}, - {AF_INET6, 20003, 20013, "2003::1000", "2003::4000", 100, 0} + {AF_INET, 30001, 30011, "30.0.0.1", "30.0.0.2", 100, 0}, + {AF_INET6, 30002, 30012, "2003::1000", "2003::3000", 100, 0}, + {AF_INET6, 30003, 30013, "2003::1000", "2003::4000", 100, 0} }, { {AF_INET, "10.0.0.2", "10.0.0.1", 10011, 10001, 100, 0, 0, 0}, {AF_INET, "10.0.0.3", "10.0.0.1", 10012, 10002, 100, 0, 0, 0}, {AF_INET, "20.0.0.2", "20.0.0.1", 20011, 20001, 100, 0, 0, 0}, - {AF_INET, "30.0.0.2", "30.0.0.1", 20011, 20001, 100, 0, 0, 0}, + {AF_INET, "30.0.0.2", "30.0.0.1", 30011, 30001, 100, 0, 0, 0}, {AF_INET6, "2001::4000", "2001::1000", 10013, 10003, 100, 0, 0, 0}, {AF_INET6, "2002::3000", "2002::1000", 20012, 20002, 
100, 0, 0, 0}, {AF_INET6, "2002::4000", "2002::1000", 20013, 20003, 100, 0, 0, 0}, - {AF_INET6, "2003::3000", "2003::1000", 20012, 20002, 100, 0, 0, 0}, - {AF_INET6, "2003::4000", "2003::1000", 20013, 20003, 100, 0, 0, 0}, + {AF_INET6, "2003::3000", "2003::1000", 30012, 30002, 100, 0, 0, 0}, + {AF_INET6, "2003::4000", "2003::1000", 30013, 30003, 100, 0, 0, 0}, } } )); @@ -425,20 +422,20 @@ test_str {AF_INET, 20001, 20011, "20.0.0.1", "20.0.0.2", 0, 100}, {AF_INET6, 20002, 20012, "2002::1000", "2002::3000", 0, 100}, {AF_INET6, 20003, 20013, "2002::1000", "2002::4000", 0, 100}, - {AF_INET, 20001, 20011, "30.0.0.1", "30.0.0.2", 0, 100}, - {AF_INET6, 20002, 20012, "2003::1000", "2003::3000", 0, 100}, - {AF_INET6, 20003, 20013, "2003::1000", "2003::4000", 0, 100} + {AF_INET, 30001, 30011, "30.0.0.1", "30.0.0.2", 0, 100}, + {AF_INET6, 30002, 30012, "2003::1000", "2003::3000", 0, 100}, + {AF_INET6, 30003, 30013, "2003::1000", "2003::4000", 0, 100} }, { {AF_INET, "10.0.0.2", "10.0.0.1", 10011, 10001, 100, 0, 0, 0}, {AF_INET, "10.0.0.3", "10.0.0.1", 10012, 10002, 100, 0, 0, 0}, {AF_INET, "20.0.0.2", "20.0.0.1", 20011, 20001, 100, 0, 0, 0}, - {AF_INET, "30.0.0.2", "30.0.0.1", 20011, 20001, 100, 0, 0, 0}, + {AF_INET, "30.0.0.2", "30.0.0.1", 30011, 30001, 100, 0, 0, 0}, {AF_INET6, "2001::4000", "2001::1000", 10013, 10003, 100, 0, 0, 0}, {AF_INET6, "2002::3000", "2002::1000", 20012, 20002, 100, 0, 0, 0}, {AF_INET6, "2002::4000", "2002::1000", 20013, 20003, 100, 0, 0, 0}, - {AF_INET6, "2003::3000", "2003::1000", 20012, 20002, 100, 0, 0, 0}, - {AF_INET6, "2003::4000", "2003::1000", 20013, 20003, 100, 0, 0, 0}, + {AF_INET6, "2003::3000", "2003::1000", 30012, 30002, 100, 0, 0, 0}, + {AF_INET6, "2003::4000", "2003::1000", 30013, 30003, 100, 0, 0, 0}, } } )); diff --git a/test/gtest/test_tle_udp_stream_gen.h b/test/gtest/test_tle_udp_stream_gen.h index 1f3d210..eb92385 100644 --- a/test/gtest/test_tle_udp_stream_gen.h +++ b/test/gtest/test_tle_udp_stream_gen.h @@ -379,6 +379,8 @@ 
test_tle_udp_gen_base::setup_ctx(void) memset(&ctx_prm, 0, sizeof(ctx_prm)); ctx_prm.socket_id = SOCKET_ID_ANY; ctx_prm.max_streams = 0x10; + ctx_prm.min_streams = 0x8; + ctx_prm.delta_streams = 0x8; ctx_prm.max_stream_rbufs = CTX_MAX_RBUFS; ctx_prm.max_stream_sbufs = CTX_MAX_SBUFS; ctx_prm.lookup4 = lookup4_function; diff --git a/test/packetdrill/COPYING b/test/packetdrill/COPYING new file mode 100644 index 0000000..d159169 --- /dev/null +++ b/test/packetdrill/COPYING @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. 
+These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. 
The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. 
+ + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. 
+ +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. 
However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. 
+You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version.
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License.
diff --git a/test/packetdrill/Makefile b/test/packetdrill/Makefile new file mode 100644 index 0000000..1cceb47 --- /dev/null +++ b/test/packetdrill/Makefile @@ -0,0 +1,2 @@ +packetdrill-ext-libs := -lrt -ldl -static -L$(TLDK_ROOT) -Wl,--whole-archive -ltldk -Wl,--no-whole-archive -lnuma -lpthread +include Makefile.common diff --git a/test/packetdrill/Makefile.FreeBSD b/test/packetdrill/Makefile.FreeBSD new file mode 100644 index 0000000..a32f827 --- /dev/null +++ b/test/packetdrill/Makefile.FreeBSD @@ -0,0 +1,2 @@ +packetdrill-ext-libs := -lpthread -lpcap +.include "Makefile.common" diff --git a/test/packetdrill/Makefile.Linux b/test/packetdrill/Makefile.Linux new file mode 100644 index 0000000..0c8b7ac --- /dev/null +++ b/test/packetdrill/Makefile.Linux @@ -0,0 +1,2 @@ +packetdrill-ext-libs := -lpthread -lrt -ldl -static +include Makefile.common diff --git a/test/packetdrill/Makefile.NetBSD b/test/packetdrill/Makefile.NetBSD new file mode 100644 index 0000000..a32f827 --- /dev/null +++ b/test/packetdrill/Makefile.NetBSD @@ -0,0 +1,2 @@ +packetdrill-ext-libs := -lpthread -lpcap +.include "Makefile.common" diff --git a/test/packetdrill/Makefile.OpenBSD b/test/packetdrill/Makefile.OpenBSD new file mode 100644 index 0000000..a32f827 --- /dev/null +++ b/test/packetdrill/Makefile.OpenBSD @@ -0,0 +1,2 @@ +packetdrill-ext-libs := -lpthread -lpcap +.include "Makefile.common" diff --git a/test/packetdrill/Makefile.common b/test/packetdrill/Makefile.common new file mode 100644 index 0000000..b614d08 --- /dev/null +++ b/test/packetdrill/Makefile.common @@ -0,0 +1,63 @@ +all: binaries + +CFLAGS = -g -Wall -Werror + +parser.o: parser.y + bison --output=parser.c --defines=parser.h --report=state parser.y + $(CC) $(CFLAGS) -c parser.c + +lexer.o: lexer.l parser.o + flex -olexer.c lexer.l + $(CC) -O2 -g -Wall -c lexer.c + +packetdrill-lib := \ + checksum.o code.o config.o hash.o hash_map.o ip_address.o ip_prefix.o \ + netdev.o net_utils.o \ + packet.o packet_socket_linux.o 
packet_socket_pcap.o \ + packet_checksum.o packet_parser.o packet_to_string.o \ + symbols_linux.o \ + symbols_freebsd.o \ + symbols_openbsd.o \ + symbols_netbsd.o \ + gre_packet.o icmp_packet.o ip_packet.o tcp_packet.o udp_packet.o \ + mpls_packet.o \ + run.o run_command.o run_packet.o run_system_call.o \ + script.o socket.o system.o \ + tcp_options.o tcp_options_iterator.o tcp_options_to_string.o \ + logging.o types.o lexer.o parser.o \ + fmemopen.o open_memstream.o \ + link_layer.o wire_conn.o wire_protocol.o \ + wire_client.o wire_client_netdev.o \ + wire_server.o wire_server_netdev.o \ + epoll.o pipe.o file.o so_testing.o wrap.o + +packetdrill-objs := packetdrill.o $(packetdrill-lib) + +packetdrill: $(packetdrill-objs) + $(CC) -o packetdrill -g $(packetdrill-objs) $(packetdrill-ext-libs) + +test-bins := checksum_test packet_parser_test packet_to_string_test +tests: $(test-bins) + ./checksum_test + ./packet_parser_test + ./packet_to_string_test + +binaries: packetdrill $(test-bins) + +checksum_test-objs := $(packetdrill-lib) checksum_test.o +checksum_test: $(checksum_test-objs) + $(CC) -o checksum_test $(checksum_test-objs) $(packetdrill-ext-libs) + +packet_parser_test-objs := $(packetdrill-lib) packet_parser_test.o +packet_parser_test: $(packet_parser_test-objs) + $(CC) -o packet_parser_test $(packet_parser_test-objs) \ + $(packetdrill-ext-libs) + +packet_to_string_test-objs := $(packetdrill-lib) packet_to_string_test.o +packet_to_string_test: $(packet_to_string_test-objs) + $(CC) -o packet_to_string_test $(packet_to_string_test-objs) \ + $(packetdrill-ext-libs) + +clean: + /bin/rm -f *.o packetdrill lexer.c parser.c parser.h parser.output \ + $(test-bins) diff --git a/test/packetdrill/README b/test/packetdrill/README new file mode 100644 index 0000000..bfa0a47 --- /dev/null +++ b/test/packetdrill/README @@ -0,0 +1,58 @@ + +packetdrill +=========== + +This directory contains the source code for the packetdrill network +stack testing tool. 
+ +The web site for packetdrill is here: + +https://code.google.com/p/packetdrill/ + + +building +======== + +To build packetdrill, first install flex and bison. + +Then set up the Makefile for your platform: + +# ./configure + +Then build the tool: + +# make + + +running +======= + +Here's a quick example. + +On FreeBSD, OpenBSD, and NetBSD, try: + +# ./packetdrill tests/bsd/fast_retransmit/fr-4pkt-sack-bsd.pkt + +On Linux try: + +# ./packetdrill tests/linux/fast_retransmit/fr-4pkt-sack-linux.pkt + + +license +======= + +The packetdrill tool is released under version 2 of the GPL. See the +COPYING file for full details. + + +discussion and contributions +============================== + +If you have any questions, or code or patches to offer, please join +the packetdrill e-mail list at: + +http://groups.google.com/group/packetdrill + +Contributions of code or tests are both welcomed! + +Enjoy! diff --git a/test/packetdrill/assert.h b/test/packetdrill/assert.h new file mode 100644 index 0000000..9d03822 --- /dev/null +++ b/test/packetdrill/assert.h @@ -0,0 +1,10 @@ +#include + +extern void __attribute__((noreturn)) die(char *format, ...); + +#define assert(expr) \ + do { \ + if (!(expr)) \ + die("assertion %s failed at %s line %d",\ + __STRING(expr), __FILE__, __LINE__);\ + } while (0) diff --git a/test/packetdrill/capability.h b/test/packetdrill/capability.h new file mode 100644 index 0000000..7ec5ee2 --- /dev/null +++ b/test/packetdrill/capability.h @@ -0,0 +1,102 @@ +// Copyright 2011 Google Inc. All Rights Reserved. 
+// Author: willemb@google.com (Will de Bruijn) +// +// POSIX capability support for Linux: simplified libcap +// GPL applies, as this interface was inspired by sys/capability.h + +#ifndef _LINUX_GTESTS_NET_CAPABILITY_H +#define _LINUX_GTESTS_NET_CAPABILITY_H + +#ifdef HAVE_SYS_CAPABILITY_H +#include +#else +#include + +typedef struct __user_cap_data_struct *cap_t; +typedef int cap_value_t; + +typedef enum { + CAP_EFFECTIVE=0, + CAP_PERMITTED=1, + CAP_INHERITABLE=2 +} cap_flag_t; + +typedef enum { + CAP_CLEAR=0, + CAP_SET=1 +} cap_flag_value_t; + +static struct __user_cap_header_struct header = { + .version = _LINUX_CAPABILITY_VERSION_3, + .pid = 0, +}; + +// System calls: implemented in libc +int capset(cap_user_header_t header, cap_user_data_t data); +int capget(cap_user_header_t header, const cap_user_data_t data); + +// Extract a value for one name in one of the capability lists +// only supports flag CAP_EFFECTIVE +static inline int +cap_get_flag(cap_t cap, cap_value_t name, cap_flag_t flag, cap_flag_value_t *val) +{ + assert(flag == CAP_EFFECTIVE); + assert(name < (sizeof(cap->effective) * 8)); + *val = (cap->effective & (1 << name)) ? 
CAP_SET : CAP_CLEAR; + return 0; +} + +// Set the value for a number of names in one of the capability lists +// only supports flag CAP_EFFECTIVE +static inline int +cap_set_flag(cap_t cap, cap_flag_t flag, int num_name, + const cap_value_t *names, cap_flag_value_t val) +{ + int i; + + assert(flag == CAP_EFFECTIVE); + if (val == CAP_SET) + for (i = 0; i < num_name; i++) + cap->effective |= (1 << names[i]); + else + for (i = 0; i < num_name; i++) + cap->effective &= ~(1 << names[i]); + + return 0; +} + +// Get the capability lists from the kernel +static inline cap_t +cap_get_proc(void) +{ + cap_t capabilities = calloc(_LINUX_CAPABILITY_U32S_3, + sizeof(struct __user_cap_data_struct)); + if (capget(&header, capabilities)) { + perror("capget"); + return NULL; + } + + return capabilities; +} + +// Update the capability lists in the kernel +static inline int +cap_set_proc(cap_t capabilities) +{ + if (capset(&header, capabilities)) { + perror("capset"); + return -1; + } + return 0; +} + +// Free a capability list +static inline int +cap_free(void *capabilities) +{ + free(capabilities); + return 0; +} + +#endif /* !HAVE_SYS_CAPABILITY_H */ +#endif /* _LINUX_GTESTS_NET_CAPABILITY_H */ diff --git a/test/packetdrill/checksum.c b/test/packetdrill/checksum.c new file mode 100644 index 0000000..3e549d3 --- /dev/null +++ b/test/packetdrill/checksum.c @@ -0,0 +1,239 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Helpers to calculate IP, TCP, and UDP checksums. + */ + +#include "checksum.h" + +#include "assert.h" + +/* Add bytes in buffer to a running checksum. Returns the new + * intermediate checksum. Use ip_checksum_fold() to convert the + * intermediate checksum to final form. + */ +static u64 ip_checksum_partial(const void *p, size_t len, u64 sum) +{ + /* Main loop: 32 bits at a time. + * We take advantage of intel's ability to do unaligned memory + * accesses with minimal additional cost. Other architectures + * probably want to be more careful here. + */ + const u32 *p32 = (const u32 *)(p); + for (; len >= sizeof(*p32); len -= sizeof(*p32)) + sum += *p32++; + + /* Handle un-32bit-aligned trailing bytes */ + const u16 *p16 = (const u16 *)(p32); + if (len >= 2) { + sum += *p16++; + len -= sizeof(*p16); + } + if (len > 0) { + const u8 *p8 = (const u8 *)(p16); + sum += ntohs(*p8 << 8); /* RFC says pad last byte */ + } + + return sum; +} + +static __be16 ip_checksum_fold(u64 sum) +{ + while (sum & ~0xffffffffULL) + sum = (sum >> 32) + (sum & 0xffffffffULL); + while (sum & 0xffff0000ULL) + sum = (sum >> 16) + (sum & 0xffffULL); + + return ~sum; +} + +static u64 tcp_udp_v4_header_checksum_partial( + struct in_addr src_ip, struct in_addr dst_ip, u8 protocol, u16 len) +{ + /* The IPv4 pseudo-header is defined in RFC 793, Section 3.1. 
*/ + struct ipv4_pseudo_header_t { + /* We use a union here to avoid aliasing issues with gcc -O2 */ + union { + struct header { + struct in_addr src_ip; + struct in_addr dst_ip; + __u8 mbz; + __u8 protocol; + __be16 length; + } __packed fields; + u32 words[3]; + }; + }; + struct ipv4_pseudo_header_t pseudo_header; + assert(sizeof(pseudo_header) == 12); + + /* Fill in the pseudo-header. */ + pseudo_header.fields.src_ip = src_ip; + pseudo_header.fields.dst_ip = dst_ip; + pseudo_header.fields.mbz = 0; + pseudo_header.fields.protocol = protocol; + pseudo_header.fields.length = htons(len); + return ip_checksum_partial(&pseudo_header, sizeof(pseudo_header), 0); +} + +__be16 tcp_udp_v4_checksum(struct in_addr src_ip, struct in_addr dst_ip, + u8 protocol, const void *payload, u16 len) +{ + u64 sum = tcp_udp_v4_header_checksum_partial( + src_ip, dst_ip, protocol, len); + sum = ip_checksum_partial(payload, len, sum); + return ip_checksum_fold(sum); +} + +/* Calculates and returns IPv4 header checksum. */ +__be16 ipv4_checksum(void *ip_header, size_t ip_header_bytes) +{ + return ip_checksum_fold( + ip_checksum_partial(ip_header, ip_header_bytes, 0)); +} + +static u64 tcp_udp_v6_header_checksum_partial( + const struct in6_addr *src_ip, + const struct in6_addr *dst_ip, + u8 protocol, u32 len) +{ + /* The IPv6 pseudo-header is defined in RFC 2460, Section 8.1. */ + struct ipv6_pseudo_header_t { + /* We use a union here to avoid aliasing issues with gcc -O2 */ + union { + struct header { + struct in6_addr src_ip; + struct in6_addr dst_ip; + __be32 length; + __u8 mbz[3]; + __u8 next_header; + } __packed fields; + u32 words[10]; + }; + }; + struct ipv6_pseudo_header_t pseudo_header; + assert(sizeof(pseudo_header) == 40); + + /* Fill in the pseudo-header. 
*/ + pseudo_header.fields.src_ip = *src_ip; + pseudo_header.fields.dst_ip = *dst_ip; + pseudo_header.fields.length = htonl(len); + memset(pseudo_header.fields.mbz, 0, sizeof(pseudo_header.fields.mbz)); + pseudo_header.fields.next_header = protocol; + return ip_checksum_partial(&pseudo_header, sizeof(pseudo_header), 0); +} + +__be16 tcp_udp_v6_checksum(const struct in6_addr *src_ip, + const struct in6_addr *dst_ip, + u8 protocol, const void *payload, u32 len) +{ + u64 sum = tcp_udp_v6_header_checksum_partial( + src_ip, dst_ip, protocol, len); + sum = ip_checksum_partial(payload, len, sum); + return ip_checksum_fold(sum); +} + +#define CRC32C(c, d) (c = (c>>8) ^ crc_c[(c^(d))&0xFF]) + +static u32 crc_c[256] = { + 0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4, + 0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB, + 0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B, + 0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24, + 0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B, + 0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384, + 0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54, + 0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B, + 0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A, + 0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35, + 0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5, + 0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA, + 0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45, + 0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A, + 0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A, + 0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595, + 0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48, + 0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957, + 0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687, + 0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198, + 0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927, + 0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38, + 0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8, + 0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7, + 0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096, + 0xA65C047D, 0x5437877E, 
0x4767748A, 0xB50CF789, + 0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859, + 0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46, + 0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9, + 0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6, + 0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36, + 0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829, + 0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C, + 0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93, + 0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043, + 0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C, + 0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3, + 0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC, + 0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 0x0B21572C, + 0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033, + 0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652, + 0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D, + 0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D, + 0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982, + 0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D, + 0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622, + 0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2, + 0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED, + 0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530, + 0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F, + 0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF, + 0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0, + 0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F, + 0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540, + 0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90, + 0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F, + 0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE, + 0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1, + 0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321, + 0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E, + 0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81, + 0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E, + 0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E, + 0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351 +}; + +__be32 sctp_crc32c(const void *packet, u32 len) +{ + u32 i, crc32c; + 
u8 byte0, byte1, byte2, byte3; + const u8 *buf = (const u8 *)packet; + + crc32c = ~0; + for (i = 0; i < len; i++) + CRC32C(crc32c, buf[i]); + crc32c = ~crc32c; + byte0 = crc32c & 0xff; + byte1 = (crc32c>>8) & 0xff; + byte2 = (crc32c>>16) & 0xff; + byte3 = (crc32c>>24) & 0xff; + crc32c = ((byte0 << 24) | (byte1 << 16) | (byte2 << 8) | byte3); + return htonl(crc32c); +} diff --git a/test/packetdrill/checksum.h b/test/packetdrill/checksum.h new file mode 100644 index 0000000..43681d2 --- /dev/null +++ b/test/packetdrill/checksum.h @@ -0,0 +1,54 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Helpers to calculate IP, TCP, and UDP checksums. + */ + +#ifndef __CHECKSUM_H__ +#define __CHECKSUM_H__ + +#include "types.h" + +#include +#include + +/* IPv4 ... */ + +/* Calculates and returns IPv4 header checksum (in network byte order). */ +extern __be16 ipv4_checksum(void *ip_header, size_t ip_header_bytes); + +/* Calculates TCP or UDP checksum for IPv4 (in network byte order). */ +extern __be16 tcp_udp_v4_checksum(struct in_addr src_ip, struct in_addr dst_ip, + u8 protocol, const void *payload, u16 len); + +/* IPv6 ... 
*/ + +/* Calculates TCP, UDP, or ICMP checksum for IPv6 (in network byte order). */ +extern __be16 tcp_udp_v6_checksum(const struct in6_addr *src_ip, + const struct in6_addr *dst_ip, + u8 protocol, const void *payload, u32 len); + +/* SCTP ... */ + +/* Calculates the CRC32C checksum used by SCTP (in network byte order). */ +extern __be32 sctp_crc32c(const void *packet, u32 len); + +#endif /* __CHECKSUM_H__ */ diff --git a/test/packetdrill/checksum_test.c b/test/packetdrill/checksum_test.c new file mode 100644 index 0000000..08ef2e1 --- /dev/null +++ b/test/packetdrill/checksum_test.c @@ -0,0 +1,140 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Unit test for checksum.c. 
+ */ + +#include "checksum.h" + +#include +#include "assert.h" +#include "ip.h" +#include "ipv6.h" +#include "sctp.h" +#include "tcp.h" + +static void test_tcp_udp_v4_checksum(void) +{ + u8 data[] = { + 0x45, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x00, + 0xff, 0x06, 0xf9, 0x10, 0x01, 0x01, 0x01, 0x01, + 0xc0, 0xa8, 0x00, 0x01, 0x04, 0xd2, 0xeb, 0x35, + 0x00, 0x00, 0x00, 0x00, 0xc6, 0xf0, 0x56, 0x00, + 0xa0, 0x12, 0x16, 0xa0, 0x54, 0x12, 0x00, 0x00, + 0x02, 0x04, 0x05, 0xb4, 0x04, 0x02, 0x08, 0x0a, + 0x00, 0x00, 0x02, 0xbc, 0x00, 0x06, 0x0a, 0xd8, + 0x01, 0x03, 0x03, 0x07, + }; + + struct in_addr src_ip, dst_ip; + struct tcp *tcp = (struct tcp *) (data + sizeof(struct ipv4)); + int len = sizeof(data) - sizeof(struct ipv4); + u16 checksum = 0; + + assert(inet_pton(AF_INET, "1.1.1.1", &src_ip) == 1); + assert(inet_pton(AF_INET, "192.168.0.1", &dst_ip) == 1); + + checksum = + ntohs(tcp_udp_v4_checksum(src_ip, dst_ip, IPPROTO_TCP, tcp, len)); + assert(checksum == 0); + + tcp->check = 0; + checksum = + ntohs(tcp_udp_v4_checksum(src_ip, dst_ip, IPPROTO_TCP, tcp, len)); + assert(checksum == 0x5412); +} + +static void test_tcp_udp_v6_checksum(void) +{ + u8 data[] = { + 0x60, 0x00, 0x00, 0x00, 0x00, 0x20, 0x06, 0xff, + 0x20, 0x01, 0x0d, 0xb8, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, + 0xfd, 0x3d, 0xfa, 0x7b, 0xd1, 0x7d, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, + 0xd3, 0xe2, 0x1f, 0x90, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x80, 0x02, 0x80, 0x18, + 0x06, 0x60, 0x00, 0x00, 0x02, 0x04, 0x03, 0xe8, + 0x04, 0x02, 0x01, 0x01, 0x01, 0x03, 0x03, 0x07, + }; + + struct ipv6 *ipv6 = (struct ipv6 *) (data); + struct tcp *tcp = (struct tcp *) (data + sizeof(struct ipv6)); + int len = sizeof(data) - sizeof(struct ipv6); + u16 checksum = 0; + + checksum = + ntohs(tcp_udp_v6_checksum(&ipv6->src_ip, + &ipv6->dst_ip, + IPPROTO_TCP, tcp, len)); + assert(checksum == 0); + + tcp->check = 0; + checksum = + 
ntohs(tcp_udp_v6_checksum(&ipv6->src_ip, + &ipv6->dst_ip, + IPPROTO_TCP, tcp, len)); + assert(checksum == 0x0660); +} + +static void test_ipv4_checksum(void) +{ + u8 data[] = { + 0x45, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x00, + 0xff, 0x06, 0xf9, 0x10, 0x01, 0x01, 0x01, 0x01, + 0xc0, 0xa8, 0x00, 0x01, + }; + struct ipv4 *ip = (struct ipv4 *) data; + u16 checksum = 0; + + checksum = ntohs(ipv4_checksum(data, sizeof(data))); + assert(checksum == 0); + + ip->check = 0; + checksum = ntohs(ipv4_checksum(data, sizeof(data))); + assert(checksum == 0xf910); +} + +static void test_sctp_crc32c(void) +{ + u8 data[] = { + 0x07, 0xd0, 0xd6, 0x61, 0x11, 0x0c, 0xc5, 0x6c, + 0xda, 0xd7, 0x37, 0x74, 0x06, 0x00, 0x00, 0x0f, + 0x00, 0x0c, 0x00, 0x0b, 0x47, 0x6f, 0x6f, 0x64, + 0x62, 0x79, 0x65, 0x00, + }; + struct sctp_common_header *sctp_common_header; + u32 crc32c; + + sctp_common_header = (struct sctp_common_header *)data; + sctp_common_header->crc32c = 0; + crc32c = ntohl(sctp_crc32c(data, sizeof(data))); + assert(crc32c == 0xdad73774); +} + +int main(void) +{ + test_tcp_udp_v4_checksum(); + test_tcp_udp_v6_checksum(); + test_ipv4_checksum(); + test_sctp_crc32c(); + return 0; +} diff --git a/test/packetdrill/code.c b/test/packetdrill/code.c new file mode 100644 index 0000000..0c38e40 --- /dev/null +++ b/test/packetdrill/code.c @@ -0,0 +1,777 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Implementation for a module to write out post-processing code that + * can run custom programmatic analyses and constraint verification. + */ + +#include "code.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "assert.h" +#include "run.h" +#include "tcp.h" + +/* We emit the following Python preamble at the top of the output + * Python code. It defines a custom exception hook so that when an + * exception is raised (such as a failed assertion) we print the file + * name and line number of the code snippet in the original test + * script that caused the error, not just the file name and line + * number in the generated Python file (which will be meaningless or + * confusing to the user). + */ +const char python_preamble[] = +"import sys\n" +"import traceback\n" +"def excepthook(etype, value, tb):\n" +" sys.stderr.write(\"%s:%d: error in Python code\\n\" %\n" +" (_file, _line))\n" +" traceback.print_exception(etype, value, tb)\n" +"\n" +"sys.excepthook = excepthook\n" +"\n"; + +/* Write out the standard utility routines useful for a given language. */ +static void write_preamble(struct code_state *code) +{ + assert(code->format > FORMAT_NONE); + assert(code->format < FORMAT_NUM_TYPES); + switch (code->format) { + case FORMAT_NONE: + case FORMAT_NUM_TYPES: + assert(!"bad code format type"); + case FORMAT_PYTHON: + fprintf(code->file, "%s\n", python_preamble); + break; + /* omitting default so compiler catches missing cases */ + } +} + +#if HAVE_TCP_INFO + +/* Write out a formatted text representation of an assignment of the + * given value to the given named variable. 
#if HAVE_TCP_INFO

/* Write out a formatted text representation of an assignment of the
 * given value to the given named variable.
 */
static void emit_var(struct code_state *code, const char *name, u64 value)
{
	assert(code->format > FORMAT_NONE);
	assert(code->format < FORMAT_NUM_TYPES);
	switch (code->format) {
	case FORMAT_NONE:
	case FORMAT_NUM_TYPES:
		assert(!"bad code format type");
		/* Do not fall through into a real format if the assert
		 * is compiled out.
		 */
		break;
	case FORMAT_PYTHON:
		/* Cast so the argument matches %llu whatever integer
		 * type u64 is a typedef for.
		 */
		fprintf(code->file, "%s = %llu\n", name,
			(unsigned long long)value);
		break;
	/* omitting default so compiler catches missing cases */
	}
}

/* Write out a newline to terminate a sequence of variable assignments */
static void emit_var_end(struct code_state *code)
{
	fprintf(code->file, "\n");
}

/* Write out a formatted representation of useful symbolic names:
 * each constant is emitted as a variable named after itself.
 */
static void write_symbols(struct code_state *code)
{
#ifdef linux
#define EMIT_SYMBOL(sym) emit_var(code, #sym, sym)
	/* Emit symbolic names for tcpi_state values. */
	EMIT_SYMBOL(TCP_ESTABLISHED);
	EMIT_SYMBOL(TCP_SYN_SENT);
	EMIT_SYMBOL(TCP_SYN_RECV);
	EMIT_SYMBOL(TCP_FIN_WAIT1);
	EMIT_SYMBOL(TCP_FIN_WAIT2);
	EMIT_SYMBOL(TCP_TIME_WAIT);
	EMIT_SYMBOL(TCP_CLOSE);
	EMIT_SYMBOL(TCP_CLOSE_WAIT);
	EMIT_SYMBOL(TCP_LAST_ACK);
	EMIT_SYMBOL(TCP_LISTEN);
	EMIT_SYMBOL(TCP_CLOSING);

	/* Emit symbolic names for tcpi_ca_state values. */
	EMIT_SYMBOL(TCP_CA_Open);
	EMIT_SYMBOL(TCP_CA_Disorder);
	EMIT_SYMBOL(TCP_CA_CWR);
	EMIT_SYMBOL(TCP_CA_Recovery);
	EMIT_SYMBOL(TCP_CA_Loss);

	/* tcpi_options flags */
	EMIT_SYMBOL(TCPI_OPT_TIMESTAMPS);
	EMIT_SYMBOL(TCPI_OPT_WSCALE);
	EMIT_SYMBOL(TCPI_OPT_ECN);
	EMIT_SYMBOL(TCPI_OPT_SYN_DATA);
#undef EMIT_SYMBOL
#endif /* linux */
}

#endif /* HAVE_TCP_INFO */

#ifdef linux

/* Write out a formatted representation of the given tcp_info buffer:
 * the symbolic names first, then one variable per field, each named
 * after the field, then a terminating newline.
 */
static void write_tcp_info(struct code_state *code,
			   const struct _tcp_info *info,
			   int len)
{
#define EMIT_FIELD(field) emit_var(code, #field, info->field)
	assert(len >= sizeof(struct _tcp_info));

	write_symbols(code);

	/* Emit the recorded values of tcpi_foo values. */
	EMIT_FIELD(tcpi_state);
	EMIT_FIELD(tcpi_ca_state);
	EMIT_FIELD(tcpi_retransmits);
	EMIT_FIELD(tcpi_probes);
	EMIT_FIELD(tcpi_backoff);
	EMIT_FIELD(tcpi_options);
	EMIT_FIELD(tcpi_snd_wscale);
	EMIT_FIELD(tcpi_rcv_wscale);
	EMIT_FIELD(tcpi_delivery_rate_app_limited);
	EMIT_FIELD(tcpi_rto);
	EMIT_FIELD(tcpi_ato);
	EMIT_FIELD(tcpi_snd_mss);
	EMIT_FIELD(tcpi_rcv_mss);
	EMIT_FIELD(tcpi_unacked);
	EMIT_FIELD(tcpi_sacked);
	EMIT_FIELD(tcpi_lost);
	EMIT_FIELD(tcpi_retrans);
	EMIT_FIELD(tcpi_fackets);
	EMIT_FIELD(tcpi_last_data_sent);
	EMIT_FIELD(tcpi_last_ack_sent);
	EMIT_FIELD(tcpi_last_data_recv);
	EMIT_FIELD(tcpi_last_ack_recv);
	EMIT_FIELD(tcpi_pmtu);
	EMIT_FIELD(tcpi_rcv_ssthresh);
	EMIT_FIELD(tcpi_rtt);
	EMIT_FIELD(tcpi_rttvar);
	EMIT_FIELD(tcpi_snd_ssthresh);
	EMIT_FIELD(tcpi_snd_cwnd);
	EMIT_FIELD(tcpi_advmss);
	EMIT_FIELD(tcpi_reordering);
	EMIT_FIELD(tcpi_total_retrans);
	EMIT_FIELD(tcpi_pacing_rate);
	EMIT_FIELD(tcpi_max_pacing_rate);
	EMIT_FIELD(tcpi_rcv_rtt);
	EMIT_FIELD(tcpi_rcv_space);
	EMIT_FIELD(tcpi_bytes_acked);
	EMIT_FIELD(tcpi_bytes_received);
	EMIT_FIELD(tcpi_segs_out);
	EMIT_FIELD(tcpi_segs_in);
	EMIT_FIELD(tcpi_notsent_bytes);
	EMIT_FIELD(tcpi_min_rtt);
	EMIT_FIELD(tcpi_data_segs_in);
	EMIT_FIELD(tcpi_data_segs_out);
	EMIT_FIELD(tcpi_delivery_rate);
	EMIT_FIELD(tcpi_busy_time);
	EMIT_FIELD(tcpi_rwnd_limited);
	EMIT_FIELD(tcpi_sndbuf_limited);

	emit_var_end(code);
#undef EMIT_FIELD
}

#endif /* linux */
#ifdef linux

/* Write out a formatted representation of the given _tcp_bbr_info buffer.
 * Emits nothing unless len covers every field through bbr_cwnd_gain.
 */
static void write_tcp_bbr_cc_info(struct code_state *code,
				  const union _tcp_cc_info *info,
				  int len)
{
	const struct _tcp_bbr_info *b = (const struct _tcp_bbr_info *)info;
	u64 bw;

	/* Check for fields in initial BBR release. A negative len is
	 * rejected explicitly: compared directly against the unsigned
	 * size it would convert to a huge value and pass.
	 */
	if (len < 0 ||
	    (size_t)len < (offsetof(struct _tcp_bbr_info, bbr_cwnd_gain) +
			   sizeof(b->bbr_cwnd_gain)))
		return;
	emit_var(code, "bbr_bw_lo", b->bbr_bw_lo);
	emit_var(code, "bbr_bw_hi", b->bbr_bw_hi);
	/* "bbr_bw" is made up for convenience */
	bw = ((u64)b->bbr_bw_hi << 32) + b->bbr_bw_lo;
	emit_var(code, "bbr_bw", bw);
	emit_var(code, "bbr_min_rtt", b->bbr_min_rtt);
	emit_var(code, "bbr_pacing_gain", b->bbr_pacing_gain);
	emit_var(code, "bbr_cwnd_gain", b->bbr_cwnd_gain);
}

/* Write out a formatted representation of the given _tcp_dctcp_info buffer.
 * Emits nothing unless len covers every field through dctcp_ab_tot.
 */
static void write_tcp_dctcp_cc_info(struct code_state *code,
				    const union _tcp_cc_info *info,
				    int len)
{
	const struct _tcp_dctcp_info *d =
		(const struct _tcp_dctcp_info *)info;

	if (len < 0 ||
	    (size_t)len < (offsetof(struct _tcp_dctcp_info, dctcp_ab_tot) +
			   sizeof(d->dctcp_ab_tot)))
		return;
	emit_var(code, "dctcp_enabled", d->dctcp_enabled);
	emit_var(code, "dctcp_ce_state", d->dctcp_ce_state);
	emit_var(code, "dctcp_alpha", d->dctcp_alpha);
	emit_var(code, "dctcp_ab_ecn", d->dctcp_ab_ecn);
	emit_var(code, "dctcp_ab_tot", d->dctcp_ab_tot);
}

/* Write out a formatted representation of the given _tcpvegas_info buffer.
 * Emits nothing unless len covers every field through tcpv_minrtt.
 */
static void write_tcp_vegas_cc_info(struct code_state *code,
				    const union _tcp_cc_info *info,
				    int len)
{
	const struct _tcpvegas_info *v = (const struct _tcpvegas_info *)info;

	if (len < 0 ||
	    (size_t)len < (offsetof(struct _tcpvegas_info, tcpv_minrtt) +
			   sizeof(v->tcpv_minrtt)))
		return;
	emit_var(code, "tcpv_enabled", v->tcpv_enabled);
	emit_var(code, "tcpv_rttcnt", v->tcpv_rttcnt);
	/* NOTE(review): the next two names lack the "tcpv_" prefix of
	 * the fields they report; kept as-is because scripts may read
	 * them -- confirm whether that is intended.
	 */
	emit_var(code, "tcp_rtt", v->tcpv_rtt);
	emit_var(code, "tcp_minrtt", v->tcpv_minrtt);
}

#endif /* linux */
*/ +static void write_tcp_cc_info(struct code_state *code, + const union _tcp_cc_info *info, + int len) +{ + /* getsockopt returns 0 len info if C.C. does not support the opt */ + write_tcp_bbr_cc_info(code, info, len); + write_tcp_dctcp_cc_info(code, info, len); + write_tcp_vegas_cc_info(code, info, len); + emit_var_end(code); +} + +/* Write out a formatted representation of the given mem_info buffer. */ +static void write_so_meminfo(struct code_state *code, + const u32 *mem_info, + int len) +{ + assert(len >= sizeof(u32) * _SK_MEMINFO_VARS); + + emit_var(code, "SK_MEMINFO_RMEM_ALLOC", mem_info[_SK_MEMINFO_RMEM_ALLOC]); + emit_var(code, "SK_MEMINFO_RCVBUF", mem_info[_SK_MEMINFO_RCVBUF]); + emit_var(code, "SK_MEMINFO_WMEM_ALLOC", mem_info[_SK_MEMINFO_WMEM_ALLOC]); + emit_var(code, "SK_MEMINFO_SNDBUF", mem_info[_SK_MEMINFO_SNDBUF]); + emit_var(code, "SK_MEMINFO_FWD_ALLOC", mem_info[_SK_MEMINFO_FWD_ALLOC]); + emit_var(code, "SK_MEMINFO_WMEM_QUEUED", mem_info[_SK_MEMINFO_WMEM_QUEUED]); + emit_var(code, "SK_MEMINFO_OPTMEM", mem_info[_SK_MEMINFO_OPTMEM]); + emit_var(code, "SK_MEMINFO_BACKLOG", mem_info[_SK_MEMINFO_BACKLOG]); + emit_var(code, "SK_MEMINFO_DROPS", mem_info[_SK_MEMINFO_DROPS]); + + emit_var_end(code); +} +#endif /* linux */ + +#if defined(__FreeBSD__) + +/* Write out a formatted representation of the given tcp_info buffer. */ +static void write_tcp_info(struct code_state *code, + const struct _tcp_info *info, + int len) +{ + assert(len >= sizeof(struct _tcp_info)); + + write_symbols(code); + + /* Emit the recorded values of tcpi_foo values. 
*/ + emit_var(code, "tcpi_state", info->tcpi_state); + emit_var(code, "tcpi_options", info->tcpi_options); + emit_var(code, "tcpi_snd_wscale", info->tcpi_snd_wscale); + emit_var(code, "tcpi_rcv_wscale", info->tcpi_rcv_wscale); + emit_var(code, "tcpi_rto", info->tcpi_rto); + emit_var(code, "tcpi_snd_mss", info->tcpi_snd_mss); + emit_var(code, "tcpi_rcv_mss", info->tcpi_rcv_mss); + emit_var(code, "tcpi_last_data_recv", info->tcpi_last_data_recv); + emit_var(code, "tcpi_rtt", info->tcpi_rtt); + emit_var(code, "tcpi_rttvar", info->tcpi_rttvar); + emit_var(code, "tcpi_snd_ssthresh", info->tcpi_snd_ssthresh); + emit_var(code, "tcpi_snd_cwnd", info->tcpi_snd_cwnd); + emit_var(code, "tcpi_rcv_space", info->tcpi_rcv_space); + + /* FreeBSD extensions to tcp_info. */ + emit_var(code, "tcpi_snd_wnd", info->tcpi_snd_wnd); + emit_var(code, "tcpi_snd_bwnd", info->tcpi_snd_bwnd); + emit_var(code, "tcpi_snd_nxt", info->tcpi_snd_nxt); + emit_var(code, "tcpi_rcv_nxt", info->tcpi_rcv_nxt); + emit_var(code, "tcpi_toe_tid", info->tcpi_toe_tid); + emit_var(code, "tcpi_snd_rexmitpack", info->tcpi_snd_rexmitpack); + emit_var(code, "tcpi_rcv_ooopack", info->tcpi_rcv_ooopack); + emit_var(code, "tcpi_snd_zerowin", info->tcpi_snd_zerowin); + + emit_var_end(code); +} + +#endif /* __FreeBSD__ */ + +/* Allocate a new empty struct code_text struct. */ +static struct code_text *text_new(void) +{ + struct code_text *text = calloc(1, sizeof(struct code_text)); + return text; +} + +/* Free the given text struct and all storage to which it points. */ +static void text_free(struct code_text *text) +{ + free(text->text); + free(text->file_name); + free(text); +} + +/* Allocate a new empty struct code_data struct. */ +static struct code_data *data_new(void) +{ + struct code_data *data = calloc(1, sizeof(struct code_data)); + return data; +} + +/* Free the given data and all storage to which it points. 
*/ +static void data_free(struct code_data *data) +{ + free(data->buffer); + free(data); +} + +/* Allocate a new empty fragment. */ +static struct code_fragment *fragment_new(void) +{ + struct code_fragment *fragment = + calloc(1, sizeof(struct code_fragment)); + return fragment; +} + +/* Free the given fragment and all storage to which it points. */ +static void fragment_free(struct code_fragment *fragment) +{ + assert(fragment->type > FRAGMENT_NONE); + assert(fragment->type < FRAGMENT_NUM_TYPES); + switch (fragment->type) { + case FRAGMENT_NONE: + case FRAGMENT_NUM_TYPES: + assert(!"bad code fragment type"); + break; + case FRAGMENT_TEXT: + text_free(fragment->contents.text); + break; + case FRAGMENT_DATA: + data_free(fragment->contents.data); + break; + /* omitting default so compiler catches missing cases */ + } + free(fragment); +} + +/* Write out the text to the given file. */ +static void write_text(struct code_state *code, struct code_text *text) +{ + assert(code->format > FORMAT_NONE); + assert(code->format < FORMAT_NUM_TYPES); + switch (code->format) { + case FORMAT_NONE: + case FORMAT_NUM_TYPES: + assert(!"bad code format type"); + case FORMAT_PYTHON: + fprintf(code->file, + "_file = '%s'\n" + "_line = %d\n" + "%s\n\n", + text->file_name, text->line_number, text->text); + break; + /* omitting default so compiler catches missing cases */ + } +} + +/* Write out a textual representation of the data to the given file. 
*/ +static void write_data(struct code_state *code, struct code_data *data) +{ + assert(data->type > DATA_NONE); + assert(data->type < DATA_NUM_TYPES); + switch (data->type) { + case DATA_NONE: + case DATA_NUM_TYPES: + assert(!"bad data type"); + break; +#if HAVE_TCP_INFO + case DATA_TCP_INFO: + write_tcp_info(code, data->buffer, data->len); + break; +#endif /* HAVE_TCP_INFO */ +#if HAVE_TCP_CC_INFO + case DATA_TCP_CC_INFO: + write_tcp_cc_info(code, data->buffer, data->len); + break; +#endif /* HAVE_TCP_CC_INFO */ +#if HAVE_SO_MEMINFO + case DATA_SO_MEMINFO: + write_so_meminfo(code, data->buffer, data->len); + break; +#endif /* HAVE_SO_MEMINFO */ + /* omitting default so compiler catches missing cases */ + } +} + +/* Write out a textual representation of the fragment to the given file. */ +static void write_fragment(struct code_state *code, + struct code_fragment *fragment) +{ + assert(fragment->type > FRAGMENT_NONE); + assert(fragment->type < FRAGMENT_NUM_TYPES); + switch (fragment->type) { + case FRAGMENT_NONE: + case FRAGMENT_NUM_TYPES: + assert(!"bad code fragment type"); + break; + case FRAGMENT_TEXT: + write_text(code, fragment->contents.text); + break; + case FRAGMENT_DATA: + write_data(code, fragment->contents.data); + break; + /* omitting default so compiler catches missing cases */ + } +} + +/* Format and write out all the code fragments. */ +static void write_all_fragments(struct code_state *code) +{ + struct code_fragment *fragment = NULL; + for (fragment = code->list_head; fragment != NULL; + fragment = fragment->next) { + write_fragment(code, fragment); + } +} + +/* Append the code fragment to the end of the list of code fragments. */ +static void append_fragment(struct code_state *code, + struct code_fragment *fragment) +{ + *(code->list_tail) = fragment; + code->list_tail = &(fragment->next); +} + +/* Append a literal ASCII text code snippet that we should emit. + * Takes ownership of the malloc-allocated text memory and frees it. 
+ */ +static void append_text(struct code_state *code, + const char *file_name, int line_number, + char *text_buffer) +{ + struct code_text *text = text_new(); + text->text = text_buffer; + text->file_name = strdup(file_name); + text->line_number = line_number; + + struct code_fragment *fragment = fragment_new(); + fragment->type = FRAGMENT_TEXT; + fragment->contents.text = text; + append_fragment(code, fragment); +} + +/* Append a live binary buffer that we should translate into the + * format configured earlier by the user for this script. + * Takes ownership of the malloc-allocated buffer and frees it. + */ +static void append_data(struct code_state *code, enum code_data_t data_type, + void *data_buffer, int data_len) +{ + struct code_data *data = data_new(); + data->buffer = data_buffer; + data->type = data_type; + data->len = data_len; + + struct code_fragment *fragment = fragment_new(); + fragment->type = FRAGMENT_DATA; + fragment->contents.data = data; + append_fragment(code, fragment); +} + +struct code_state *code_new(struct config *config) +{ + struct code_state *code = calloc(1, sizeof(struct code_state)); + + /* Set up the pointer to the tail of the empty linked list. */ + code->list_tail = &(code->list_head); + + if (strcmp(config->code_format, "python") == 0) + code->format = FORMAT_PYTHON; + else + die("unsupported --code_format '%s'\n", config->code_format); + + /* See which getsockopt we should use to get data for our code. 
*/ + if (strcmp(config->code_sockopt, "") == 0) { + code->data_type = DATA_NONE; /* auto-detect */ +#if HAVE_TCP_INFO + } else if (strcmp(config->code_sockopt, "TCP_INFO") == 0) { + code->data_type = DATA_TCP_INFO; +#endif +#if HAVE_TCP_CC_INFO + } else if (strcmp(config->code_sockopt, "TCP_CC_INFO") == 0) { + code->data_type = DATA_TCP_CC_INFO; +#endif /* HAVE_TCP_CC_INFO */ +#if HAVE_SO_MEMINFO + } else if (strcmp(config->code_sockopt, "SO_MEMINFO") == 0) { + code->data_type = DATA_SO_MEMINFO; +#endif /* HAVE_SO_MEMINFO */ + } else { + die("unsupported --code_sockopt '%s'\n", config->code_sockopt); + } + + code->command_line = strdup(config->code_command_line); + code->verbose = config->verbose; + + return code; +} + +void code_free(struct code_state *code) +{ + if (code->command_line != NULL) + free(code->command_line); + if (code->path != NULL) + free(code->path); + + /* Free all the code fragments. */ + struct code_fragment *fragment = code->list_head; + while (fragment != NULL) { + struct code_fragment *dead_fragment = fragment; + fragment = fragment->next; + fragment_free(dead_fragment); + } + + memset(code, 0, sizeof(*code)); /* paranoia to help catch bugs */ + free(code); +} + +/* Write all the code fragments to a newly-chosen temporary file and + * store the name of the file in code->path. + */ +static void write_code_file(struct code_state *code) +{ + /* mkstemp will fill this in with the actual unique path name. 
*/ + char path_template[] = "/tmp/code_XXXXXX"; + int code_fd = mkstemp(path_template); + if (code_fd < 0) + die_perror("error making temp output file for code: mkstemp"); + + assert(code->path == NULL); + code->path = strdup(path_template); + + code->file = fdopen(code_fd, "w"); + if (code->file == NULL) + die_perror("error opening temp output file for code: fdopen"); + + write_preamble(code); + write_all_fragments(code); + + if (fclose(code->file) != 0) + die_perror("error closing temp output file for code: fclose"); + + code->file = NULL; +} + +/* Execute the code in the file at code->path by executing the + * configured command line. On success, returns STATUS_OK. On error + * returns STATUS_ERR and fills in *error. + */ +static int execute_code_command_line(struct code_state *code, char **error) +{ + int result = STATUS_ERR; /* return value */ + char *full_command_line = NULL; + asprintf(&full_command_line, "%s %s", code->command_line, code->path); + + /* For verbose debugging we dump the full output file. */ + if (code->verbose) { + char *verbose_command_line = NULL; + asprintf(&verbose_command_line, "cat %s", code->path); + system(verbose_command_line); + free(verbose_command_line); + printf("running: '%s'\n", full_command_line); + } + + int status = system(full_command_line); + if (status == -1) { + asprintf(error, "error running '%s' with system(3): %s", + code->command_line, strerror(errno)); + goto out; + } + if (WIFSIGNALED(status) && + (WTERMSIG(status) == SIGINT || WTERMSIG(status) == SIGQUIT)) { + asprintf(error, "'%s' got signal %d (%s)", + code->command_line, + WTERMSIG(status), strsignal(WTERMSIG(status))); + goto out; + } + if (WEXITSTATUS(status) != 0) { + asprintf(error, "'%s' returned non-zero status %d", + code->command_line, WEXITSTATUS(status)); + goto out; + } + result = STATUS_OK; + +out: + free(full_command_line); + return result; +} + +/* Delete the temporary file at code->path. 
*/ +static void delete_code_file(struct code_state *code) +{ + if ((code->path != NULL) && (unlink(code->path) != 0)) + die_perror("error deleting code file: unlink:"); +} + +/* Write out the code to a file, execute the code, and delete the file. */ +int code_execute(struct code_state *code, char **error) +{ + if (code->list_head == NULL) + return STATUS_OK; /* no code to execute */ + + write_code_file(code); + int result = execute_code_command_line(code, error); + delete_code_file(code); + return result; +} + +/* Run a getsockopt for the given fd to grab data of the given type. + * On success, return a pointer the filled-in buffer (allocated by malloc); + * on failure, return NULL. + */ +static void *get_data(struct state *state, struct event *event, + int fd, enum code_data_t data_type, int *len) +{ + int opt_name = 0; + int data_len = 0; + int level; + + assert(data_type > DATA_NONE); + assert(data_type < DATA_NUM_TYPES); + switch (data_type) { + case DATA_NONE: + case DATA_NUM_TYPES: + assert(!"bad data type"); + break; +#if HAVE_TCP_INFO + case DATA_TCP_INFO: + opt_name = TCP_INFO; + data_len = sizeof(struct _tcp_info); + level = SOL_TCP; + break; +#endif /* HAVE_TCP_INFO */ +#if HAVE_TCP_CC_INFO + case DATA_TCP_CC_INFO: + opt_name = TCP_CC_INFO; + data_len = sizeof(union _tcp_cc_info); + level = SOL_TCP; + break; +#endif /* HAVE_TCP_CC_INFO */ +#if HAVE_SO_MEMINFO + case DATA_SO_MEMINFO: + opt_name = SO_MEMINFO; + data_len = sizeof(u32) * _SK_MEMINFO_VARS; + level = SOL_SOCKET; + break; +#endif /* HAVE_SO_MEMINFO */ + /* omitting default so compiler catches missing cases */ + } + assert(opt_name != 0); + assert(data_len > 0); + socklen_t opt_len = data_len; + void *data = calloc(1, data_len); + + int result = getsockopt(fd, level, opt_name, data, &opt_len); + if (result < 0) { + free(data); + return NULL; + } + *len = opt_len; + return data; +} + +void run_code_event(struct state *state, struct event *event, + const char *text) +{ + DEBUGP("%d: run code 
event\n", event->line_number); + + char *error = NULL; + + /* Wait for the right time before firing off this event. */ + wait_for_event(state); + + if (state->socket_under_test == NULL) { + asprintf(&error, "no socket to use for code"); + goto error_out; + } + int fd = state->socket_under_test->fd.live_fd; + struct code_state *code = state->code; + + void *data = NULL; + void *data_ext = NULL; + void *data_meminfo = NULL; + int data_len = 0; +#if HAVE_TCP_INFO + code->data_type = DATA_TCP_INFO; + data = get_data(state, event, fd, code->data_type, &data_len); + if (data) + append_data(code, code->data_type, data, data_len); +#endif /* HAVE_TCP_INFO */ + if (data == NULL && data_ext == NULL) { + asprintf(&error, + "can't find getsockopt to get TCP info"); + goto error_out; + } +#if HAVE_TCP_CC_INFO + code->data_type = DATA_TCP_CC_INFO; + data = get_data(state, event, fd, code->data_type, &data_len); + if (data) { + append_data(code, code->data_type, data, data_len); + } else { + asprintf(&error, + "can't find getsockopt to get TCP_CC_INFO"); + goto error_out; + } +#endif /* HAVE_TCP_CC_INFO */ +#if HAVE_SO_MEMINFO + code->data_type = DATA_SO_MEMINFO; + data_meminfo = get_data(state, event, fd, code->data_type, &data_len); + if (data_meminfo) + append_data(code, code->data_type, data_meminfo, data_len); + if (data_meminfo == NULL) { + asprintf(&error, + "can't find getsockopt to get sk_meminfo"); + goto error_out; + } +#endif + + append_text(code, state->config->script_path, event->line_number, + strdup(text)); + + return; + +error_out: + die("%s:%d: runtime error in code: %s\n", + state->config->script_path, event->line_number, error); + free(error); +} diff --git a/test/packetdrill/code.h b/test/packetdrill/code.h new file mode 100644 index 0000000..ef626bb --- /dev/null +++ b/test/packetdrill/code.h @@ -0,0 +1,122 @@ +/* + * Copyright 2013 Google Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Interface for a module to write out post-processing code that + * can run custom programmatic analyses and constraint verification. + */ + +#ifndef __CODE_H__ +#define __CODE_H__ + +#include "types.h" + +#include "config.h" +#include "script.h" + +/* Post-processing format syntax variants we support. */ +enum code_format_t { + FORMAT_NONE, /* uninitialized or no code so far */ + FORMAT_PYTHON, /* Python syntax: var_name = 123 */ + FORMAT_NUM_TYPES, /* number of types of format */ +}; + +/* The type of a particular fragment of code. */ +enum code_fragment_t { + FRAGMENT_NONE, /* uninitialized or none so far */ + FRAGMENT_TEXT, /* literal code text to emit */ + FRAGMENT_DATA, /* binary buffer to dump as text */ + FRAGMENT_NUM_TYPES, /* number of types of fragments */ +}; + +/* The type of a particular binary data buffer. 
 */ +enum code_data_t { + DATA_NONE, /* uninitialized or none so far */ +#if HAVE_TCP_INFO + DATA_TCP_INFO, /* binary tcp_info */ +#endif /* HAVE_TCP_INFO */ +#if HAVE_TCP_CC_INFO + DATA_TCP_CC_INFO, /* binary tcp_cc_info */ +#endif /* HAVE_TCP_CC_INFO */ +#if HAVE_SO_MEMINFO + DATA_SO_MEMINFO, /* binary so_meminfo */ +#endif /* HAVE_SO_MEMINFO */ + DATA_NUM_TYPES, /* number of types of data */ +}; + +/* Info about a textual code snippet to encode in the post-processing code. */ +struct code_text { + char *text; /* the code snippet string */ + char *file_name; /* name of script the text was read from */ + int line_number; /* line on which text started */ +}; + +/* Info about a data buffer to encode in the post-processing code. */ +struct code_data { + void *buffer; /* malloc-allocated buffer */ + enum code_data_t type; /* type of data in the buffer */ + int len; /* length of data in buffer */ +}; + +/* Info about a fragment to insert in the post-processing code. */ +struct code_fragment { + enum code_fragment_t type; /* what's in this fragment? */ + union { + struct code_text *text; /* ASCII text code snippet */ + struct code_data *data; /* typed binary data buffer */ + } contents; + struct code_fragment *next; /* next in linked list */ +}; + +/* Internal state for the code execution module. */ +struct code_state { + bool verbose; /* print debug info? */ + enum code_format_t format; /* language syntax to emit */ + enum code_data_t data_type; /* data to get for snippets */ + char *command_line; /* system(3) command to run */ + char *path; /* path where we write code */ + FILE *file; /* output file we're writing */ + struct code_fragment *list_head; /* linked list head */ + struct code_fragment **list_tail; /* link slot for next append */ +}; + +/* Allocate and return a new code executor using the given config. */ +extern struct code_state *code_new(struct config *config); + +/* Tear down a code executor and free up the resources it has allocated.
*/ +extern void code_free(struct code_state *code); + +/* Run the TCP_INFO getsockopt on the current socket under test to + * get a snapshot of socket state, and stash the resulting data and + * code snippet so that at the end of the test we can emit the data + * and the code snippet, and then execute both. + */ +struct state; +extern void run_code_event(struct state *state, + struct event *event, const char *text); + +/* Call this at the end of test execution to run the code by writing + * out the text of the code and invoking the command line supplied by + * the user. On success, returns STATUS_OK. On error returns + * STATUS_ERR and fills in *error. + */ +extern int code_execute(struct code_state *code, char **error); + +#endif /* __CODE_H__ */ diff --git a/test/packetdrill/config.c b/test/packetdrill/config.c new file mode 100644 index 0000000..37e2eb0 --- /dev/null +++ b/test/packetdrill/config.c @@ -0,0 +1,605 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Helper functions for configuration information for a test run. 
+ */ + +#include +#include +#include +#include + +#include "config.h" +#include "logging.h" +#include "ip_prefix.h" + +/* For the sake of clarity, we require long option names, e.g. --foo, + * for all options except -v. + */ +enum option_codes { + OPT_IP_VERSION = 256, + OPT_BIND_PORT, + OPT_CODE_COMMAND, + OPT_CODE_FORMAT, + OPT_CODE_SOCKOPT, + OPT_CONNECT_PORT, + OPT_REMOTE_IP, + OPT_LOCAL_IP, + OPT_GATEWAY_IP, + OPT_NETMASK_IP, + OPT_SPEED, + OPT_MSS, + OPT_MTU, + OPT_INIT_SCRIPTS, + OPT_TOLERANCE_USECS, + OPT_WIRE_CLIENT, + OPT_WIRE_SERVER, + OPT_WIRE_SERVER_IP, + OPT_WIRE_SERVER_PORT, + OPT_WIRE_CLIENT_DEV, + OPT_WIRE_SERVER_DEV, + OPT_SO_FILENAME, + OPT_SO_FLAGS, + OPT_TCP_TS_ECR_SCALED, + OPT_TCP_TS_TICK_USECS, + OPT_STRICT_SEGMENTS, + OPT_NON_FATAL, + OPT_DRY_RUN, + OPT_IS_ANYIP, + OPT_SEND_OMIT_FREE, + OPT_DEFINE = 'D', /* a '-D' single-letter option */ + OPT_VERBOSE = 'v', /* a '-v' single-letter option */ +}; + +/* Specification of command line options for getopt_long(). */ +struct option options[] = { + { "ip_version", .has_arg = true, NULL, OPT_IP_VERSION }, + { "bind_port", .has_arg = true, NULL, OPT_BIND_PORT }, + { "code_command", .has_arg = true, NULL, OPT_CODE_COMMAND }, + { "code_format", .has_arg = true, NULL, OPT_CODE_FORMAT }, + { "code_sockopt", .has_arg = true, NULL, OPT_CODE_SOCKOPT }, + { "connect_port", .has_arg = true, NULL, OPT_CONNECT_PORT }, + { "remote_ip", .has_arg = true, NULL, OPT_REMOTE_IP }, + { "local_ip", .has_arg = true, NULL, OPT_LOCAL_IP }, + { "gateway_ip", .has_arg = true, NULL, OPT_GATEWAY_IP }, + { "netmask_ip", .has_arg = true, NULL, OPT_NETMASK_IP }, + { "speed", .has_arg = true, NULL, OPT_SPEED }, + { "mss", .has_arg = true, NULL, OPT_MSS }, + { "mtu", .has_arg = true, NULL, OPT_MTU }, + { "init_scripts", .has_arg = true, NULL, OPT_INIT_SCRIPTS }, + { "tolerance_usecs", .has_arg = true, NULL, OPT_TOLERANCE_USECS }, + { "wire_client", .has_arg = false, NULL, OPT_WIRE_CLIENT }, + { "wire_server", .has_arg = false, 
NULL, OPT_WIRE_SERVER }, + { "wire_server_ip", .has_arg = true, NULL, OPT_WIRE_SERVER_IP }, + { "wire_server_port", .has_arg = true, NULL, OPT_WIRE_SERVER_PORT }, + { "wire_client_dev", .has_arg = true, NULL, OPT_WIRE_CLIENT_DEV }, + { "wire_server_dev", .has_arg = true, NULL, OPT_WIRE_SERVER_DEV }, + { "so_filename", .has_arg = true, NULL, OPT_SO_FILENAME }, + { "so_flags", .has_arg = true, NULL, OPT_SO_FLAGS }, + { "tcp_ts_ecr_scaled", .has_arg = false, NULL, OPT_TCP_TS_ECR_SCALED }, + { "tcp_ts_tick_usecs", .has_arg = true, NULL, OPT_TCP_TS_TICK_USECS }, + { "strict_segments", .has_arg = false, NULL, OPT_STRICT_SEGMENTS }, + { "non_fatal", .has_arg = true, NULL, OPT_NON_FATAL }, + { "dry_run", .has_arg = false, NULL, OPT_DRY_RUN }, + { "is_anyip", .has_arg = false, NULL, OPT_IS_ANYIP }, + { "send_omit_free", .has_arg = false, NULL, OPT_SEND_OMIT_FREE }, + { "define", .has_arg = true, NULL, OPT_DEFINE }, + { "verbose", .has_arg = false, NULL, OPT_VERBOSE }, + { NULL }, +}; + +void show_usage(void) +{ + fprintf(stderr, "Usage: packetdrill\n" + "\t[--ip_version=[ipv4,ipv4-mapped-ipv6,ipv6]]\n" + "\t[--bind_port=bind_port]\n" + "\t[--code_command=code_command]\n" + "\t[--code_format=code_format]\n" + "\t[--code_sockopt=TCP_INFO]\n" + "\t[--connect_port=connect_port]\n" + "\t[--remote_ip=remote_ip]\n" + "\t[--local_ip=local_ip]\n" + "\t[--gateway_ip=gateway_ip]\n" + "\t[--netmask_ip=netmask_ip]\n" + "\t[--init_scripts=\n" + "\t[--speed=\n" + "\t[--mss=\n" + "\t[--mtu=\n" + "\t[--tolerance_usecs=tolerance_usecs]\n" + "\t[--tcp_ts_ecr_scaled]\n" + "\t[--tcp_ts_tick_usecs=]\n" + "\t[--strict_segments]\n" + "\t[--non_fatal=]\n" + "\t[--wire_client]\n" + "\t[--wire_server]\n" + "\t[--wire_server_ip=]\n" + "\t[--wire_server_port=]\n" + "\t[--wire_client_dev=]\n" + "\t[--wire_server_dev=]\n" + "\t[--so_filename=]\n" + "\t[--so_flags=]\n" + "\t[--dry_run]\n" + "\t[--is_anyip]\n" + "\t[--send_omit_free]\n" + "\t[--define symbol1=val1 --define symbol2=val2 ...]\n" + 
"\t[--verbose|-v]\n" + "\tscript_path ...\n"); +} + +/* Address Configuration for IPv4 + * + * For IPv4, we use the 192.168.0.0/16 RFC 1918 private IP space for + * our tun interface. To avoid accidents and confusion we want remote + * addresses to be permanently unallocated addresses outside of the + * private/unroutable RFC 1918 ranges (kernel code can behave + * differently for private addresses). So for remote addresses we use + * the 192.0.2.0/24 TEST-NET-1 range (see RFC 5737). + * + * Summary for IPv4: + * - local address: 192.168.0.0/16 private IP space (RFC 1918) + * - remote address: 192.0.2.0/24 TEST-NET-1 range (RFC 5737) + */ + +#define DEFAULT_V4_LIVE_REMOTE_IP_STRING "192.0.2.1/24" +#define DEFAULT_V4_LIVE_LOCAL_IP_STRING "192.168.0.0" +/* Note : generate_random_ipv4_addr() assumes the gateway is .1 + */ +#define DEFAULT_V4_LIVE_GATEWAY_IP_STRING "192.168.0.1" +#define DEFAULT_V4_LIVE_NETMASK_IP_STRING "255.255.0.0" + +/* Address Configuration for IPv6 + * + * For IPv6 we use a ULA (unique local address) for our local (tun) + * interface, and the RFC 3849 documentation space for our remote + * address. + * + * Summary for IPv6: + * - local address: fd3d:fa7b:d17d::/48 in unique local address space (RFC 4193) + * - remote address: 2001:DB8::/32 documentation prefix (RFC 3849) + */ + +#define DEFAULT_V6_LIVE_REMOTE_IP_STRING "2001:DB8::1/32" +#define DEFAULT_V6_LIVE_LOCAL_IP_STRING "fd3d:fa7b:d17d::0" +#define DEFAULT_V6_LIVE_GATEWAY_IP_STRING "fd3d:fa7b:d17d:8888::0" +#define DEFAULT_V6_LIVE_PREFIX_LEN 48 + +/* Fill in any as-yet-unspecified IP address attributes using IPv4 defaults. 
*/ +static void set_ipv4_defaults(struct config *config) +{ + if (strlen(config->live_remote_ip_string) == 0) + strcpy(config->live_remote_ip_string, + DEFAULT_V4_LIVE_REMOTE_IP_STRING); + if (strlen(config->live_netmask_ip_string) == 0) + strcpy(config->live_netmask_ip_string, + DEFAULT_V4_LIVE_NETMASK_IP_STRING); + if (strlen(config->live_local_ip_string) == 0) + generate_random_ipv4_addr(config->live_local_ip_string, + DEFAULT_V4_LIVE_LOCAL_IP_STRING, + config->live_netmask_ip_string); + if (strlen(config->live_gateway_ip_string) == 0) + strcpy(config->live_gateway_ip_string, + DEFAULT_V4_LIVE_GATEWAY_IP_STRING); +} + +/* Fill in any as-yet-unspecified IP address attributes using IPv6 defaults. */ +static void set_ipv6_defaults(struct config *config) +{ + if (strlen(config->live_remote_ip_string) == 0) + strcpy(config->live_remote_ip_string, + DEFAULT_V6_LIVE_REMOTE_IP_STRING); + if (strlen(config->live_local_ip_string) == 0) + generate_random_ipv6_addr(config->live_local_ip_string, + DEFAULT_V6_LIVE_LOCAL_IP_STRING, + DEFAULT_V6_LIVE_PREFIX_LEN); + if (strlen(config->live_gateway_ip_string) == 0) + strcpy(config->live_gateway_ip_string, + DEFAULT_V6_LIVE_GATEWAY_IP_STRING); +} + +/* Set default configuration before we begin parsing. */ +void set_default_config(struct config *config) +{ + memset(config, 0, sizeof(*config)); + config->code_command_line = "/usr/bin/python"; + config->code_format = "python"; + config->code_sockopt = ""; /* auto-detect */ + config->ip_version = IP_VERSION_4; + config->live_bind_port = 8080; + config->live_connect_port = 8080; + config->tolerance_usecs = 4000; + config->speed = TUN_DRIVER_SPEED_CUR; + config->mtu = TUN_DRIVER_DEFAULT_MTU; + + config->tcp_ts_ecr_scaled = false; + + /* For now, by default we disable checks of outbound TS val + * values, since there are timestamp val bugs in the tests and + * kernel. 
TODO(ncardwell): Switch default tcp_ts_tick_usecs + * to 1000 when TCP timestamp val bugs have been eradicated + * from kernel and tests. + */ + config->tcp_ts_tick_usecs = 0; /* disable checks of TS val */ + + config->live_remote_ip_string[0] = '\0'; + config->live_local_ip_string[0] = '\0'; + config->live_gateway_ip_string[0] = '\0'; + config->live_netmask_ip_string[0] = '\0'; + + config->init_scripts = NULL; + + config->wire_server_port = 8081; + config->wire_client_device = "eth0"; + config->wire_server_device = "eth0"; +} + +static void set_remote_ip_and_prefix(struct config *config) +{ + config->live_remote_ip = config->live_remote_prefix.ip; + ip_to_string(&config->live_remote_ip, + config->live_remote_ip_string); + + ip_prefix_normalize(&config->live_remote_prefix); + ip_prefix_to_string(&config->live_remote_prefix, + config->live_remote_prefix_string); +} + +/* Here's a table summarizing the types of various entities in the + * different flavors of IP that we support: + * + * flavor socket_domain bind/connect/accept IP local/remote IP + * -------- ------------- ------------------------- --------------- + * 4 AF_INET AF_INET AF_INET + * 4-mapped-6 AF_INET6 AF_INET6 mapped from IPv4 AF_INET + * 6 AF_INET6 AF_INET6 AF_INET6 + */ + +/* Calculate final configuration values needed for IPv4 */ +static void finalize_ipv4_config(struct config *config) +{ + set_ipv4_defaults(config); + + config->live_local_ip = ipv4_parse(config->live_local_ip_string); + + config->live_remote_prefix = + ipv4_prefix_parse(config->live_remote_ip_string); + set_remote_ip_and_prefix(config); + + config->live_prefix_len = + netmask_to_prefix(config->live_netmask_ip_string); + config->live_gateway_ip = ipv4_parse(config->live_gateway_ip_string); + config->live_bind_ip = config->live_local_ip; + config->live_connect_ip = config->live_remote_ip; + config->socket_domain = AF_INET; + config->wire_protocol = AF_INET; +} + +/* Calculate final configuration values needed for ipv4-mapped-ipv6 */ 
+static void finalize_ipv4_mapped_ipv6_config(struct config *config) +{ + set_ipv4_defaults(config); + + config->live_local_ip = ipv4_parse(config->live_local_ip_string); + + config->live_remote_prefix = + ipv4_prefix_parse(config->live_remote_ip_string); + set_remote_ip_and_prefix(config); + + config->live_prefix_len = + netmask_to_prefix(config->live_netmask_ip_string); + config->live_gateway_ip = ipv4_parse(config->live_gateway_ip_string); + config->live_bind_ip = ipv6_map_from_ipv4(config->live_local_ip); + config->live_connect_ip = ipv6_map_from_ipv4(config->live_remote_ip); + config->socket_domain = AF_INET6; + config->wire_protocol = AF_INET; +} + +/* Calculate final configuration values needed for IPv6 */ +static void finalize_ipv6_config(struct config *config) +{ + set_ipv6_defaults(config); + + config->live_local_ip = ipv6_parse(config->live_local_ip_string); + + config->live_remote_prefix = + ipv6_prefix_parse(config->live_remote_ip_string); + set_remote_ip_and_prefix(config); + + config->live_prefix_len = DEFAULT_V6_LIVE_PREFIX_LEN; + config->live_gateway_ip = ipv6_parse(config->live_gateway_ip_string); + config->live_bind_ip = config->live_local_ip; + config->live_connect_ip = config->live_remote_ip; + config->socket_domain = AF_INET6; + config->wire_protocol = AF_INET6; +} + +void finalize_config(struct config *config) +{ + assert(config->ip_version >= IP_VERSION_4); + assert(config->ip_version <= IP_VERSION_6); + switch (config->ip_version) { + case IP_VERSION_4: + finalize_ipv4_config(config); + break; + case IP_VERSION_4_MAPPED_6: + finalize_ipv4_mapped_ipv6_config(config); + break; + case IP_VERSION_6: + finalize_ipv6_config(config); + break; + /* omitting default so compiler will catch missing cases */ + } +} + +/* Expect that arg is comma-delimited, allowing for spaces. 
*/ +void parse_non_fatal_arg(char *arg, struct config *config) +{ + char *argdup, *saveptr, *token; + + if (arg == NULL || strlen(arg) == 0) + return; + + argdup = strdup(arg); + token = strtok_r(argdup, ", ", &saveptr); + while (token != NULL) { + if (strcmp(token, "packet") == 0) + config->non_fatal_packet = true; + else if (strcmp(token, "syscall") == 0) + config->non_fatal_syscall = true; + token = strtok_r(NULL, ", ", &saveptr); + } + + free(argdup); +} + + +/* Process a command line option */ +static void process_option(int opt, char *optarg, struct config *config, + char *where) +{ + int port = 0; + char *end = NULL, *equals = NULL, *symbol = NULL, *value = NULL; + unsigned long speed = 0; + + DEBUGP("process_option %d ('%c') = %s\n", + opt, (char)opt, optarg); + + switch (opt) { + case OPT_IP_VERSION: + if (strcmp(optarg, "ipv4") == 0) + config->ip_version = IP_VERSION_4; + else if (strcmp(optarg, "ipv4-mapped-ipv6") == 0) + config->ip_version = IP_VERSION_4_MAPPED_6; + else if (strcmp(optarg, "ipv6") == 0) + config->ip_version = IP_VERSION_6; + else + die("%s: bad --ip_version: %s\n", where, optarg); + break; + case OPT_BIND_PORT: + port = atoi(optarg); + if ((port <= 0) || (port > 0xffff)) + die("%s: bad --bind_port: %s\n", where, optarg); + config->live_bind_port = port; + break; + case OPT_CODE_COMMAND: + config->code_command_line = optarg; + break; + case OPT_CODE_FORMAT: + config->code_format = optarg; + break; + case OPT_CODE_SOCKOPT: + config->code_sockopt = optarg; + break; + case OPT_CONNECT_PORT: + port = atoi(optarg); + if ((port <= 0) || (port > 0xffff)) + die("%s: bad --connect_port: %s\n", where, optarg); + config->live_connect_port = port; + break; + case OPT_REMOTE_IP: + strncpy(config->live_remote_ip_string, optarg, ADDR_STR_LEN-1); + break; + case OPT_LOCAL_IP: + strncpy(config->live_local_ip_string, optarg, ADDR_STR_LEN-1); + break; + case OPT_GATEWAY_IP: + strncpy(config->live_gateway_ip_string, optarg, ADDR_STR_LEN-1); + break; + case 
OPT_MSS: + config->mss = atoi(optarg); + if (config->mss <= 0) + die("%s: bad --mss: %s\n", where, optarg); + break; + case OPT_MTU: + config->mtu = atoi(optarg); + if (config->mtu < 0) + die("%s: bad --mtu: %s\n", where, optarg); + break; + case OPT_NETMASK_IP: + strncpy(config->live_netmask_ip_string, optarg, ADDR_STR_LEN-1); + break; + case OPT_INIT_SCRIPTS: + config->init_scripts = optarg; + break; + case OPT_NON_FATAL: + parse_non_fatal_arg(optarg, config); + break; + case OPT_SPEED: + speed = strtoul(optarg, &end, 10); + if (end == optarg || *end || !is_valid_u32(speed)) + die("%s: bad --speed: %s\n", where, optarg); + config->speed = speed; + break; + case OPT_TOLERANCE_USECS: + config->tolerance_usecs = atoi(optarg); + if (config->tolerance_usecs <= 0) + die("%s: bad --tolerance_usecs: %s\n", where, optarg); + break; + case OPT_TCP_TS_ECR_SCALED: + config->tcp_ts_ecr_scaled = true; + break; + case OPT_TCP_TS_TICK_USECS: + config->tcp_ts_tick_usecs = atoi(optarg); + if (config->tcp_ts_tick_usecs < 0 || + config->tcp_ts_tick_usecs > 1000000) + die("%s: bad --tcp_ts_tick_usecs: %s\n", where, optarg); + break; + case OPT_STRICT_SEGMENTS: + config->strict_segments = true; + break; + case OPT_WIRE_CLIENT: + config->is_wire_client = true; + break; + case OPT_WIRE_SERVER: + config->is_wire_server = true; + break; + case OPT_WIRE_SERVER_IP: + config->wire_server_ip_string = strdup(optarg); + config->wire_server_ip = + ipv4_parse(config->wire_server_ip_string); + break; + case OPT_WIRE_SERVER_PORT: + port = atoi(optarg); + if ((port <= 0) || (port > 0xffff)) + die("%s: bad --wire_server_port: %s\n", where, optarg); + config->wire_server_port = port; + break; + case OPT_WIRE_CLIENT_DEV: + config->wire_client_device = strdup(optarg); + break; + case OPT_WIRE_SERVER_DEV: + config->wire_server_device = strdup(optarg); + break; + case OPT_SO_FILENAME: + config->so_filename = strdup(optarg); + break; + case OPT_SO_FLAGS: + config->so_flags = strdup(optarg); + break; + case 
OPT_DRY_RUN: + config->dry_run = true; + break; + case OPT_IS_ANYIP: + config->is_anyip = true; + break; + case OPT_SEND_OMIT_FREE: + config->send_omit_free = true; + break; + case OPT_DEFINE: + equals = strstr(optarg, "="); + if (equals == optarg || equals == NULL) + die("%s: bad definition: %s\n", where, optarg); + symbol = strndup(optarg, equals - optarg); + value = strdup(equals + 1); + definition_set(&config->defines, symbol, value); + break; + case OPT_VERBOSE: + config->verbose = true; + break; + default: + show_usage(); + exit(EXIT_FAILURE); + } +} + + +/* Parse command line options. Returns a pointer to the first argument + * beyond the options. + */ +char **parse_command_line_options(int argc, char *argv[], + struct config *config) +{ + int c = 0; + int i = 0; + + DEBUGP("parse_command_line_options argc=%d\n", argc); + for (i = 0; i < argc; ++i) + DEBUGP("argv[%d] = '%s'\n", i, argv[i]); + + /* Make a copy of our arguments for later, in case we need to + * pass our options to a server. We use argc+1 here because, + * following main() calling conventions, we make the array + * element at argv[argc] a NULL pointer. + */ + config->argv = calloc(argc + 1, sizeof(char *)); + for (i = 0; argv[i]; ++i) + config->argv[i] = strdup(argv[i]); + + /* Parse the arguments. 
*/ + optind = 0; + while ((c = getopt_long(argc, argv, "vD:", options, NULL)) > 0) + process_option(c, optarg, config, "Command Line"); + return argv + optind; +} + +static void parse_script_options(struct config *config, + struct option_list *option_list) +{ + struct option_list *opt = option_list; + while (opt != NULL) { + int i; + int c = 0; + for (i = 0; options[i].name != NULL; i++) { + if (strcmp(options[i].name, opt->name) == 0) { + c = options[i].val; + break; + } + } + + if (!c) + die("%s: option '%s' unknown\n", + config->script_path, opt->name); + if (opt->value && !options[i].has_arg) + die("%s: option '%s' forbids an argument\n", + config->script_path, opt->name); + if (!opt->value && options[i].has_arg) + die("%s: option '%s' requires an argument\n", + config->script_path, opt->name); + + process_option(options[i].val, + opt->value, config, + config->script_path); + + opt = opt->next; + } +} + +/* The parser calls this callback after it finishes parsing all + * --foo=bar options inside the script. At this point we know all + * command line and in-script options, and can finalize our + * configuration. Notably, this allows us to know when we parse a TCP + * packet line in the script whether we should create an IPv4 or IPv6 + * packet. + */ +void parse_and_finalize_config(struct invocation *invocation) +{ + DEBUGP("parse_and_finalize_config\n"); + + /* Parse options in script */ + parse_script_options(invocation->config, + invocation->script->option_list); + + /* Command line options overwrite options in script */ + parse_command_line_options(invocation->argc, invocation->argv, + invocation->config); + + /* Now take care of the last details */ + finalize_config(invocation->config); +} diff --git a/test/packetdrill/config.h b/test/packetdrill/config.h new file mode 100644 index 0000000..649a8c4 --- /dev/null +++ b/test/packetdrill/config.h @@ -0,0 +1,204 @@ +/* + * Copyright 2013 Google Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Configuration information for a test run, and helper functions. + */ + +#ifndef __CONFIG_H__ +#define __CONFIG_H__ + +#include "types.h" + +#include +#include +#include +#include +#include "ip_address.h" +#include "ip_prefix.h" +#include "script.h" + +#define TUN_DRIVER_SPEED_CUR 0 /* don't change current speed */ +#define TUN_DRIVER_DEFAULT_MTU 1500 /* default MTU for tun device */ + +extern struct option options[]; + +/* A linked list of symbol->value (FOO=bar) definitions from command line. */ +struct definition { + char *symbol; /* name of the symbol; owns the string */ + char *value; /* value of the symbol; owns the string */ + struct definition *next; /* link for linked list */ +}; + +/* Return the definition in the linked list with a matching symbol, or NULL */ +static inline struct definition *definition_find(struct definition *defs, + char *symbol) +{ + struct definition *def = NULL; + + for (def = defs; def != NULL; def = def->next) { + if (strcmp(def->symbol, symbol) == 0) + return def; + } + return NULL; +} + +/* Set the value of the given symbol to the given value. 
*/ +static inline void definition_set(struct definition **defs, + char *symbol, char *value) +{ + struct definition *def = definition_find(*defs, symbol); + + if (def) { + free(def->value); + def->value = value; + } else { + def = calloc(1, sizeof(struct definition)); + def->symbol = symbol; + def->value = value; + def->next = *defs; /* link to existing entries */ + *defs = def; /* insert at head of linked list */ + } +} + +/* Return the value of the given symbol, or NULL if not found. */ +static inline char *definition_get(struct definition *defs, char *symbol) +{ + struct definition *def = definition_find(defs, symbol); + + return def ? def->value : NULL; +} + +struct config { + const char **argv; /* a copy of process argv */ + + enum ip_version_t ip_version; /* v4, v4-mapped-v6, v6 */ + int socket_domain; /* AF_INET or AF_INET6 */ + int wire_protocol; /* AF_INET or AF_INET6 */ + + u16 live_bind_port; /* local port for bind() */ + u16 live_connect_port; /* remote port for connect() */ + + struct ip_address live_bind_ip; /* address for bind() */ + struct ip_address live_connect_ip; /* address for connect() */ + + struct ip_address live_local_ip; /* local interface IP */ + struct ip_address live_remote_ip; /* remote interface IP */ + struct ip_prefix live_remote_prefix; /* remote prefix under test */ + struct ip_address live_gateway_ip; /* gateway interface IP */ + + char live_local_ip_string[ADDR_STR_LEN]; /* human-readable IP */ + char live_remote_ip_string[ADDR_STR_LEN]; /* human-readable IP */ + char live_remote_prefix_string[ADDR_STR_LEN]; /* <addr>/<prefixlen> */ + + char live_gateway_ip_string[ADDR_STR_LEN]; /* local gateway IP */ + char live_netmask_ip_string[ADDR_STR_LEN]; /* local netmask */ + + int live_prefix_len; /* IPv4/IPv6 interface prefix len */ + + int tolerance_usecs; /* tolerance for time divergence */ + bool tcp_ts_ecr_scaled; /* scale arbitrary inbound TS ECR? 
*/ + int tcp_ts_tick_usecs; /* microseconds per TS val tick */ + + u32 speed; /* speed reported by tun driver; + * may require special tun driver + */ + int mss; /* gso_size for GRO packets to tun device */ + int mtu; /* MTU of tun device */ + + bool strict_segments; /* check exact segmentation? */ + + bool non_fatal_packet; /* treat packet asserts as non-fatal */ + bool non_fatal_syscall; /* treat syscall asserts as non-fatal */ + bool send_omit_free; /* do not call free() */ + + bool dry_run; /* parse script but don't execute? */ + + bool verbose; /* print detailed debug info? */ + char *script_path; /* pathname of script file */ + + /* Shell command to invoke via system(3) to run post-processing code */ + char *code_command_line; + + /* Language to emit when generating post-processing code */ + char *code_format; + + /* setsockopt option number (TCP_INFO, ...) for code */ + char *code_sockopt; + + /* File scripts to run at beginning of test (using system) */ + char *init_scripts; + + /* For remote on-the-wire testing using a real NIC. */ + bool is_wire_client; /* use a real NIC and be client? */ + bool is_wire_server; /* use a real NIC and be server? */ + char *wire_client_device; /* iface name for send/receive */ + char *wire_server_device; /* iface name for send/receive */ + struct ip_address wire_server_ip; /* IP of on-the-wire server */ + char *wire_server_ip_string; /* malloc-ed server IP string */ + u16 wire_server_port; /* the port the server listens on */ + + /* For testing against a shared object (*.so) file. */ + char *so_filename; + char *so_flags; + + /* For anyip testing */ + bool is_anyip; + + /* List of FOO=bar definitions from command line. 
*/ + struct definition *defines; +}; + +/* Top-level info about the invocation of a test script */ +struct invocation { + int argc; /* count of process command line args */ + char **argv; /* process command line args */ + struct config *config; /* run-time configuration */ + struct script *script; /* parse tree of the script to run */ +}; + +/* Set default configuration */ +extern void set_default_config(struct config *config); + +/* Parse the "non-fatal" command line options given the (comma-delimited) string + * from the command line. Modifies the associated booleans in the given + * config. + */ +extern void parse_non_fatal_arg(char *arg, struct config *config); + +/* Perform configuration processing that can only be done after we've + * seen the full config. For example, we only know how to use IP + * addresses after we know if we're doing ipv4, ipv4-mapped-ipv6, or + * ipv6. Call this after all options have been parsed. + */ +extern void finalize_config(struct config *config); + +extern void show_usage(void); + +/* Parse command line options. Returns a pointer to the first argument + * beyond the options. + */ +extern char **parse_command_line_options(int argc, char *argv[], + struct config *config); + +/* The parser calls this function to finalize processing of config info. 
*/ +extern void parse_and_finalize_config(struct invocation *invocation); + +#endif /* __CONFIG_H__ */ diff --git a/test/packetdrill/configure b/test/packetdrill/configure new file mode 100755 index 0000000..e32ffc6 --- /dev/null +++ b/test/packetdrill/configure @@ -0,0 +1,3 @@ +#!/bin/sh + +ln -sf Makefile.`uname` Makefile diff --git a/test/packetdrill/contrib/packetdrill.el b/test/packetdrill/contrib/packetdrill.el new file mode 100644 index 0000000..3b71058 --- /dev/null +++ b/test/packetdrill/contrib/packetdrill.el @@ -0,0 +1,45 @@ +(setq packetdrill-keywords '("sa_family" "sin_port" "sin_addr" "msg_name" "msg_iov" "msg_flags" "fd" "events" "revents" "htons" "icmp" "udp" "inet_addr" "inet6_addr" "ack" "eol" "ecr" "mss" "mtu" "nop" "sack" "sackOK" "TS" "FO" "FOEXP" "val" "win" "wscale" "ect01" "ect0" "ect1" "noecn" "ce")) + +(setq packetdrill-constants '("AF_INET" "AF_INET6" "PF_INET" "PF_INET6" "SOCK_STREAM" "SOCK_DGRAM" "IPPROTO_IP" "IPPROTO_IPV6" "IPPROTO_ICMP" "IPPROTO_TCP" "IPPROTO_UDP" "SOL_SOCKET" "SOL_IP" "SOL_IPV6" "SOL_TCP" "SOL_UDP" "SO_ACCEPTCONN" "SO_ATTACH_FILTER" "SO_BINDTODEVICE" "SO_BROADCAST" "SO_BSDCOMPAT" "SO_DEBUG" "SO_DETACH_FILTER" "SO_DONTROUTE" "SO_ERROR" "SO_KEEPALIVE" "SO_LINGER" "SO_NO_CHECK" "SO_OOBINLINE" "SO_PASSCRED" "SO_PEERCRED" "SO_PEERNAME" "SO_PEERSEC" "SO_PRIORITY" "SO_RCVBUF" "SO_RCVLOWAT" "SO_RCVTIMEO" "SO_REUSEADDR" "SO_REUSEPORT" "SO_SECURITY_AUTHENTICATION" "SO_SECURITY_ENCRYPTION_NETWORK" "SO_SECURITY_ENCRYPTION_TRANSPORT" "SO_SNDBUF" "SO_SNDLOWAT" "SO_SNDTIMEO" "SO_TIMESTAMP" "SO_TYPE" "SO_MAX_PACING_RATE" "SO_ZEROCOPY" "IP_TOS" "IP_MTU_DISCOVER" "IP_PMTUDISC_WANT" "IP_PMTUDISC_DONT" "IP_PMTUDISC_DO" "IP_PMTUDISC_PROBE" "IP_MTU" "IPV6_MTU" "TCP_NODELAY" "TCP_MAXSEG" "TCP_CORK" "TCP_KEEPIDLE" "TCP_KEEPINTVL" "TCP_KEEPCNT" "TCP_SYNCNT" "TCP_LINGER2" "TCP_DEFER_ACCEPT" "TCP_INFO" "TCP_QUICKACK" "TCP_CONGESTION" "TCP_MD5SIG" "TCP_COOKIE_TRANSACTIONS" "TCP_THIN_LINEAR_TIMEOUTS" "TCP_THIN_DUPACK" "TCP_USER_TIMEOUT" 
"TCP_CWND" "TCP_SAVE_SYN" "TCP_SAVED_SYN" "TCP_FASTOPEN" "TCP_FASTOPEN_CONNECT" "TCP_MULTIPLE_CONNECTIONS" "O_RDONLY" "O_WRONLY" "O_RDWR" "O_ACCMODE" "O_CREAT" "O_EXCL" "O_NOCTTY" "O_TRUNC" "O_APPEND" "O_NONBLOCK" "F_DUPFD" "F_GETFD" "F_SETFD" "F_GETFL" "F_SETFL" "F_GETLK" "F_SETLK" "F_SETLKW" "F_GETOWN" "F_SETOWN" "F_SETSIG" "F_GETSIG" "F_GETOWN" "F_SETOWN" "F_SETLK" "F_SETLKW" "F_GETLK" "F_SETLK64" "F_SETLKW64" "F_GETLK64" "F_SETLEASE" "F_GETLEASE" "F_NOTIFY" "F_DUPFD_CLOEXEC" "FD_CLOEXEC" "LOCK_SH" "LOCK_EX" "LOCK_NB" "LOCK_UN" "F_RDLCK" "F_WRLCK" "F_UNLCK" "F_EXLCK" "F_SHLCK" "SEEK_SET" "SEEK_CUR" "SEEK_END" "MSG_OOB" "MSG_DONTROUTE" "MSG_PEEK" "MSG_CTRUNC" "MSG_PROXY" "MSG_EOR" "MSG_WAITALL" "MSG_TRUNC" "MSG_CTRUNC" "MSG_ERRQUEUE" "MSG_DONTWAIT" "MSG_CONFIRM" "MSG_FIN" "MSG_SYN" "MSG_RST" "MSG_NOSIGNAL" "MSG_MORE" "MSG_CMSG_CLOEXEC" "MSG_FASTOPEN" "MSG_ZEROCOPY" "SIOCINQ" "FIONREAD" "POLLIN" "POLLPRI" "POLLOUT" "POLLRDNORM" "POLLRDBAND" "POLLWRNORM" "POLLWRBAND" "POLLMSG" "POLLREMOVE" "POLLRDHUP" "POLLERR" "POLLHUP" "POLLNVAL" "EPERM" "ENOENT" "ESRCH" "EINTR" "EIO" "ENXIO" "E2BIG" "ENOEXEC" "EBADF" "ECHILD" "EAGAIN" "ENOMEM" "EACCES" "EFAULT" "ENOTBLK" "EBUSY" "EEXIST" "EXDEV" "ENODEV" "ENOTDIR" "EISDIR" "EINVAL" "ENFILE" "EMFILE" "ENOTTY" "ETXTBSY" "EFBIG" "ENOSPC" "ESPIPE" "EROFS" "EMLINK" "EPIPE" "EDOM" "ERANGE" "EDEADLK" "ENAMETOOLONG" "ENOLCK" "ENOSYS" "ENOTEMPTY" "ELOOP" "EWOULDBLOCK" "ENOMSG" "EIDRM" "ECHRNG" "EL2NSYNC" "EL3HLT" "EL3RST" "ELNRNG" "EUNATCH" "ENOCSI" "EL2HLT" "EBADE" "EBADR" "EXFULL" "ENOANO" "EBADRQC" "EBADSLT" "EDEADLOCK" "EBFONT" "ENOSTR" "ENODATA" "ETIME" "ENOSR" "ENONET" "ENOPKG" "EREMOTE" "ENOLINK" "EADV" "ESRMNT" "ECOMM" "EPROTO" "EMULTIHOP" "EDOTDOT" "EBADMSG" "EOVERFLOW" "ENOTUNIQ" "EBADFD" "EREMCHG" "ELIBACC" "ELIBBAD" "ELIBSCN" "ELIBMAX" "ELIBEXEC" "EILSEQ" "ERESTART" "ESTRPIPE" "EUSERS" "ENOTSOCK" "EDESTADDRREQ" "EMSGSIZE" "EPROTOTYPE" "ENOPROTOOPT" "EPROTONOSUPPORT" "ESOCKTNOSUPPORT" "EOPNOTSUPP" "EPFNOSUPPORT" "EAFNOSUPPORT" 
"EADDRINUSE" "EADDRNOTAVAIL" "ENETDOWN" "ENETUNREACH" "ENETRESET" "ECONNABORTED" "ECONNRESET" "ENOBUFS" "EISCONN" "ENOTCONN" "ESHUTDOWN" "ETOOMANYREFS" "ETIMEDOUT" "ECONNREFUSED" "EHOSTDOWN" "EHOSTUNREACH" "EALREADY" "EINPROGRESS" "ESTALE" "EUCLEAN" "ENOTNAM" "ENAVAIL" "EISNAM" "EREMOTEIO" "EDQUOT" "ENOMEDIUM" "EMEDIUMTYPE" "ECANCELED" "ENOKEY" "EKEYEXPIRED" "EKEYREVOKED" "EKEYREJECTED" "EOWNERDEAD" "ENOTRECOVERABLE" "ERFKILL" "POLLIN" "POLLPRI" "POLLOUT" "POLLRDNORM" "POLLRDBAND" "POLLWRNORM" "POLLWRBAND" "POLLMSG" "POLLREMOVE" "POLLRDHUP" "POLLERR" "POLLHUP" "POLLNVAL")) + +(setq packetdrill-functions '("accept" "bind" "close" "connect" "fcntl" "getsockopt" "ioctl" "listen" "poll" "read" "readv" "recv" "recvfrom" "recvmsg" "send" "sendmsg" "sendto" "setsockopt" "shutdown" "socket" "write" "writev")) + +;; create the regex string for each class of keywords +(setq packetdrill-keywords-regexp (regexp-opt packetdrill-keywords 'words)) +(setq packetdrill-constant-regexp (regexp-opt packetdrill-constants 'words)) +(setq packetdrill-functions-regexp (regexp-opt packetdrill-functions 'words)) + +;; clear memory +(setq packetdrill-keywords nil) +(setq packetdrill-constants nil) +(setq packetdrill-functions nil) + +;; create the list for font-lock. +;; each class of keyword is given a particular face +(setq packetdrill-font-lock-keywords + `( + ("%{\\(.*\\n?\\)*}%" . font-lock-string-face) + ("`\\(.*\\n?\\)*`" . font-lock-warning-face) + ("\\.\\.\\." . font-lock-type-face) + ("\\s-<\\s-" . font-lock-warning-face) + ("\\s->\\s-" . font-lock-keyword-face) + (,packetdrill-constant-regexp . font-lock-constant-face) + (,packetdrill-functions-regexp . font-lock-function-name-face) + (,packetdrill-keywords-regexp . 
font-lock-preprocessor-face) + )) + +;; define the mode +(define-derived-mode packetdrill-mode c-mode + "packetdrill mode" + "Major mode for editing packetdrill scripts" + ;; code for syntax highlighting + (setq font-lock-defaults '((packetdrill-font-lock-keywords))) + + ;; clear memory + (setq packetdrill-keywords-regexp nil) + (setq packetdrill-types-regexp nil) + (setq packetdrill-constants-regexp nil) + (setq packetdrill-functions-regexp nil) + ) + +(provide 'packetdrill-mode) diff --git a/test/packetdrill/contrib/packetdrill.vim b/test/packetdrill/contrib/packetdrill.vim new file mode 100644 index 0000000..a45da1e --- /dev/null +++ b/test/packetdrill/contrib/packetdrill.vim @@ -0,0 +1,125 @@ +" Vim syntax file +" Language: Packetdrill +" Maintainer: Barath Raghavan +" Last Change: 2013 Jul 27 + +" Quit when a (custom) syntax file was already loaded +if exists("b:current_syntax") + finish +endif + +let s:cpo_save = &cpo +set cpo&vim + +syn keyword pKeyword sa_family sin_port sin_addr msg_name msg_iov msg_flags fd events revents htons icmp udp inet_addr ack eol ecr mss mtu nop sack sackOK TS FO FOEXP val win wscale ect01 ect0 ect1 noecn ce +syn keyword pConstant AF_INET AF_INET6 PF_INET PF_INET6 SOCK_STREAM SOCK_DGRAM IPPROTO_IP IPPROTO_IPV6 IPPROTO_ICMP IPPROTO_TCP IPPROTO_UDP SOL_SOCKET SOL_IP SOL_IPV6 SOL_TCP SOL_UDP SO_ACCEPTCONN SO_ATTACH_FILTER SO_BINDTODEVICE SO_BROADCAST SO_BSDCOMPAT SO_DEBUG SO_DETACH_FILTER SO_DONTROUTE SO_ERROR SO_KEEPALIVE SO_LINGER SO_NO_CHECK SO_OOBINLINE SO_PASSCRED SO_PEERCRED SO_PEERNAME SO_PEERSEC SO_PRIORITY SO_RCVBUF SO_RCVLOWAT SO_RCVTIMEO SO_REUSEADDR SO_REUSEPORT SO_SECURITY_AUTHENTICATION SO_SECURITY_ENCRYPTION_NETWORK SO_SECURITY_ENCRYPTION_TRANSPORT SO_SNDBUF SO_SNDLOWAT SO_SNDTIMEO SO_TIMESTAMP SO_TYPE SO_MAX_PACING_RATE SO_ZEROCOPY IP_TOS IP_MTU_DISCOVER IP_PMTUDISC_WANT IP_PMTUDISC_DONT IP_PMTUDISC_DO IP_PMTUDISC_PROBE IP_MTU IPV6_MTU TCP_NODELAY TCP_MAXSEG TCP_CORK TCP_KEEPIDLE TCP_KEEPINTVL TCP_KEEPCNT TCP_SYNCNT 
TCP_LINGER2 TCP_DEFER_ACCEPT TCP_INFO TCP_QUICKACK TCP_CONGESTION TCP_MD5SIG TCP_COOKIE_TRANSACTIONS TCP_THIN_LINEAR_TIMEOUTS TCP_THIN_DUPACK TCP_USER_TIMEOUT TCP_CWND TCP_SAVE_SYN TCP_SAVED_SYN TCP_FASTOPEN TCP_MULTIPLE_CONNECTIONS +syn keyword pConstant O_RDONLY O_WRONLY O_RDWR O_ACCMODE O_CREAT O_EXCL O_NOCTTY O_TRUNC O_APPEND O_NONBLOCK F_DUPFD F_GETFD F_SETFD F_GETFL F_SETFL F_GETLK F_SETLK F_SETLKW F_GETOWN F_SETOWN F_SETSIG F_GETSIG F_GETOWN F_SETOWN F_SETLK F_SETLKW F_GETLK F_SETLK64 F_SETLKW64 F_GETLK64 F_SETLEASE F_GETLEASE F_NOTIFY F_DUPFD_CLOEXEC FD_CLOEXEC LOCK_SH LOCK_EX LOCK_NB LOCK_UN F_RDLCK F_WRLCK F_UNLCK F_EXLCK F_SHLCK SEEK_SET SEEK_CUR SEEK_END MSG_OOB MSG_DONTROUTE MSG_PEEK MSG_CTRUNC MSG_PROXY MSG_EOR MSG_WAITALL MSG_TRUNC MSG_CTRUNC MSG_ERRQUEUE MSG_DONTWAIT MSG_CONFIRM MSG_FIN MSG_SYN MSG_RST MSG_NOSIGNAL MSG_MORE MSG_CMSG_CLOEXEC MSG_FASTOPEN MSG_ZEROCOPY SIOCINQ FIONREAD POLLIN POLLPRI POLLOUT POLLRDNORM POLLRDBAND POLLWRNORM POLLWRBAND POLLMSG POLLREMOVE POLLRDHUP POLLERR POLLHUP POLLNVAL EPERM ENOENT ESRCH EINTR EIO ENXIO E2BIG ENOEXEC EBADF ECHILD EAGAIN ENOMEM EACCES EFAULT ENOTBLK EBUSY EEXIST EXDEV ENODEV ENOTDIR EISDIR EINVAL ENFILE EMFILE ENOTTY ETXTBSY EFBIG ENOSPC ESPIPE EROFS EMLINK EPIPE EDOM ERANGE EDEADLK ENAMETOOLONG ENOLCK ENOSYS ENOTEMPTY ELOOP EWOULDBLOCK ENOMSG EIDRM ECHRNG EL2NSYNC EL3HLT EL3RST ELNRNG EUNATCH ENOCSI EL2HLT EBADE EBADR EXFULL ENOANO EBADRQC EBADSLT EDEADLOCK EBFONT ENOSTR ENODATA ETIME ENOSR ENONET ENOPKG EREMOTE ENOLINK EADV ESRMNT ECOMM EPROTO EMULTIHOP EDOTDOT EBADMSG EOVERFLOW ENOTUNIQ EBADFD EREMCHG ELIBACC ELIBBAD ELIBSCN ELIBMAX ELIBEXEC EILSEQ ERESTART ESTRPIPE EUSERS ENOTSOCK EDESTADDRREQ EMSGSIZE EPROTOTYPE ENOPROTOOPT EPROTONOSUPPORT ESOCKTNOSUPPORT EOPNOTSUPP EPFNOSUPPORT EAFNOSUPPORT EADDRINUSE EADDRNOTAVAIL ENETDOWN ENETUNREACH ENETRESET ECONNABORTED ECONNRESET ENOBUFS EISCONN ENOTCONN ESHUTDOWN ETOOMANYREFS ETIMEDOUT ECONNREFUSED EHOSTDOWN EHOSTUNREACH EALREADY EINPROGRESS ESTALE 
EUCLEAN ENOTNAM ENAVAIL EISNAM EREMOTEIO EDQUOT ENOMEDIUM EMEDIUMTYPE ECANCELED ENOKEY EKEYEXPIRED EKEYREVOKED EKEYREJECTED EOWNERDEAD ENOTRECOVERABLE ERFKILL POLLIN POLLPRI POLLOUT POLLRDNORM POLLRDBAND POLLWRNORM POLLWRBAND POLLMSG POLLREMOVE POLLRDHUP POLLERR POLLHUP POLLNVAL +syn keyword pSyscall accept bind close connect fcntl getsockopt ioctl listen poll read readv recv recvfrom recvmsg send sendmsg sendto setsockopt shutdown socket write writev +syn keyword pPythonCmds contained assert print +syn region pPython start='%{' end='}%' contains=pPythonCmds +syn keyword pShellCmds contained sysctl +syn region pShell start='`' end='`' contains=pShellCmds +syn keyword pEllipsis '...' +syn match pInputPkt "\s\+\zs<\ze\s\+" +syn match pOutputPkt "\s\+\zs>\ze\s\+" + +" Below is stuff inherited from C, suitably modified. +" String and Character constants +" Highlight special characters (those which have a backslash) differently +syn match cSpecial display contained "\\\(x\x\+\|\o\{1,3}\|.\|$\)" +syn match cFormat display "%\(\d\+\$\)\=[-+' #0*]*\(\d*\|\*\|\*\d\+\$\)\(\.\(\d*\|\*\|\*\d\+\$\)\)\=\([hlLjzt]\|ll\|hh\)\=\([aAbdiuoxXDOUfFeEgGcCsSpn]\|\[\^\=.[^]]*\]\)" contained +syn match cFormat display "%%" contained +syn region cString start=+L\="+ skip=+\\\\\|\\"+ end=+"+ contains=cSpecial,cFormat,@Spell +" cCppString: same as cString, but ends at end of line +syn region cCppString start=+L\="+ skip=+\\\\\|\\"\|\\$+ excludenl end=+"+ end='$' contains=cSpecial,cFormat,@Spell + +" This should be before cErrInParen to avoid problems with #define ({ xxx }) +syn match cCurlyError "}" +syn region cBlock start="{" end="}" contains=ALLBUT,cBadBlock,cCurlyError,@cParenGroup,cErrInParen,cCppParen,cErrInBracket,cCppBracket,cCppString,@Spell fold + +"catch errors caused by wrong parenthesis and brackets +" also accept <% for {, %> for }, <: for [ and :> for ] (C99) +" But avoid matching <::. 
+syn cluster cParenGroup contains=cParenError,cSpecial,cCommentSkip,cCommentString,cComment2String,@cCommentGroup,cCommentStartError,cUserCont,cBitField,cOctalZero,@cCppOutInGroup,cFormat,cNumber,cFloat,cOctal,cOctalError,cNumbersCom +syn region cParen transparent start='(' end=')' end='}'me=s-1 contains=ALLBUT,@cParenGroup,cCppParen,cErrInBracket,cCppBracket,cCppString,@Spell +" cCppParen: same as cParen but ends at end-of-line; used in cDefine +syn region cCppParen transparent start='(' skip='\\$' excludenl end=')' end='$' contained contains=ALLBUT,@cParenGroup,cErrInBracket,cParen,cBracket,cString,@Spell +syn match cParenError display "[\])]" +"syn match cErrInParen display contained "[\]{}]\|<%\|%>" +syn region cBracket transparent start='\[\|<::\@!' end=']\|:>' end='}'me=s-1 contains=ALLBUT,@cParenGroup,cErrInParen,cCppParen,cCppBracket,cCppString,@Spell +" cCppBracket: same as cParen but ends at end-of-line; used in cDefine +syn region cCppBracket transparent start='\[\|<::\@!' skip='\\$' excludenl end=']\|:>' end='$' contained contains=ALLBUT,@cParenGroup,cErrInParen,cParen,cBracket,cString,@Spell +"syn match cErrInBracket display contained "[);{}]\|<%\|%>" + +"integer number, or floating point number without a dot and with "f". 
+syn case ignore +syn match cNumbers display transparent "\<\d\|\.\d" contains=cNumber,cFloat,cOctalError,cOctal +" Same, but without octal error (for comments) +syn match cNumbersCom display contained transparent "\<\d\|\.\d" contains=cNumber,cFloat,cOctal +syn match cNumber display contained "\d\+\(u\=l\{0,2}\|ll\=u\)\>" +"hex number +syn match cNumber display contained "0x\x\+\(u\=l\{0,2}\|ll\=u\)\>" +" Flag the first zero of an octal number as something special +syn match cOctal display contained "0\o\+\(u\=l\{0,2}\|ll\=u\)\>" contains=cOctalZero +syn match cOctalZero display contained "\<0" +syn match cFloat display contained "\d\+f" +"floating point number, with dot, optional exponent +syn match cFloat display contained "\d\+\.\d*\(e[-+]\=\d\+\)\=[fl]\=" +"floating point number, starting with a dot, optional exponent +syn match cFloat display contained "\.\d\+\(e[-+]\=\d\+\)\=[fl]\=\>" +"floating point number, without dot, with exponent +syn match cFloat display contained "\d\+e[-+]\=\d\+[fl]\=\>" + +" flag an octal number with wrong digits +syn match cOctalError display contained "0\o*[89]\d*" +syn case match + +syn region cCommentL start="//" skip="\\$" end="$" keepend contains=@cCommentGroup,cSpaceError,@Spell +syn region cComment matchgroup=cCommentStart start="/\*" end="\*/" contains=@cCommentGroup,cCommentStartError,cSpaceError,@Spell extend + +" keep a // comment separately, it terminates a preproc. conditional +syn match cCommentError display "\*/" +syn match cCommentStartError display "/\*"me=e-1 contained + +" Define the default highlighting. 
+" Only used when an item doesn't have highlighting yet +hi def link pKeyword Conditional +hi def link pConstant Constant +hi def link pSyscall Type +hi def link pPythonCmds Label +hi def link pPython PreProc +hi def link pShellCmds Label +hi def link pShell PreCondit +hi def link pEllipsis String +hi def link pInputPkt Todo +hi def link pOutputPkt Error + +hi def link cFormat cSpecial +hi def link cCppString cString +hi def link cCommentL cComment +hi def link cCommentStart cComment +hi def link cNumber Number +hi def link cOctal Number +hi def link cOctalZero PreProc " link this to Error if you want +hi def link cFloat Float +hi def link cOctalError cError +hi def link cParenError cError +hi def link cErrInParen cError +hi def link cErrInBracket cError +hi def link cCommentError cError +hi def link cCommentStartError cError +hi def link cSpecialError cError +hi def link cError Error +hi def link cCommentString cString +hi def link cComment2String cString +hi def link cCommentSkip cComment +hi def link cString String +hi def link cComment Comment +hi def link cSpecial SpecialChar +hi def link cCppOut Comment + +let b:current_syntax = "packetdrill" + +let &cpo = s:cpo_save +unlet s:cpo_save +" vim: ts=8 diff --git a/test/packetdrill/epoll.c b/test/packetdrill/epoll.c new file mode 100644 index 0000000..5e3e79c --- /dev/null +++ b/test/packetdrill/epoll.c @@ -0,0 +1,55 @@ +/* + * Copyright 2017 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: weiwan@google.com (Wei Wang) + * + * Implementation for the epoll fd related state and logic. + */ + +#include "epoll.h" + +#include +#include +#include "run.h" + +void epoll_free(struct epoll *epoll) +{ + memset(epoll, 0, sizeof(*epoll)); + free(epoll); +} + +void epoll_close(struct state *state, struct fd_state *fd) +{ + epoll_free(fd_to_epoll(fd)); +} + +/* Global info about epoll descriptors that point to epolls. */ +struct fd_ops epoll_ops = { + .type = FD_EPOLL, + .close = epoll_close, +}; + +struct epoll *epoll_new(struct state *state) +{ + struct epoll *epoll = calloc(1, sizeof(struct epoll)); + + epoll->fd.ops = &epoll_ops; + state_add_fd(state, to_fd(epoll)); + return epoll; +} diff --git a/test/packetdrill/epoll.h b/test/packetdrill/epoll.h new file mode 100644 index 0000000..dac032a --- /dev/null +++ b/test/packetdrill/epoll.h @@ -0,0 +1,62 @@ +/* + * Copyright 2017 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ +/* + * Author: weiwan@google.com (Wei Wang) + * + * Interface for tracking epolls in the kernel under test. + */ + +#ifndef __EPOLL_HDR_H__ +#define __EPOLL_HDR_H__ + +#include "types.h" + +#include "fd_state.h" + +/* Type specification for epoll_event->data */ +enum epoll_data_type_t { + EPOLL_DATA_PTR = 1, + EPOLL_DATA_FD, + EPOLL_DATA_U32, + EPOLL_DATA_U64, +}; + +/* The runtime state for epoll */ +struct epoll { + /* NOTE: struct fd_state must be first field in all fd flavors. */ + struct fd_state fd; /* info about fd for this epoll event */ +}; + +/* Convert to epoll pointer if the fd has type FD_EPOLL, + * otherwise return NULL. + */ +static inline struct epoll *fd_to_epoll(struct fd_state *fd) +{ + if (fd && fd->ops->type == FD_EPOLL) + return (struct epoll *)fd; + else + return NULL; +} + +struct state; + +/* Allocate and return a new epoll object. */ +extern struct epoll *epoll_new(struct state *state); + +#endif /* __EPOLL_HDR_H__ */ diff --git a/test/packetdrill/ethernet.h b/test/packetdrill/ethernet.h new file mode 100644 index 0000000..5713d04 --- /dev/null +++ b/test/packetdrill/ethernet.h @@ -0,0 +1,75 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Ethernet-related declarations. + * + * We cannot include the kernel's linux/if_ether.h because this tool + * tries to compile and work for basically any Linux/BSD kernel + * version. So we have our version of the Ethernet-related + * declarations we require here. + */ + +#ifndef __ETHERNET_H__ +#define __ETHERNET_H__ + +#include "types.h" + +/* Bytes in an Ethernet address. */ +#define ETH_ALEN 6 + +/* Ethernet header ether_type values. */ +#define ETHERTYPE_IP 0x0800 /* IP protocol version 4 */ +#define ETHERTYPE_IPV6 0x86dd /* IP protocol version 6 */ +#define ETHERTYPE_MPLS_UC 0x8847 /* MPLS unicast */ +#define ETHERTYPE_MPLS_MC 0x8848 /* MPLS multicast */ + +/* To tell a packet socket that you want traffic for all protocols. */ +#define ETH_P_ALL 0x0003 + +/* Ethernet address. */ +struct ether_addr { + u8 ether_addr_octet[ETH_ALEN]; +} __attribute__ ((__packed__)); + +/* Ethernet header. */ +struct ether_header { + u8 ether_dhost[ETH_ALEN]; /* destination Ethernet address */ + u8 ether_shost[ETH_ALEN]; /* source Ethernet address */ + u16 ether_type; /* packet type ID field */ +} __attribute__ ((__packed__)); + +static inline void ether_copy(void *dst, const void *src) +{ + memcpy(dst, src, sizeof(struct ether_addr)); +} + +/* Return the ether_type field for packets of the given address family. */ +static inline u16 ether_type_for_family(int address_family) +{ + if (address_family == AF_INET) + return ETHERTYPE_IP; + else if (address_family == AF_INET6) + return ETHERTYPE_IPV6; + else + assert(!"bad address family"); +} + +#endif /* __ETHERNET_H__ */ diff --git a/test/packetdrill/fd_state.h b/test/packetdrill/fd_state.h new file mode 100644 index 0000000..f08a559 --- /dev/null +++ b/test/packetdrill/fd_state.h @@ -0,0 +1,64 @@ +/* + * Copyright 2017 Google Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Interface for tracking file descriptors in the kernel under test. + */ + +#ifndef __FD_STATE_H__ +#define __FD_STATE_H__ + +#include "types.h" + +/* The types of file descriptor objects packetdrill can test. */ +enum fd_type_t { + FD_SOCKET = 1, + FD_FILE, + FD_PIPE, + FD_EPOLL, +}; + +struct state; +struct fd_state; + +/* Global info about a particular kind of file descriptor. */ +struct fd_ops { + enum fd_type_t type; /* type of this file descriptor */ + + /* Handler for closing fd. */ + void (*close)(struct state *state, struct fd_state *fd); +}; + +/* State for a file descriptor during script execution. */ +struct fd_state { + struct fd_ops *ops; /* info/ops for this type of fd */ + int script_fd; /* file descriptor in the script source */ + int live_fd; /* file descriptor in packetdrill runtime */ + bool is_closed; /* has app called close(2) ? */ + struct fd_state *next; /* next fd in linked list */ +}; + +/* To cast any type of fd to the base class. 
*/ +static inline struct fd_state *to_fd(void *fd) +{ + return (struct fd_state *)fd; +} + +#endif /* __FD_STATE_H__ */ diff --git a/test/packetdrill/file.c b/test/packetdrill/file.c new file mode 100644 index 0000000..56e6cf3 --- /dev/null +++ b/test/packetdrill/file.c @@ -0,0 +1,55 @@ +/* + * Copyright 2017 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Implementation for the file-related state and logic. + */ + +#include "file.h" + +#include +#include +#include "run.h" + +void file_free(struct file *file) +{ + memset(file, 0, sizeof(*file)); /* paranoia to help catch bugs */ + free(file); +} + +void file_close(struct state *state, struct fd_state *fd) +{ + file_free(fd_to_file(fd)); +} + +/* Global info about file descriptors that point to files. */ +struct fd_ops file_ops = { + .type = FD_FILE, + .close = file_close, +}; + +struct file *file_new(struct state *state) +{ + struct file *file = calloc(1, sizeof(struct file)); + + file->fd.ops = &file_ops; + state_add_fd(state, to_fd(file)); + return file; +} diff --git a/test/packetdrill/file.h b/test/packetdrill/file.h new file mode 100644 index 0000000..22d084f --- /dev/null +++ b/test/packetdrill/file.h @@ -0,0 +1,52 @@ +/* + * Copyright 2017 Google Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Interface for tracking files in the kernel under test. + */ + +#ifndef __FILE_H__ +#define __FILE_H__ + +#include "types.h" + +#include "fd_state.h" + +/* The runtime state for a file */ +struct file { + /* NOTE: struct fd_state must be first field in all fd flavors. */ + struct fd_state fd; /* info about fd for this file */ +}; + +/* Convert to file pointer if the fd is a file, otherwise return NULL. */ +static inline struct file *fd_to_file(struct fd_state *fd) +{ + if (fd && fd->ops->type == FD_FILE) + return (struct file *)fd; + else + return NULL; +} + +struct state; + +/* Allocate and return a new file object. */ +extern struct file *file_new(struct state *state); + +#endif /* __FILE_H__ */ diff --git a/test/packetdrill/fmemopen.c b/test/packetdrill/fmemopen.c new file mode 100644 index 0000000..5ed2be5 --- /dev/null +++ b/test/packetdrill/fmemopen.c @@ -0,0 +1,81 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * FreeBSD does not have an fmemopen(), so we roll our own minimalist + * implementation here. + */ + +#include "types.h" + +#include <stdlib.h> /* calloc, free */ +#include <string.h> /* memcpy, strcmp */ + +#include "assert.h" +#include "fmemopen.h" + +#if !defined(HAVE_FMEMOPEN) + +struct fmemopen_read_state { + char *next; /* the next byte to return */ + char *end; /* the byte after the end of the string */ +}; + +static int fmemopen_readfn(void *cookie, char *buf, int len) +{ + struct fmemopen_read_state *read_cookie = + (struct fmemopen_read_state *)cookie; + int bytes = 0; + + assert(read_cookie->next <= read_cookie->end); + if (read_cookie->next == read_cookie->end) + return 0; + + bytes = read_cookie->end - read_cookie->next; + if (len < bytes) + bytes = len; + + memcpy(buf, read_cookie->next, bytes); + read_cookie->next += bytes; + + return bytes; +} + +FILE *fmemopen(char *str, size_t size, const char *mode) +{ + FILE *f = NULL; + struct fmemopen_read_state *read_cookie; + + assert(strcmp(mode, "r") == 0); /* only support read for now */ + + read_cookie = calloc(1, sizeof(struct fmemopen_read_state)); + read_cookie->next = str; + read_cookie->end = str + size; + + f = fropen(read_cookie, fmemopen_readfn); + if (!f) { + free(read_cookie); + return NULL; + } + + return f; +} + +#endif /* HAVE_FMEMOPEN */ diff --git a/test/packetdrill/fmemopen.h b/test/packetdrill/fmemopen.h new file mode 100644 index 0000000..f4bdbbb --- /dev/null +++ b/test/packetdrill/fmemopen.h @@ -0,0 +1,37 @@ +/* +
* Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * FreeBSD does not have an fmemopen(), so we roll our own minimalist + * implementation here. + */ + +#ifndef __FMEMOPEN_H__ +#define __FMEMOPEN_H__ + +#ifndef HAVE_FMEMOPEN + +#include "types.h" + +extern FILE *fmemopen(char *buf, size_t size, const char *mode); + +#endif /* HAVE_FMEMOPEN */ + +#endif /* __FMEMOPEN_H__ */ diff --git a/test/packetdrill/gre.h b/test/packetdrill/gre.h new file mode 100644 index 0000000..8947ccd --- /dev/null +++ b/test/packetdrill/gre.h @@ -0,0 +1,102 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Our own GRE header declarations, so we have something that's + * portable and somewhat more readable than a typical system header + * file. + * + * We cannot include the kernel's GRE .h files because this tool tries + * to compile and work for basically any Linux/BSD kernel version. So + * we declare our own version of various GRE-related definitions here. + */ + +#ifndef __GRE_HEADERS_H__ +#define __GRE_HEADERS_H__ + +#include "types.h" + +/* GRE header. See RFC 1701. */ + +#define GRE_MINLEN 4 /* smallest possible GRE header */ + +#define GRE_FLAG_C 0x8000 /* checksum */ +#define GRE_FLAG_R 0x4000 /* routing */ +#define GRE_FLAG_K 0x2000 /* key */ +#define GRE_FLAG_S 0x1000 /* sequence */ + +struct gre { + union { + __be16 flags; + + struct { +#if __BYTE_ORDER == __LITTLE_ENDIAN + __u16 recursion_control:3, + strict_route:1, + has_seq:1, + has_key:1, + has_routing:1, + has_checksum:1, + version:3, + reserved:4, + ack:1; +#elif __BYTE_ORDER == __BIG_ENDIAN + __u16 has_checksum:1, + has_routing:1, + has_key:1, + has_seq:1, + strict_route:1, + recursion_control:3, + ack:1, + reserved:4, + version:3; +#else +# error "Please fix endianness defines" +#endif + }; + }; + __be16 proto; + + /* The optional header fields live here. */ + union { + __be16 be16[6]; + __be32 be32[3]; + }; +}; + +/* Return the length in bytes of a GRE header. */ +static inline int gre_len(const struct gre *gre) +{ + int bytes = GRE_MINLEN; + + assert(gre->version == 0); /* we only support v0 */ + assert(!gre->has_routing); /* routing info is variable-length! 
*/ + + if (gre->has_checksum || gre->has_routing) + bytes += 4; + if (gre->has_key) + bytes += 4; + if (gre->has_seq) + bytes += 4; + + return bytes; +} + +#endif /* __GRE_HEADERS_H__ */ diff --git a/test/packetdrill/gre_packet.c b/test/packetdrill/gre_packet.c new file mode 100644 index 0000000..235e6d9 --- /dev/null +++ b/test/packetdrill/gre_packet.c @@ -0,0 +1,56 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Implementation for module for formatting GRE packets. 
+ */ + +#include "gre_packet.h" + +#include "ip_packet.h" +#include "gre.h" + +int gre_header_append(struct packet *packet, const struct gre *gre, char **error) +{ + struct header *header; + + header = packet_append_header(packet, HEADER_GRE, gre_len(gre)); + if (header == NULL) { + asprintf(error, "too many headers"); + return STATUS_ERR; + } + + memcpy(header->h.gre, gre, gre_len(gre)); + + return STATUS_OK; +} + +int gre_header_finish(struct packet *packet, + struct header *header, struct header *next_inner) +{ + struct gre *gre = header->h.gre; + int gre_bytes = gre_len(gre) + next_inner->total_bytes; + + gre->proto = htons(header_type_info(next_inner->type)->eth_proto); + + header->total_bytes = gre_bytes; + + return STATUS_OK; +} diff --git a/test/packetdrill/gre_packet.h b/test/packetdrill/gre_packet.h new file mode 100644 index 0000000..bceee2d --- /dev/null +++ b/test/packetdrill/gre_packet.h @@ -0,0 +1,45 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Interface for module for formatting GRE packets. + */ + +#ifndef __GRE_PACKET_H__ +#define __GRE_PACKET_H__ + +#include "types.h" + +#include "packet.h" + +/* Append a GRE header to the end of the given packet. 
On success, + * return STATUS_OK; on error return STATUS_ERR and fill in a + * malloc-allocated error message in *error. + */ +extern int gre_header_append(struct packet *packet, + const struct gre *gre, char **error); + +/* Finalize the GRE header by filling in all necessary fields that + * were not filled in at parse time. + */ +extern int gre_header_finish(struct packet *packet, + struct header *header, struct header *next_inner); + +#endif /* __GRE_PACKET_H__ */ diff --git a/test/packetdrill/hash.c b/test/packetdrill/hash.c new file mode 100644 index 0000000..7cfcd1a --- /dev/null +++ b/test/packetdrill/hash.c @@ -0,0 +1,430 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/*--------------------------------------------------------------------------- + * From public domain code at: + * http://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp + */ + +/*--------------------------------------------------------------------------- + * MurmurHash3 was written by Austin Appleby, and is placed in the public + * domain. The author hereby disclaims copyright to this source code. + * + * Note - The x86 and x64 versions do _not_ produce the same results, as the + * algorithms are optimized for their respective platforms. 
You can still + * compile and run any of them on any platform, but your performance with the + * non-native version will be less than optimal. + */ +#include "hash.h" + +/*--------------------------------------------------------------------------- + * Platform-specific functions and macros + */ + +static __always_inline u32 rotl32(u32 x, s8 r) +{ + return (x << r) | (x >> (32 - r)); +} + +static __always_inline u64 rotl64(u64 x, s8 r) +{ + return (x << r) | (x >> (64 - r)); +} + +#define ROTL32(x, y) rotl32(x, y) +#define ROTL64(x, y) rotl64(x, y) + +#define BIG_CONSTANT(x) (x##LLU) + +/*--------------------------------------------------------------------------- + * Block read - if your platform needs to do endian-swapping or can only + * handle aligned reads, do the conversion here + */ + +static __always_inline u32 getblock_32(const u32 *p, int i) +{ + return p[i]; +} + +static __always_inline u64 getblock_64(const u64 *p, int i) +{ + return p[i]; +} + +/*--------------------------------------------------------------------------- + * Finalization mix - force all bits of a hash block to avalanche + */ + +static __always_inline u32 fmix_32(u32 h) +{ + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +/*---------*/ + +static __always_inline u64 fmix_64(u64 k) +{ + k ^= k >> 33; + k *= BIG_CONSTANT(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); + k ^= k >> 33; + + return k; +} + +/*---------------------------------------------------------------------------*/ + +void MurmurHash3_x86_32(const void *key, int len, u32 seed, void *out) +{ + const u8 *data = (const u8 *)key; + const int nblocks = len / 4; + + u32 h1 = seed; + + u32 c1 = 0xcc9e2d51; + u32 c2 = 0x1b873593; + + /*---------*/ + /* body */ + + const u32 *blocks = (const u32 *)(data + nblocks * 4); + + int i; + for (i = -nblocks; i; i++) { + u32 k1 = getblock_32(blocks, i); + + k1 *= c1; + k1 = ROTL32(k1, 15); + k1 *= c2; + + h1 
^= k1; + h1 = ROTL32(h1, 13); + h1 = h1 * 5 + 0xe6546b64; + } + + /*---------*/ + /* tail */ + + const u8 *tail = (const u8 *)(data + nblocks * 4); + + u32 k1 = 0; + + switch (len & 3) { + case 3: + k1 ^= tail[2] << 16; + case 2: + k1 ^= tail[1] << 8; + case 1: + k1 ^= tail[0]; + k1 *= c1; + k1 = ROTL32(k1, 15); + k1 *= c2; + h1 ^= k1; + }; + + /*---------*/ + /* finalization */ + + h1 ^= len; + + h1 = fmix_32(h1); + + *(u32 *) out = h1; +} + +/*---------------------------------------------------------------------------*/ + +void MurmurHash3_x86_128(const void *key, const int len, u32 seed, void *out) +{ + const u8 *data = (const u8 *)key; + const int nblocks = len / 16; + + u32 h1 = seed; + u32 h2 = seed; + u32 h3 = seed; + u32 h4 = seed; + + u32 c1 = 0x239b961b; + u32 c2 = 0xab0e9789; + u32 c3 = 0x38b34ae5; + u32 c4 = 0xa1e38b93; + + /*---------*/ + /* body */ + + const u32 *blocks = (const u32 *)(data + nblocks * 16); + + int i; + for (i = -nblocks; i; i++) { + u32 k1 = getblock_32(blocks, i * 4 + 0); + u32 k2 = getblock_32(blocks, i * 4 + 1); + u32 k3 = getblock_32(blocks, i * 4 + 2); + u32 k4 = getblock_32(blocks, i * 4 + 3); + + k1 *= c1; + k1 = ROTL32(k1, 15); + k1 *= c2; + h1 ^= k1; + + h1 = ROTL32(h1, 19); + h1 += h2; + h1 = h1 * 5 + 0x561ccd1b; + + k2 *= c2; + k2 = ROTL32(k2, 16); + k2 *= c3; + h2 ^= k2; + + h2 = ROTL32(h2, 17); + h2 += h3; + h2 = h2 * 5 + 0x0bcaa747; + + k3 *= c3; + k3 = ROTL32(k3, 17); + k3 *= c4; + h3 ^= k3; + + h3 = ROTL32(h3, 15); + h3 += h4; + h3 = h3 * 5 + 0x96cd1c35; + + k4 *= c4; + k4 = ROTL32(k4, 18); + k4 *= c1; + h4 ^= k4; + + h4 = ROTL32(h4, 13); + h4 += h1; + h4 = h4 * 5 + 0x32ac3b17; + } + + /*---------*/ + /* tail */ + + const u8 *tail = (const u8 *)(data + nblocks * 16); + + u32 k1 = 0; + u32 k2 = 0; + u32 k3 = 0; + u32 k4 = 0; + + switch (len & 15) { + case 15: + k4 ^= tail[14] << 16; + case 14: + k4 ^= tail[13] << 8; + case 13: + k4 ^= tail[12] << 0; + k4 *= c4; + k4 = ROTL32(k4, 18); + k4 *= c1; + h4 ^= k4; + + case 
12: + k3 ^= tail[11] << 24; + case 11: + k3 ^= tail[10] << 16; + case 10: + k3 ^= tail[9] << 8; + case 9: + k3 ^= tail[8] << 0; + k3 *= c3; + k3 = ROTL32(k3, 17); + k3 *= c4; + h3 ^= k3; + + case 8: + k2 ^= tail[7] << 24; + case 7: + k2 ^= tail[6] << 16; + case 6: + k2 ^= tail[5] << 8; + case 5: + k2 ^= tail[4] << 0; + k2 *= c2; + k2 = ROTL32(k2, 16); + k2 *= c3; + h2 ^= k2; + + case 4: + k1 ^= tail[3] << 24; + case 3: + k1 ^= tail[2] << 16; + case 2: + k1 ^= tail[1] << 8; + case 1: + k1 ^= tail[0] << 0; + k1 *= c1; + k1 = ROTL32(k1, 15); + k1 *= c2; + h1 ^= k1; + }; + + /*---------*/ + /* finalization */ + + h1 ^= len; + h2 ^= len; + h3 ^= len; + h4 ^= len; + + h1 += h2; + h1 += h3; + h1 += h4; + h2 += h1; + h3 += h1; + h4 += h1; + + h1 = fmix_32(h1); + h2 = fmix_32(h2); + h3 = fmix_32(h3); + h4 = fmix_32(h4); + + h1 += h2; + h1 += h3; + h1 += h4; + h2 += h1; + h3 += h1; + h4 += h1; + + ((u32 *) out)[0] = h1; + ((u32 *) out)[1] = h2; + ((u32 *) out)[2] = h3; + ((u32 *) out)[3] = h4; +} + +/*---------------------------------------------------------------------------*/ + +void MurmurHash3_x64_128(const void *key, const int len, + const u32 seed, void *out) +{ + const u8 *data = (const u8 *)key; + const int nblocks = len / 16; + + u64 h1 = seed; + u64 h2 = seed; + + u64 c1 = BIG_CONSTANT(0x87c37b91114253d5); + u64 c2 = BIG_CONSTANT(0x4cf5ad432745937f); + + /*---------*/ + /* body */ + + const u64 *blocks = (const u64 *)(data); + + int i; + for (i = 0; i < nblocks; i++) { + u64 k1 = getblock_64(blocks, i * 2 + 0); + u64 k2 = getblock_64(blocks, i * 2 + 1); + + k1 *= c1; + k1 = ROTL64(k1, 31); + k1 *= c2; + h1 ^= k1; + + h1 = ROTL64(h1, 27); + h1 += h2; + h1 = h1 * 5 + 0x52dce729; + + k2 *= c2; + k2 = ROTL64(k2, 33); + k2 *= c1; + h2 ^= k2; + + h2 = ROTL64(h2, 31); + h2 += h1; + h2 = h2 * 5 + 0x38495ab5; + } + + /*---------*/ + /* tail */ + + const u8 *tail = (const u8 *)(data + nblocks * 16); + + u64 k1 = 0; + u64 k2 = 0; + + switch (len & 15) { + case 15: + k2 ^= 
(u64) (tail[14]) << 48; + case 14: + k2 ^= (u64) (tail[13]) << 40; + case 13: + k2 ^= (u64) (tail[12]) << 32; + case 12: + k2 ^= (u64) (tail[11]) << 24; + case 11: + k2 ^= (u64) (tail[10]) << 16; + case 10: + k2 ^= (u64) (tail[9]) << 8; + case 9: + k2 ^= (u64) (tail[8]) << 0; + k2 *= c2; + k2 = ROTL64(k2, 33); + k2 *= c1; + h2 ^= k2; + + case 8: + k1 ^= (u64) (tail[7]) << 56; + case 7: + k1 ^= (u64) (tail[6]) << 48; + case 6: + k1 ^= (u64) (tail[5]) << 40; + case 5: + k1 ^= (u64) (tail[4]) << 32; + case 4: + k1 ^= (u64) (tail[3]) << 24; + case 3: + k1 ^= (u64) (tail[2]) << 16; + case 2: + k1 ^= (u64) (tail[1]) << 8; + case 1: + k1 ^= (u64) (tail[0]) << 0; + k1 *= c1; + k1 = ROTL64(k1, 31); + k1 *= c2; + h1 ^= k1; + }; + + /*---------*/ + /* finalization */ + + h1 ^= len; + h2 ^= len; + + h1 += h2; + h2 += h1; + + h1 = fmix_64(h1); + h2 = fmix_64(h2); + + h1 += h2; + h2 += h1; + + ((u64 *) out)[0] = h1; + ((u64 *) out)[1] = h2; +} + +/*---------------------------------------------------------------------------*/ diff --git a/test/packetdrill/hash.h b/test/packetdrill/hash.h new file mode 100644 index 0000000..ab2ba52 --- /dev/null +++ b/test/packetdrill/hash.h @@ -0,0 +1,43 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ +/* From: http://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.h */ + +/*--------------------------------------------------------------------------- + * MurmurHash3 was written by Austin Appleby, and is placed in the public + * domain. The author hereby disclaims copyright to this source code. + */ + +#ifndef _MURMURHASH3_H_ +#define _MURMURHASH3_H_ + +#include "types.h" + +#include + +/*---------------------------------------------------------------------------*/ + +void MurmurHash3_x86_32(const void *key, int len, u32 seed, void *out); + +void MurmurHash3_x86_128(const void *key, int len, u32 seed, void *out); + +void MurmurHash3_x64_128(const void *key, int len, u32 seed, void *out); + +/*---------------------------------------------------------------------------*/ + +#endif /* _MURMURHASH3_H_ */ diff --git a/test/packetdrill/hash_map.c b/test/packetdrill/hash_map.c new file mode 100644 index 0000000..c18af55 --- /dev/null +++ b/test/packetdrill/hash_map.c @@ -0,0 +1,162 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Implementation for a simple hash map mapping u32 keys to u32 values. 
+ */ + +#include "hash_map.h" + +#include <stdlib.h> /* calloc, free */ +#include <string.h> /* memset */ +#include "hash.h" + +static const size_t MAX_BUCKETS = 1ULL << 30; /* max 1B buckets */ + +/* Hash a key. We use the fast, public-domain MurmurHash3. */ +static inline size_t hash_key(u32 key) +{ + u32 hash; + MurmurHash3_x86_32(&key, sizeof(key), 0, &hash); + return hash; +} + +/* Find the bucket number for a key. */ +static inline size_t hash_bucket_num(const struct hash_map *map, u32 key) +{ + size_t bucket_num = hash_key(key) & map->bucket_mask; + return bucket_num; +} + +/* Try to find the smallest bucket count that is a power of 2 and is + * greater than or equal to the given number of keys (capped at MAX_BUCKETS). + */ +static inline size_t hash_map_pick_bucket_count(size_t num_keys) +{ + size_t buckets = 1; + while ((buckets < num_keys) && (buckets < MAX_BUCKETS)) + buckets <<= 1; + return buckets; +} + +struct hash_map *hash_map_new(size_t num_keys) +{ + struct hash_map *map = calloc(1, sizeof(struct hash_map)); + map->num_buckets = hash_map_pick_bucket_count(num_keys); + map->bucket_mask = map->num_buckets - 1; + map->buckets = calloc(map->num_buckets, sizeof(struct hash_node *)); + return map; +} + +void hash_map_free(struct hash_map *map) +{ + /* Walk through the buckets and free nodes. */ + size_t bucket_num; + for (bucket_num = 0; bucket_num < map->num_buckets; ++bucket_num) { + struct hash_node *node = NULL; + struct hash_node *next = NULL; + for (node = map->buckets[bucket_num]; node != NULL; + node = next) { + next = node->next; + free(node); + } + } + + free(map->buckets); + memset(map, 0, sizeof(*map)); /* paranoia to help catch bugs */ + free(map); +} + +/* Link the given node into the correct bucket linked list in the hash map. */ +static void hash_map_link(struct hash_map *map, + struct hash_node *node) +{ + const size_t bucket_num = hash_bucket_num(map, node->key); + node->next = map->buckets[bucket_num]; + map->buckets[bucket_num] = node; +} + +/* Create a new array of buckets that's twice the size of the current + * array.
Then walk through the old buckets and move all the nodes to + * the new buckets. + */ +static void hash_map_grow(struct hash_map *map) +{ + const size_t old_num_buckets = map->num_buckets; + map->num_buckets *= 2; + map->bucket_mask = map->num_buckets - 1; + struct hash_node **old_buckets = map->buckets; + map->buckets = calloc(map->num_buckets, sizeof(struct hash_node *)); + + size_t old_bucket_num = 0; + for (old_bucket_num = 0; old_bucket_num < old_num_buckets; + ++old_bucket_num) { + struct hash_node *node = NULL; + struct hash_node *next = NULL; + for (node = old_buckets[old_bucket_num]; node != NULL; + node = next) { + next = node->next; + hash_map_link(map, node); + } + } + + free(old_buckets); +} + +/* Insert a new node in the hash map, first growing the map if needed. */ +static void hash_map_insert(struct hash_map *map, u32 key, u32 value) +{ + /* To keep things simple, we target a load factor of 1.0. */ + if ((map->num_keys >= map->num_buckets) && + (map->num_buckets < MAX_BUCKETS)) { + hash_map_grow(map); + } + ++map->num_keys; + struct hash_node *node = calloc(1, sizeof(struct hash_node)); + node->key = key; + node->value = value; + hash_map_link(map, node); +} + +void hash_map_set(struct hash_map *map, u32 key, u32 value) +{ + const size_t bucket_num = hash_bucket_num(map, key); + struct hash_node *node = NULL; + for (node = map->buckets[bucket_num]; node != NULL; node = node->next) { + if (node->key == key) { + node->value = value; + return; + } + } + hash_map_insert(map, key, value); +} + +bool hash_map_get(const struct hash_map *map, u32 key, u32 *value) +{ + const size_t bucket_num = hash_bucket_num(map, key); + struct hash_node *node = NULL; + for (node = map->buckets[bucket_num]; node != NULL; node = node->next) { + if (node->key == key) { + *value = node->value; + return true; + } + } + return false; +} diff --git a/test/packetdrill/hash_map.h b/test/packetdrill/hash_map.h new file mode 100644 index 0000000..f6805e2 --- /dev/null +++
b/test/packetdrill/hash_map.h @@ -0,0 +1,56 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Interface and data structure declarations for a simple hash map + * mapping u32 keys to u32 values. + */ + +#ifndef __HASH_MAP_H__ +#define __HASH_MAP_H__ + +#include "types.h" + +/* Node for hash table buckets; maps u32 key to u32 value. */ +struct hash_node { + u32 key; + u32 value; + struct hash_node *next; +}; + +/* Hash map mapping u32 to u32. */ +struct hash_map { + size_t num_keys; /* number of keys */ + size_t num_buckets; /* number of buckets (a power of 2) */ + size_t bucket_mask; /* bit mask to find bucket number */ + struct hash_node **buckets; /* array of hash buckets */ +}; + +extern struct hash_map *hash_map_new(size_t num_keys); + +extern void hash_map_free(struct hash_map *map); + +extern void hash_map_set(struct hash_map *map, + u32 key, u32 value); + +extern bool hash_map_get(const struct hash_map *map, + u32 key, u32 *value); + +#endif /* __HASH_MAP_H__ */ diff --git a/test/packetdrill/header.h b/test/packetdrill/header.h new file mode 100644 index 0000000..bfd339f --- /dev/null +++ b/test/packetdrill/header.h @@ -0,0 +1,93 @@ +/* + * Copyright 2013 Google Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Interface and type declarations for packetdrill's representation of + * packet headers. We support multi-layer encapsulation. In order to + * make it easier to iterate through all the headers in a packet, we + * keep separate, explicit metadata about the types and locations of + * headers in a packet. + */ + +#ifndef __HEADER_H__ +#define __HEADER_H__ + +#include "types.h" + +#include +#include "assert.h" +#include "gre.h" +#include "icmp.h" +#include "icmpv6.h" +#include "ip.h" +#include "ipv6.h" +#include "mpls.h" +#include "tcp.h" +#include "udp.h" + +struct packet; + +/* The type of a header in a packet. */ +enum header_t { + HEADER_NONE, + HEADER_IPV4, + HEADER_IPV6, + HEADER_GRE, + HEADER_MPLS, + HEADER_TCP, + HEADER_UDP, + HEADER_ICMPV4, + HEADER_ICMPV6, + HEADER_NUM_TYPES +}; + +/* Metadata about a header in a packet. We support multi-layer encapsulation. 
*/ +struct header { + enum header_t type; /* type of this header */ + u32 header_bytes; /* length of this header */ + u32 total_bytes; /* length of header plus data inside */ + union { + u8 *ptr; /* a pointer to the header bits */ + struct ipv4 *ipv4; + struct ipv6 *ipv6; + struct gre *gre; + struct mpls *mpls; + struct tcp *tcp; + struct udp *udp; + struct icmpv4 *icmpv4; + struct icmpv6 *icmpv6; + } h; +}; + +/* Info for a particular type of header. */ +struct header_type_info { + const char* name; /* human-readable protocol name */ + u8 ip_proto; /* IP protocol code */ + u16 eth_proto; /* Ethernet protocol code */ + + /* Call this to finalize the header once we know what's inside... */ + int (*finish)(struct packet *packet, + struct header *header, struct header *next_inner); +}; + +/* Return the info for the given type of header. */ +extern struct header_type_info *header_type_info(enum header_t header_type); + +#endif /* __HEADER_H__ */ diff --git a/test/packetdrill/icmp.h b/test/packetdrill/icmp.h new file mode 100644 index 0000000..a35c8b5 --- /dev/null +++ b/test/packetdrill/icmp.h @@ -0,0 +1,97 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Our own ICMPv4 header declarations, so we have something that's + * portable and somewhat more readable than a typical system header + * file. + */ + +#ifndef __ICMP_HEADERS_H__ +#define __ICMP_HEADERS_H__ + +#include "types.h" + +/* Most ICMP message types include a copy of the outbound IP header + * and the first few bytes inside, to allow the receiver to demux by + * TCP/UDP port. The following constant specifies the number of bytes + * of TCP header that we will echo. We echo 8 bytes because that + * is the minimum number of bytes that the Linux TCP stack needs to + * read the source and destination TCP port and TCP sequence number, + * which it needs to properly demux an incoming ICMP packet to a + * specific TCP connection. + */ +#define ICMP_ECHO_BYTES 8 + +struct icmpv4 { + __u8 type; + __u8 code; + __sum16 checksum; + union { + struct { + __be16 id; + __be16 sequence; + } echo; + __be32 gateway; + struct { + __be16 unused; + __be16 mtu; + } frag; /* PMTU discovery, RFC 1191 */ + } message; +}; + +/* Our own ICMP definitions, since the names vary between platforms.
*/ + +/* ICMPv4 types */ +#define ICMP_ECHOREPLY 0 +#define ICMP_DEST_UNREACH 3 +#define ICMP_SOURCE_QUENCH 4 +#define ICMP_REDIRECT 5 +#define ICMP_ECHO 8 +#define ICMP_TIME_EXCEEDED 11 +#define ICMP_PARAMETERPROB 12 +#define ICMP_TIMESTAMP 13 +#define ICMP_TIMESTAMPREPLY 14 +#define ICMP_INFO_REQUEST 15 +#define ICMP_INFO_REPLY 16 +#define ICMP_ADDRESS 17 +#define ICMP_ADDRESSREPLY 18 +#define NR_ICMP_TYPES 18 + +/* Codes for ICMP_DEST_UNREACH */ +#define ICMP_NET_UNREACH 0 +#define ICMP_HOST_UNREACH 1 +#define ICMP_PROT_UNREACH 2 +#define ICMP_PORT_UNREACH 3 +#define ICMP_FRAG_NEEDED 4 +#define ICMP_SR_FAILED 5 +#define ICMP_NET_UNKNOWN 6 +#define ICMP_HOST_UNKNOWN 7 +#define ICMP_HOST_ISOLATED 8 +#define ICMP_NET_ANO 9 +#define ICMP_HOST_ANO 10 +#define ICMP_NET_UNR_TOS 11 +#define ICMP_HOST_UNR_TOS 12 +#define ICMP_PKT_FILTERED 13 +#define ICMP_PREC_VIOLATION 14 +#define ICMP_PREC_CUTOFF 15 +#define NR_ICMP_UNREACH 15 + +#endif /* __ICMP_HEADERS_H__ */ diff --git a/test/packetdrill/icmp_packet.c b/test/packetdrill/icmp_packet.c new file mode 100644 index 0000000..6dc5f9b --- /dev/null +++ b/test/packetdrill/icmp_packet.c @@ -0,0 +1,406 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Implementation for module for formatting ICMP packets. + */ + +#include "icmp_packet.h" + +#include "icmp.h" +#include "icmpv6.h" +#include "ip_packet.h" + +/* A table entry mapping an ICMP code string to byte. */ +struct icmp_code_info { + u8 code_byte; /* code byte on the wire */ + const char *code_string; /* human-readable code */ +}; + +/* A table entry mapping an ICMP type string to byte and code table. */ +struct icmp_type_info { + u8 type_byte; /* type byte on the wire */ + const char *type_string; /* human-readable type */ + const struct icmp_code_info *code_table; /* codes for this type */ +}; + +/* Values for the 'code' byte of an IPv4 ICMP_DEST_UNREACH header (RFC 1700). */ +struct icmp_code_info icmpv4_unreachable_codes[] = { + { ICMP_NET_UNREACH, "net_unreachable" }, + { ICMP_HOST_UNREACH, "host_unreachable" }, + { ICMP_PROT_UNREACH, "protocol_unreachable" }, + { ICMP_PORT_UNREACH, "port_unreachable" }, + { ICMP_FRAG_NEEDED, "frag_needed" }, + { ICMP_SR_FAILED, "source_route_failed" }, + { ICMP_NET_UNKNOWN, "net_unknown" }, + { ICMP_HOST_UNKNOWN, "host_unknown" }, + { ICMP_HOST_ISOLATED, "source_host_isolated" }, + { ICMP_NET_ANO, "net_prohibited" }, + { ICMP_HOST_ANO, "host_prohibited" }, + { ICMP_NET_UNR_TOS, "net_unreachable_for_tos" }, + { ICMP_HOST_UNR_TOS, "host_unreachable_for_tos" }, + { ICMP_PKT_FILTERED, "packet_filtered" }, + { ICMP_PREC_VIOLATION, "precedence_violation" }, + { ICMP_PREC_CUTOFF, "precedence_cutoff" }, + { 0, NULL }, +}; + +/* Information about the supported types of ICMPv4 header (RFC 1700).
*/ +struct icmp_type_info icmpv4_types[] = { + { ICMP_ECHOREPLY, "echo_reply" }, + { ICMP_DEST_UNREACH, "unreachable", icmpv4_unreachable_codes }, + { ICMP_SOURCE_QUENCH, "source_quench" }, + { ICMP_REDIRECT, "redirect" }, + { ICMP_ECHO, "echo_request" }, + { ICMP_TIME_EXCEEDED, "time_exceeded" }, + { ICMP_PARAMETERPROB, "parameter_problem" }, + { ICMP_TIMESTAMP, "timestamp_request" }, + { ICMP_TIMESTAMPREPLY, "timestamp_reply" }, + { ICMP_INFO_REQUEST, "information_request" }, + { ICMP_INFO_REPLY, "information_reply" }, + { ICMP_ADDRESS, "address_mask_request" }, + { ICMP_ADDRESSREPLY, "address_mask_reply" }, + { 0, NULL, NULL }, +}; + +/* Values for the 'code' byte of an ICMPV6_DEST_UNREACH header (RFC 2463). */ +struct icmp_code_info icmpv6_unreachable_codes[] = { + { ICMP_NET_UNREACH, "net_unreachable" }, + { ICMPV6_NOROUTE, "no_route" }, + { ICMPV6_ADM_PROHIBITED, "admin_prohibited" }, + { ICMPV6_NOT_NEIGHBOUR, "not_neighbour" }, + { ICMPV6_ADDR_UNREACH, "address_unreachable" }, + { ICMPV6_PORT_UNREACH, "port_unreachable" }, + { 0, NULL }, +}; + +/* Values for the 'code' byte of an ICMPV6_TIME_EXCEED header (RFC 2463). */ +struct icmp_code_info icmpv6_time_exceed_codes[] = { + { ICMPV6_EXC_HOPLIMIT, "exceeded_hop_limit" }, + { ICMPV6_EXC_FRAGTIME, "exceeded_frag_time" }, + { 0, NULL }, +}; + +/* Values for the 'code' byte of an ICMPV6_PARAMPROB header (RFC 2463). */ +struct icmp_code_info icmpv6_paramprob_codes[] = { + { ICMPV6_HDR_FIELD, "header_field" }, + { ICMPV6_UNK_NEXTHDR, "unknown_next_header" }, + { ICMPV6_UNK_OPTION, "unknown_option" }, + { 0, NULL }, +}; + +/* Information about the supported types of ICMPv6 header (RFC 2463). 
*/ +struct icmp_type_info icmpv6_types[] = { + { ICMPV6_DEST_UNREACH, "unreachable", icmpv6_unreachable_codes }, + { ICMPV6_PKT_TOOBIG, "packet_too_big" }, + { ICMPV6_TIME_EXCEED, "time_exceeded", icmpv6_time_exceed_codes }, + { ICMPV6_PARAMPROB, "parameter_problem", icmpv6_paramprob_codes }, + { ICMPV6_ECHO_REQUEST, "echo_request" }, + { ICMPV6_ECHO_REPLY, "echo_reply" }, + { 0, NULL, NULL }, +}; + +/* Return the ICMP protocol number for the given address family. */ +static int icmp_protocol(int address_family) +{ + if (address_family == AF_INET) + return IPPROTO_ICMP; + else if (address_family == AF_INET6) + return IPPROTO_ICMPV6; + else + assert(!"bad ip version"); + return 0; +} + +/* Return the length in bytes of the ICMP header. */ +static int icmp_header_len(int address_family) +{ + if (address_family == AF_INET) + return sizeof(struct icmpv4); + else if (address_family == AF_INET6) + return sizeof(struct icmpv6); + else + assert(!"bad ip version"); + return 0; +} + +/* Fill in ICMPv4 header fields. */ +static int set_icmpv4_header(struct icmpv4 *icmpv4, u8 type, u8 code, + s64 mtu, u16 echo_id, char **error) +{ + icmpv4->type = type; + icmpv4->code = code; + icmpv4->checksum = htons(0); + + if (mtu >= 0) { + if ((type != ICMP_DEST_UNREACH) || (code != ICMP_FRAG_NEEDED)) { + asprintf(error, + "ICMPv4 MTU is only valid for " + "unreachable-frag_needed"); + return STATUS_ERR; + } + if (!is_valid_u16(mtu)) { + asprintf(error, "ICMPv4 MTU out of 16-bit range"); + return STATUS_ERR; + } + icmpv4->message.frag.mtu = htons(mtu); + } + if (echo_id > 0) + icmpv4->message.echo.id = htons(echo_id); + + return STATUS_OK; +} + +/* Fill in ICMPv6 header fields.
*/ +static int set_icmpv6_header(struct icmpv6 *icmpv6, u8 type, u8 code, + s64 mtu, u16 echo_id, char **error) +{ + icmpv6->type = type; + icmpv6->code = code; + icmpv6->checksum = htons(0); + + if (mtu >= 0) { + if ((type != ICMPV6_PKT_TOOBIG) || (code != 0)) { + asprintf(error, + "ICMPv6 MTU is only valid for " + "packet_too_big-0"); + return STATUS_ERR; + } + if (!is_valid_u32(mtu)) { + asprintf(error, "ICMPv6 MTU out of 32-bit range"); + return STATUS_ERR; + } + icmpv6->message.packet_too_big.mtu = htonl(mtu); + } + if (echo_id > 0) { + icmpv6->message.u_echo.identifier = htons(echo_id); + } + return STATUS_OK; +} + +/* Populate ICMP header fields. */ +static int set_packet_icmp_header(struct packet *packet, void *icmp, + int address_family, int icmp_bytes, + u8 type, u8 code, s64 mtu, u16 echo_id, + char **error) +{ + struct header *icmp_header = NULL; + + if (address_family == AF_INET) { + struct icmpv4 *icmpv4 = (struct icmpv4 *) icmp; + packet->icmpv4 = icmpv4; + assert(packet->icmpv6 == NULL); + icmp_header = packet_append_header(packet, HEADER_ICMPV4, + sizeof(*icmpv4)); + icmp_header->total_bytes = icmp_bytes; + return set_icmpv4_header(icmpv4, type, code, mtu, echo_id, error); + } else if (address_family == AF_INET6) { + struct icmpv6 *icmpv6 = (struct icmpv6 *) icmp; + packet->icmpv6 = icmpv6; + assert(packet->icmpv4 == NULL); + icmp_header = packet_append_header(packet, HEADER_ICMPV6, + sizeof(*icmpv6)); + icmp_header->total_bytes = icmp_bytes; + return set_icmpv6_header(icmpv6, type, code, mtu, echo_id, error); + } else { + assert(!"bad ip_version in config"); + } + return STATUS_ERR; +} + +/* Parse the given ICMP type and code strings, and fill in the + * *type and *code with the results. If there is an error during + * parsing, fill in *error and return STATUS_ERR; otherwise return + * STATUS_OK. 
+ */ +static int parse_icmp_type_and_code(int address_family, + const char *type_string, + const char *code_string, + s32 *type, s32 *code, char **error) +{ + int i = 0; + const struct icmp_type_info *icmp_types = NULL; + const struct icmp_code_info *code_table = NULL; /* for this type */ + + if (address_family == AF_INET) + icmp_types = icmpv4_types; + else if (address_family == AF_INET6) + icmp_types = icmpv6_types; + else + assert(!"bad ip_version in config"); + + /* Parse the type string. */ + if (sscanf(type_string, "type_%d", type) == 1) { + /* Legal but non-standard type in tcpdump-inspired notation. */ + } else { + /* Look in our table of known types. */ + for (i = 0; icmp_types[i].type_string != NULL; ++i) { + if (!strcmp(type_string, icmp_types[i].type_string)) { + *type = icmp_types[i].type_byte; + code_table = icmp_types[i].code_table; + } + } + } + if (!is_valid_u8(*type)) { + asprintf(error, "bad ICMP type %s", type_string); + return STATUS_ERR; + } + + /* Parse the code string. */ + if (code_string == NULL) { + *code = 0; /* missing code means code = 0 */ + } else if (sscanf(code_string, "code_%d", code) == 1) { + /* Legal but non-standard code in tcpdump-inspired notation. */ + } else if (code_table != NULL) { + /* Look in our table of known codes. 
*/ + for (i = 0; code_table[i].code_string != NULL; ++i) { + if (!strcmp(code_string, code_table[i].code_string)) + *code = code_table[i].code_byte; + } + } + if (!is_valid_u8(*code)) { + asprintf(error, "bad ICMP code %s", code_string); + return STATUS_ERR; + } + + return STATUS_OK; +} + +struct packet *new_icmp_packet(int address_family, + enum direction_t direction, + const char *type_string, + const char *code_string, + int protocol, + u32 tcp_start_sequence, + u32 payload_bytes, + struct ip_info ip_info, + s64 mtu, + s64 echo_id, + char **error) +{ + s32 type = -1; /* bad type; means "unknown so far" */ + s32 code = -1; /* bad code; means "unknown so far" */ + + struct packet *packet = NULL; /* the newly-allocated result packet */ + /* Calculate lengths in bytes of all sections of the packet. + * For TCP/UDP, for now we only support the most common ICMP message + * format, which includes at the end the original outgoing IP + * header and the first 8 bytes after that (which will + * typically have the port info needed to demux the message). + * For RAW, we pad the icmp packet with 0 and the total length is + * payload_bytes. 
+ */ + const int ip_fixed_bytes = ip_header_min_len(address_family); + const int ip_option_bytes = 0; + const int ip_header_bytes = ip_fixed_bytes + ip_option_bytes; + int echoed_bytes = 0; + int icmp_bytes = 0; + int ip_bytes = 0; + + if (protocol == IPPROTO_TCP || protocol == IPPROTO_UDP) { + echoed_bytes = ip_fixed_bytes + ICMP_ECHO_BYTES; + icmp_bytes = icmp_header_len(address_family) + echoed_bytes; + ip_bytes = ip_header_bytes + icmp_bytes; + } else if (protocol == IPPROTO_RAW) { + echoed_bytes = 0; + icmp_bytes = payload_bytes; + ip_bytes = ip_header_bytes + payload_bytes; + } + + /* Sanity-check on echo_id to make sure it fits in u16 */ + if (echo_id < 0 || echo_id > 65535) { + asprintf(error, + "invalid echo_id, must be between 0 and 65535"); + goto error_out; + } + + /* Sanity-check all the various lengths */ + if (ip_option_bytes & 0x3) { + asprintf(error, "IP options are not padded correctly " + "to ensure IP header is a multiple of 4 bytes: " + "%d excess bytes", ip_option_bytes & 0x3); + goto error_out; + } + assert((ip_header_bytes & 0x3) == 0); + if (icmp_bytes < icmp_header_len(address_family)) { + asprintf(error, "icmp_bytes %d smaller than icmp header " + "length %d", + icmp_bytes, icmp_header_len(address_family)); + goto error_out; + } + + + /* Parse the ICMP type and code */ + if (parse_icmp_type_and_code(address_family, type_string, code_string, + &type, &code, error)) + goto error_out; + assert(is_valid_u8(type)); + assert(is_valid_u8(code)); + + /* Allocate and zero out a packet object of the desired size */ + packet = packet_new(ip_bytes); + memset(packet->buffer, 0, ip_bytes); + + packet->direction = direction; + packet->flags = 0; + packet->tos_chk = ip_info.tos.check; + + /* Set IP header fields */ + set_packet_ip_header(packet, address_family, ip_bytes, ip_info.tos.value, + ip_info.flow_label, ip_info.ttl, + icmp_protocol(address_family)); + + /* Find the start of the ICMP header and then populate common fields. 
*/ + void *icmp_header = ip_start(packet) + ip_header_bytes; + if (set_packet_icmp_header(packet, icmp_header, address_family, + icmp_bytes, type, code, mtu, echo_id, error)) + goto error_out; + + /* All ICMP message types currently supported by this tool + * include a copy of the outbound IP header and the first few + * bytes inside. To ensure that the inbound ICMP message gets + * demuxed to the correct socket in the kernel, here we + * construct enough of a basic IP header and during test + * execution we fill in the port numbers and (if specified) + * TCP sequence number in the TCP header. + */ + if (echoed_bytes) { + u8 *echoed_ip = packet_echoed_ip_header(packet); + const int echoed_ip_bytes = (ip_fixed_bytes + + layer4_header_len(protocol) + + payload_bytes); + set_ip_header(echoed_ip, address_family, echoed_ip_bytes, + 0, 0, 0, protocol); + if (protocol == IPPROTO_TCP) { + u32 *seq = packet_echoed_tcp_seq(packet); + *seq = htonl(tcp_start_sequence); + } + packet->echoed_header = true; + } else + packet->echoed_header = false; + + packet->ip_bytes = ip_bytes; + return packet; + +error_out: + if (packet != NULL) + packet_free(packet); + return NULL; +} diff --git a/test/packetdrill/icmp_packet.h b/test/packetdrill/icmp_packet.h new file mode 100644 index 0000000..020e5bc --- /dev/null +++ b/test/packetdrill/icmp_packet.h @@ -0,0 +1,55 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Interface for module for formatting ICMP packets. + */ + +#ifndef __ICMP_PACKET_H__ +#define __ICMP_PACKET_H__ + +#include "types.h" + +#include "packet.h" + +/* Create and initialize a new struct packet containing an ICMP + * packet. The 'type_string' identifies the ICMP type. The + * 'code_string' identifies the ICMP code (and NULL means no code was + * provided, in which case we assume a default code of 0). + * The 'protocol' is IPPROTO_UDP, IPPROTO_TCP, or IPPROTO_RAW. + * The 'tcp_start_sequence' and 'payload_bytes' describe the TCP or UDP + * packet echoed inside the ICMP message. The 'mtu' specifies the MTU + * advertised in "packet is too big" ICMP message, or -1 for no + * MTU. On success, returns a newly-allocated packet. On failure, + * returns NULL and fills in *error with an error message. + */ +extern struct packet *new_icmp_packet(int address_family, + enum direction_t direction, + const char *type_string, + const char *code_string, + int protocol, + u32 tcp_start_sequence, + u32 payload_bytes, + struct ip_info ip_info, + s64 mtu, + s64 echo_id, + char **error); + +#endif /* __ICMP_PACKET_H__ */ diff --git a/test/packetdrill/icmpv6.h b/test/packetdrill/icmpv6.h new file mode 100644 index 0000000..047f90d --- /dev/null +++ b/test/packetdrill/icmpv6.h @@ -0,0 +1,81 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version.
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Our own ICMPv6 header declarations, so we have something that's + * portable and somewhat more readable than a typical system header + * file. + */ + +#ifndef __ICMPV6_HEADERS_H__ +#define __ICMPV6_HEADERS_H__ + +#include "types.h" + +/* ICMPv6 header. See RFC 4443. */ +struct icmpv6 { + __u8 type; + __u8 code; + __sum16 checksum; + union { + struct { + __be32 unused; + } unreachable; + struct { + __be32 mtu; + } packet_too_big; + struct { + __be32 unused; + } time_exceeded; + struct { + __be32 pointer; + } parameter_problem; + struct icmpv6_echo { + __be16 identifier; + __be16 sequence; + } u_echo; + } message; +}; + +/* Supported ICMPv6 types */ +#define ICMPV6_DEST_UNREACH 1 +#define ICMPV6_PKT_TOOBIG 2 +#define ICMPV6_TIME_EXCEED 3 +#define ICMPV6_PARAMPROB 4 +#define ICMPV6_ECHO_REQUEST 128 +#define ICMPV6_ECHO_REPLY 129 + +/* Codes for ICMPV6 Destination Unreachable */ +#define ICMPV6_NOROUTE 0 +#define ICMPV6_ADM_PROHIBITED 1 +#define ICMPV6_NOT_NEIGHBOUR 2 +#define ICMPV6_ADDR_UNREACH 3 +#define ICMPV6_PORT_UNREACH 4 + +/* Codes for ICMPV6 Time Exceeded */ +#define ICMPV6_EXC_HOPLIMIT 0 +#define ICMPV6_EXC_FRAGTIME 1 + +/* Codes for ICMPV6 Parameter Problem */ +#define ICMPV6_HDR_FIELD 0 +#define ICMPV6_UNK_NEXTHDR 1 +#define ICMPV6_UNK_OPTION 2 + +#endif /* __ICMPV6_HEADERS_H__ */ diff --git a/test/packetdrill/ip.h b/test/packetdrill/ip.h new file mode 100644 index 0000000..0ffcf12 --- /dev/null +++ b/test/packetdrill/ip.h
@@ -0,0 +1,108 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Our own IPv4 header declarations, so we have something that's + * portable and somewhat more readable than a typical system header + * file. + */ + +#ifndef __IP_HEADERS_H__ +#define __IP_HEADERS_H__ + +#include "types.h" + +struct ipv4 { +#if __BYTE_ORDER == __LITTLE_ENDIAN + __u8 ihl:4, + version:4; +#elif __BYTE_ORDER == __BIG_ENDIAN + __u8 version:4, + ihl:4; +#else +# error "Please fix endianness defines" +#endif + __u8 tos; + __be16 tot_len; + __be16 id; + __be16 frag_off; + __u8 ttl; + __u8 protocol; + __sum16 check; + struct in_addr src_ip; + struct in_addr dst_ip; +}; + +/* ----------------------- IP socket option values -------------------- */ + +/* Oddly enough, Linux distributions are typically missing even some + * of the older and more common IP socket options, such as IP_MTU. 
+ */ +#ifdef linux +#define IP_TOS 1 +#define IP_TTL 2 +#define IP_HDRINCL 3 +#define IP_OPTIONS 4 +#define IP_ROUTER_ALERT 5 +#define IP_RECVOPTS 6 +#define IP_RETOPTS 7 +#define IP_PKTINFO 8 +#define IP_PKTOPTIONS 9 +#define IP_MTU_DISCOVER 10 +#define IP_RECVERR 11 +#define IP_RECVTTL 12 +#define IP_RECVTOS 13 +#define IP_MTU 14 +#define IP_FREEBIND 15 +#define IP_IPSEC_POLICY 16 +#define IP_XFRM_POLICY 17 +#define IP_PASSSEC 18 +#define IP_TRANSPARENT 19 +#endif /* linux */ + +/* ECN: RFC 3168: http://tools.ietf.org/html/rfc3168 */ +#define IP_ECN_MASK 3 +#define IP_ECN_NONE 0 +#define IP_ECN_ECT1 1 +#define IP_ECN_ECT0 2 +#define IP_ECN_CE 3 + +static inline u8 ipv4_tos_byte(const struct ipv4 *ipv4) +{ + return ipv4->tos; +} + +static inline u8 ipv4_ttl_byte(const struct ipv4 *ipv4) +{ + return ipv4->ttl; +} + +static inline int ipv4_header_len(const struct ipv4 *ipv4) +{ + return ipv4->ihl * sizeof(u32); +} + +/* IP fragmentation bit flags */ +#define IP_RF 0x8000 /* reserved fragment flag */ +#define IP_DF 0x4000 /* don't fragment flag */ +#define IP_MF 0x2000 /* more fragments flag */ +#define IP_OFFMASK 0x1FFF /* mask for fragmenting bits */ + +#endif /* __IP_HEADERS_H__ */ diff --git a/test/packetdrill/ip_address.c b/test/packetdrill/ip_address.c new file mode 100644 index 0000000..9518f17 --- /dev/null +++ b/test/packetdrill/ip_address.c @@ -0,0 +1,379 @@ +/* + * Copyright 2013-2015 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Implementation for operations for IPv4 and IPv6 addresses. + */ + +#include "ip_address.h" + +#include +#include +#include +#include +#include +#include + +#include "logging.h" + +/* IPv6 prefix for IPv4-mapped addresses. These are in the + * ::FFFF:0:0/96 space, i.e. 10 bytes of 0x00 and 2 bytes of 0xFF. See + * RFC 4291 ("IPv6 Addressing Architecture") section 2.5.5.2 + * ("IPv4-Mapped IPv6 Address"). + */ +const u8 ipv4_mapped_prefix[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF }; + +int ip_address_length(int address_family) +{ + switch (address_family) { + case AF_INET: + return sizeof(struct in_addr); + case AF_INET6: + return sizeof(struct in6_addr); + default: + die("ip_address_length: bad address family: %d\n", + address_family); + break; + } + return 0; /* not reached */ +} + +int sockaddr_length(int address_family) +{ + switch (address_family) { + case AF_INET: + return sizeof(struct sockaddr_in); + case AF_INET6: + return sizeof(struct sockaddr_in6); + default: + die("sockaddr_length: bad address family: %d\n", + address_family); + break; + } + return 0; /* not reached */ +} + +static void ipv4_init(struct ip_address *ipv4) +{ + memset(ipv4, 0, sizeof(*ipv4)); + ipv4->address_family = AF_INET; +} + +static void ipv6_init(struct ip_address *ipv6) +{ + memset(ipv6, 0, sizeof(*ipv6)); + ipv6->address_family = AF_INET6; +} + +void ip_from_ipv4(const struct in_addr *ipv4, struct ip_address *ip) +{ + ipv4_init(ip); + ip->ip.v4 = *ipv4; +} + +void ip_from_ipv6(const struct in6_addr *ipv6, struct ip_address *ip) +{ + ipv6_init(ip); + ip->ip.v6 = *ipv6; +} + +void ip_to_ipv4(const struct ip_address *ip, struct in_addr *ipv4) +{ + *ipv4 = ip->ip.v4; +} + +void ip_to_ipv6(const 
struct ip_address *ip, struct in6_addr *ipv6) +{ + *ipv6 = ip->ip.v6; +} + +struct ip_address ipv4_parse(const char *ip_string) +{ + struct ip_address ipv4; + ipv4_init(&ipv4); + + if (inet_pton(AF_INET, ip_string, &ipv4.ip.v4) != 1) + die("bad IPv4 address: %s\n", ip_string); + + return ipv4; +} + +struct ip_address ipv6_parse(const char *ip_string) +{ + struct ip_address ipv6; + ipv6_init(&ipv6); + + if (inet_pton(AF_INET6, ip_string, &ipv6.ip.v6) != 1) + die("bad IPv6 address: %s\n", ip_string); + + return ipv6; +} + +const char *ip_to_string(const struct ip_address *ip, char *buffer) +{ + if (!inet_ntop(ip->address_family, &ip->ip, buffer, ADDR_STR_LEN)) + die_perror("inet_ntop"); + + return buffer; +} + +struct ip_address ipv6_map_from_ipv4(const struct ip_address ipv4) +{ + struct ip_address ipv6; + ipv6_init(&ipv6); + + assert(sizeof(ipv4.ip.v4) + sizeof(ipv4_mapped_prefix) == + sizeof(ipv6.ip.v6)); + memcpy(ipv6.ip.v6.s6_addr, ipv4_mapped_prefix, + sizeof(ipv4_mapped_prefix)); + memcpy(ipv6.ip.v6.s6_addr + sizeof(ipv4_mapped_prefix), + &ipv4.ip.v4, sizeof(ipv4.ip.v4)); + return ipv6; +} + +int ipv6_map_to_ipv4(const struct ip_address ipv6, struct ip_address *ipv4) +{ + if (memcmp(&ipv6.ip.v6.s6_addr, + ipv4_mapped_prefix, sizeof(ipv4_mapped_prefix)) == 0) { + ipv4_init(ipv4); + memcpy(&ipv4->ip.v4, + ipv6.ip.v6.s6_addr + sizeof(ipv4_mapped_prefix), + sizeof(ipv4->ip.v4)); + return STATUS_OK; + } else { + return STATUS_ERR; + } +} + +/* Fill in a sockaddr struct and socklen_t using the given IPv4 + * address and port. 
+ */ +static void ipv4_to_sockaddr(const struct ip_address *ipv4, u16 port, + struct sockaddr *address, socklen_t *length) +{ + struct sockaddr_in sa_v4; + memset(&sa_v4, 0, sizeof(sa_v4)); +#ifndef linux + sa_v4.sin_len = sizeof(sa_v4); +#endif + sa_v4.sin_family = AF_INET; + sa_v4.sin_port = htons(port); + memcpy(&sa_v4.sin_addr, &ipv4->ip.v4, sizeof(sa_v4.sin_addr)); + *length = sizeof(sa_v4); + memcpy(address, &sa_v4, *length); +} + +/* Fill in a sockaddr struct and socklen_t using the given IPv6 + * address and port. + */ +static void ipv6_to_sockaddr(const struct ip_address *ipv6, u16 port, + struct sockaddr *address, socklen_t *length) +{ + struct sockaddr_in6 sa_v6; + memset(&sa_v6, 0, sizeof(sa_v6)); +#ifndef linux + sa_v6.sin6_len = sizeof(sa_v6); +#endif + sa_v6.sin6_family = AF_INET6; + sa_v6.sin6_port = htons(port); + memcpy(&sa_v6.sin6_addr, &ipv6->ip.v6, sizeof(sa_v6.sin6_addr)); + *length = sizeof(sa_v6); + memcpy(address, &sa_v6, *length); +} + +void ip_to_sockaddr(const struct ip_address *ip, u16 port, + struct sockaddr *address, socklen_t *length) +{ + switch (ip->address_family) { + case AF_INET: + ipv4_to_sockaddr(ip, port, address, length); + break; + case AF_INET6: + ipv6_to_sockaddr(ip, port, address, length); + break; + default: + die("ip_to_sockaddr: bad address family: %d\n", + ip->address_family); + break; + } +} + +/* Extract and return the IPv4 address and port from the given sockaddr. */ +static void ipv4_from_sockaddr(const struct sockaddr *address, socklen_t length, + struct ip_address *ipv4, u16 *port) +{ + assert(address->sa_family == AF_INET); + ipv4_init(ipv4); + + struct sockaddr_in sa_v4; + assert(length == sizeof(sa_v4)); + memcpy(&sa_v4, address, length); /* to avoid aliasing issues */ + ipv4->ip.v4 = sa_v4.sin_addr; + *port = ntohs(sa_v4.sin_port); +} + +/* Extract and return the IPv6 address and port from the given sockaddr. 
*/ +static void ipv6_from_sockaddr(const struct sockaddr *address, socklen_t length, + struct ip_address *ipv4, u16 *port) +{ + assert(address->sa_family == AF_INET6); + ipv6_init(ipv4); + + struct sockaddr_in6 sa_v6; + assert(length == sizeof(sa_v6)); + memcpy(&sa_v6, address, length); /* to avoid aliasing issues */ + ipv4->ip.v6 = sa_v6.sin6_addr; + *port = ntohs(sa_v6.sin6_port); +} + +void ip_from_sockaddr(const struct sockaddr *address, socklen_t length, + struct ip_address *ip, u16 *port) +{ + switch (address->sa_family) { + case AF_INET: + ipv4_from_sockaddr(address, length, ip, port); + break; + case AF_INET6: + ipv6_from_sockaddr(address, length, ip, port); + break; + default: + die("ip_from_sockaddr: bad address family: %d\n", + address->sa_family); + break; + } +} + +int get_ip_device(const struct ip_address *ip, char *dev_name) +{ + struct ifaddrs *ifaddr_list, *ifaddr; + bool is_local = false; + + if (getifaddrs(&ifaddr_list)) + die_perror("getifaddrs"); + + for (ifaddr = ifaddr_list; ifaddr != NULL; ifaddr = ifaddr->ifa_next) { + int family; + struct ip_address interface_ip; + u16 port; + + if (ifaddr->ifa_addr == NULL) + continue; + + family = ifaddr->ifa_addr->sa_family; + if (family != ip->address_family) + continue; + + ip_from_sockaddr(ifaddr->ifa_addr, sockaddr_length(family), + &interface_ip, &port); + if (is_equal_ip(ip, &interface_ip)) { + assert(ifaddr->ifa_name); + assert(strlen(ifaddr->ifa_name) < IFNAMSIZ); + strcpy(dev_name, ifaddr->ifa_name); + is_local = true; + break; + } + } + + freeifaddrs(ifaddr_list); + + return is_local; +} + +int is_ip_local(const struct ip_address *ip) +{ + char dev_name[IFNAMSIZ]; + + return get_ip_device(ip, dev_name); +} + +int netmask_to_prefix(const char *netmask) +{ + int pos; + struct ip_address mask = ipv4_parse(netmask); + u32 mask_addr = ntohl(mask.ip.v4.s_addr); + int prefix_len = 0; + + for (pos = 31; pos >= 0; --pos) { + if (!(mask_addr & (1< 128 - 32) { + mask = (1U << (128 - prefixlen)) - 1; + 
rnd &= mask; + } + if (!rnd) + rnd++; + if (rnd == mask) + rnd--; + addr.ip.v6.s6_addr32[3] |= htonl(rnd); + ip_to_string(&addr, result); +} diff --git a/test/packetdrill/ip_address.h b/test/packetdrill/ip_address.h new file mode 100644 index 0000000..6ee586b --- /dev/null +++ b/test/packetdrill/ip_address.h @@ -0,0 +1,131 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Types and operations for IPv4 and IPv6 addresses. + */ + +#ifndef __IP_ADDRESS_H__ +#define __IP_ADDRESS_H__ + +#include "types.h" + +#include + +/* IPv4 or IPv6 address. */ +struct ip_address { + int address_family; /* AF_INET or AF_INET6 */ + union { + struct in_addr v4; + struct in6_addr v6; + u8 bytes[16]; + } ip; /* IP address (network order) */ +}; + +static inline void ip_reset(struct ip_address *ip) +{ + memset(ip, 0, sizeof(*ip)); +} + +/* Fill in an ip_address using the given family-specific struct. */ +extern void ip_from_ipv4(const struct in_addr *ipv4, struct ip_address *ip); +extern void ip_from_ipv6(const struct in6_addr *ipv6, struct ip_address *ip); + +/* Fill in the given family-specific struct using the given ip_address. 
*/ +extern void ip_to_ipv4(const struct ip_address *ip, struct in_addr *ipv4); +extern void ip_to_ipv6(const struct ip_address *ip, struct in6_addr *ipv6); + +/* Return the number of bytes in the on-the-wire representation of + * addresses of the given family. + */ +extern int ip_address_length(int address_family); + +/* Return the number of bytes in sockaddr of the given family. */ +extern int sockaddr_length(int address_family); + +/* Return true iff the two addresses are the same. */ +static inline bool is_equal_ip(const struct ip_address *a, + const struct ip_address *b) +{ + return ((a->address_family == b->address_family) && + !memcmp(&a->ip, &b->ip, ip_address_length(a->address_family))); +} + +/* Parse a human-readable IPv4 address and return it. Print an error + * to stderr and exit if there is an error parsing the address. + */ +extern struct ip_address ipv4_parse(const char *ip_string); + +/* Parse a human-readable IPv6 address and return it. Print an error + * to stderr and exit if there is an error parsing the address. + */ +extern struct ip_address ipv6_parse(const char *ip_string); + +/* Print a human-readable representation of the given IP address in the + * given buffer, which must be at least ADDR_STR_LEN bytes long. + * Returns a pointer to the given buffer. + */ +extern const char *ip_to_string(const struct ip_address *ip, char *buffer); + +/* Create an IPv4-mapped IPv6 address. */ +extern struct ip_address ipv6_map_from_ipv4(const struct ip_address ipv4); + +/* Deconstruct an IPv4-mapped IPv6 address and fill in *ipv4 with the + * IPv4 address that was mapped into IPv6 space. Return STATUS_OK on + * success, or STATUS_ERR on failure (meaning the input ipv6 was not + * actually an IPv4-mapped IPv6 address). + */ +extern int ipv6_map_to_ipv4(const struct ip_address ipv6, + struct ip_address *ipv4); + +/* Fill in a sockaddr struct and socklen_t using the given IP and port. + * The IP address may be IPv4 or IPv6. 
+ */ +extern void ip_to_sockaddr(const struct ip_address *ip, u16 port, + struct sockaddr *address, socklen_t *length); + +/* Fill in an IP address and port by parsing a sockaddr struct and + * socklen_t using the given IP and port. The IP address may be IPv4 + * or IPv6. Exits with an error message if the address family is other + * than AF_INET or AF_INET6. + */ +extern void ip_from_sockaddr(const struct sockaddr *address, socklen_t length, + struct ip_address *ip, u16 *port); + +/* Return true iff the address is that of a local interface. */ +/* Note: this should return bool, but that doesn't compile on NetBSD. */ +extern int is_ip_local(const struct ip_address *ip); + +/* Fill in the name of the device configured with the given IP, if + * any. The dev_name buffer should be at least IFNAMSIZ bytes. + * Return true iff the IP is found on a local device. + */ +/* Note: this should return bool, but that doesn't compile on NetBSD. */ +extern int get_ip_device(const struct ip_address *ip, char *dev_name); + +/* Convert dotted decimal netmask to equivalent CIDR prefix length */ +extern int netmask_to_prefix(const char *netmask); + +void generate_random_ipv4_addr(char *result, const char *base, + const char *netmask); + +void generate_random_ipv6_addr(char *result, const char *base, int prefixlen); + +#endif /* __IP_ADDRESS_H__ */ diff --git a/test/packetdrill/ip_packet.c b/test/packetdrill/ip_packet.c new file mode 100644 index 0000000..4a68ea6 --- /dev/null +++ b/test/packetdrill/ip_packet.c @@ -0,0 +1,221 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Implementation for module for formatting IPv4 and IPv6 packets. + */ + +#include "ip_packet.h" + +#include "checksum.h" +#include "ip.h" +#include "ipv6.h" + +/* Fill in IPv4 header fields. */ +static void set_ipv4_header(struct ipv4 *ipv4, + u16 ip_bytes, u8 tos, + u8 ttl, u8 protocol) +{ + ipv4->version = 4; + ipv4->ihl = sizeof(struct ipv4) / sizeof(u32); + ipv4->tos = tos; + + ipv4->tot_len = htons(ip_bytes); + ipv4->id = 0; + ipv4->frag_off = 0; + if (ttl) + ipv4->ttl = ttl; + else + ipv4->ttl = TTL_CHECK_NONE; + ipv4->protocol = protocol; + ipv4->check = 0; + + ipv4->src_ip = in4addr_any; + ipv4->dst_ip = in4addr_any; +} + +/* Fill in IPv6 header fields. 
*/ +static void set_ipv6_header(struct ipv6 *ipv6, + u16 ip_bytes, + u8 tos, u32 flow_label, + u8 hop_limit, u8 protocol) +{ + ipv6->version = 6; + ipv6->traffic_class_hi = tos >> 4; + ipv6->traffic_class_lo = tos & 0x0f; + ipv6->flow_label_hi = (flow_label >> 16) & 0xf; + ipv6->flow_label_lo = htons(flow_label & 0xffff); + + assert(ip_bytes >= sizeof(*ipv6)); + ipv6->payload_len = htons(ip_bytes - sizeof(*ipv6)); + ipv6->next_header = protocol; + if (hop_limit) + ipv6->hop_limit = hop_limit; + else + ipv6->hop_limit = TTL_CHECK_NONE; + + ipv6->src_ip = in6addr_any; + ipv6->dst_ip = in6addr_any; +} + +void set_ip_header(void *ip_header, + int address_family, + u16 ip_bytes, + u8 tos, u32 flowlabel, + u8 ttl, u8 protocol) +{ + if (address_family == AF_INET) + set_ipv4_header(ip_header, ip_bytes, tos, ttl, protocol); + else if (address_family == AF_INET6) + set_ipv6_header(ip_header, ip_bytes, tos, flowlabel, + ttl, protocol); + else + assert(!"bad ip_version in config"); +} + +void set_packet_ip_header(struct packet *packet, + int address_family, + u16 ip_bytes, + u8 tos, u32 flowlabel, + u8 ttl, u8 protocol) +{ + struct header *ip_header = NULL; + + if (address_family == AF_INET) { + struct ipv4 *ipv4 = (struct ipv4 *) packet->buffer; + packet->ipv4 = ipv4; + assert(packet->ipv6 == NULL); + ip_header = packet_append_header(packet, HEADER_IPV4, + sizeof(*ipv4)); + ip_header->total_bytes = ip_bytes; + set_ipv4_header(ipv4, ip_bytes, tos, ttl, protocol); + } else if (address_family == AF_INET6) { + struct ipv6 *ipv6 = (struct ipv6 *) packet->buffer; + packet->ipv6 = ipv6; + assert(packet->ipv4 == NULL); + ip_header = packet_append_header(packet, HEADER_IPV6, + sizeof(*ipv6)); + ip_header->total_bytes = ip_bytes; + set_ipv6_header(ipv6, ip_bytes, tos, flowlabel, ttl, protocol); + } else { + assert(!"bad ip_version in config"); + } +} + +int ipv4_header_append(struct packet *packet, + const char *ip_src, + const char *ip_dst, + const u8 tos, + const u8 ttl, + char 
**error) +{ + struct header *header = NULL; + const int ipv4_bytes = sizeof(struct ipv4); + struct ipv4 *ipv4 = NULL; + + header = packet_append_header(packet, HEADER_IPV4, ipv4_bytes); + if (header == NULL) { + asprintf(error, "too many headers"); + return STATUS_ERR; + } + + ipv4 = header->h.ipv4; + set_ip_header(ipv4, AF_INET, 0, tos, 0, ttl, 0); + + if (inet_pton(AF_INET, ip_src, &ipv4->src_ip) != 1) { + asprintf(error, "bad IPv4 src address: '%s'\n", ip_src); + return STATUS_ERR; + } + + if (inet_pton(AF_INET, ip_dst, &ipv4->dst_ip) != 1) { + asprintf(error, "bad IPv4 dst address: '%s'\n", ip_dst); + return STATUS_ERR; + } + + return STATUS_OK; +} + +int ipv6_header_append(struct packet *packet, + const char *ip_src, + const char *ip_dst, + const u8 tos, + const u8 hop_limit, + char **error) +{ + struct header *header = NULL; + const int ipv6_bytes = sizeof(struct ipv6); + struct ipv6 *ipv6 = NULL; + + header = packet_append_header(packet, HEADER_IPV6, ipv6_bytes); + if (header == NULL) { + asprintf(error, "too many headers"); + return STATUS_ERR; + } + + ipv6 = header->h.ipv6; + set_ip_header(ipv6, AF_INET6, sizeof(struct ipv6), tos, 0, hop_limit, 0); + + if (inet_pton(AF_INET6, ip_src, &ipv6->src_ip) != 1) { + asprintf(error, "bad IPv6 src address: '%s'\n", ip_src); + return STATUS_ERR; + } + + if (inet_pton(AF_INET6, ip_dst, &ipv6->dst_ip) != 1) { + asprintf(error, "bad IPv6 dst address: '%s'\n", ip_dst); + return STATUS_ERR; + } + + return STATUS_OK; +} + +int ipv4_header_finish(struct packet *packet, + struct header *header, struct header *next_inner) +{ + struct ipv4 *ipv4 = header->h.ipv4; + int ip_bytes = sizeof(struct ipv4) + next_inner->total_bytes; + + ipv4->tot_len = htons(ip_bytes); + ipv4->protocol = header_type_info(next_inner->type)->ip_proto; + + /* Fill in IPv4 header checksum. 
*/ + ipv4->check = 0; + ipv4->check = ipv4_checksum(ipv4, ipv4->ihl * sizeof(u32)); + + header->total_bytes = ip_bytes; + + return STATUS_OK; +} + +int ipv6_header_finish(struct packet *packet, + struct header *header, struct header *next_inner) +{ + struct ipv6 *ipv6 = header->h.ipv6; + int ip_bytes = sizeof(struct ipv6) + next_inner->total_bytes; + + assert(next_inner->total_bytes <= 0xffff); + ipv6->payload_len = htons(next_inner->total_bytes); + ipv6->next_header = header_type_info(next_inner->type)->ip_proto; + + /* IPv6 has no header checksum. */ + + header->total_bytes = ip_bytes; + + return STATUS_OK; +} diff --git a/test/packetdrill/ip_packet.h b/test/packetdrill/ip_packet.h new file mode 100644 index 0000000..05f2a07 --- /dev/null +++ b/test/packetdrill/ip_packet.h @@ -0,0 +1,80 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Interface for module for formatting IP packets. + */ + +#ifndef __IP_PACKET_H__ +#define __IP_PACKET_H__ + +#include "types.h" + +#include "packet.h" + +/* Populate header fields in the IP header at the given address. 
*/ +extern void set_ip_header(void *ip_header, + int address_family, + u16 ip_bytes, + u8 tos, u32 flowlabel, + u8 ttl, u8 protocol); + +/* Set the packet's IP header pointer and then populate the IP header fields. */ +extern void set_packet_ip_header(struct packet *packet, + int address_family, + u16 ip_bytes, + u8 tos, u32 flowlabel, + u8 ttl, u8 protocol); + +/* Append an IPv4 header to the end of the given packet and fill in + * src/dst. On success, return STATUS_OK; on error return STATUS_ERR + * and fill in a malloc-allocated error message in *error. + */ +extern int ipv4_header_append(struct packet *packet, + const char *ip_src, + const char *ip_dst, + const u8 tos, + const u8 ttl, + char **error); + +/* Append an IPv6 header to the end of the given packet and fill in + * src/dst. On success, return STATUS_OK; on error return STATUS_ERR + * and fill in a malloc-allocated error message in *error. + */ +extern int ipv6_header_append(struct packet *packet, + const char *ip_src, + const char *ip_dst, + const u8 tos, + const u8 hop_limit, + char **error); + +/* Finalize the IPv4 header by filling in all necessary fields that + * were not filled in at parse time. + */ +extern int ipv4_header_finish(struct packet *packet, + struct header *header, struct header *next_inner); + +/* Finalize the IPv6 header by filling in all necessary fields that + * were not filled in at parse time. + */ +extern int ipv6_header_finish(struct packet *packet, + struct header *header, struct header *next_inner); + +#endif /* __IP_PACKET_H__ */ diff --git a/test/packetdrill/ip_prefix.c b/test/packetdrill/ip_prefix.c new file mode 100644 index 0000000..044b94d --- /dev/null +++ b/test/packetdrill/ip_prefix.c @@ -0,0 +1,148 @@ +/* + * Copyright 2013 Google Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Implementation for operations for IPv4 and IPv6 prefixes. + */ + +#include "ip_prefix.h" + +#include +#include +#include + +#include "logging.h" + +struct ip_prefix ip_to_prefix(const struct ip_address *ip, int prefix_len) +{ + int max_prefix_bits = 8 * ip_address_length(ip->address_family); + struct ip_prefix prefix; + + if (prefix_len < 0 || prefix_len > max_prefix_bits) + die("invalid prefix_len: %d bits", prefix_len); + + prefix.ip = *ip; + prefix.prefix_len = prefix_len; + + return prefix; +} + +void ip_prefix_normalize(struct ip_prefix *prefix) +{ + /* Find the byte and bit offset where the prefix ends. */ + int bytes = prefix->prefix_len / 8; + int bits = prefix->prefix_len % 8; + int max_prefix_bytes = ip_address_length(prefix->ip.address_family); + + /* Zero the bits beyond the prefix in the byte where it ends. */ + if (bits != 0) { + int pos = 8 - bits; + prefix->ip.ip.bytes[bytes] &= ~((1U << pos) - 1); + ++bytes; + + } + /* Zero out the rest of the bytes in the address. 
*/ + memset(prefix->ip.ip.bytes + bytes, 0, max_prefix_bytes - bytes); +} + +/* Parse and return a prefix length (in bits) like /16 or /64 from the + * end of a string, and die if the prefix is bigger than the given max + * length. Use the maximum length if there is no prefix in the string. + */ +static int prefix_len_parse(const char *prefix_string, int max_len) +{ + int prefix_len = 0; + const char *len_str = NULL; + + len_str = strstr(prefix_string, "/"); + if (len_str != NULL) { + /* Parse prefix len in string */ + char *end = NULL; + + ++len_str; /* advance beyond '/' */ + errno = 0; + prefix_len = strtol(len_str, &end, 10); + + if (errno != 0 || *end != '\0' || + (prefix_len < 0) || (prefix_len > max_len)) + die("bad prefix length in prefix '%s'\n", + prefix_string); + } else { + /* Default prefix length is all address bits */ + prefix_len = max_len; + } + + return prefix_len; +} + +/* Copy the address part of a "
/" string. */ +static char *copy_prefix_address(const char *prefix_string) +{ + const char *slash = strstr(prefix_string, "/"); + int len = 0; + if (slash != NULL) + len = slash - prefix_string; + else + len = strlen(prefix_string); + return strndup(prefix_string, len); +} + +struct ip_prefix ipv4_prefix_parse(const char *prefix_string) +{ + char *ip_str = copy_prefix_address(prefix_string); + struct ip_address ip = ipv4_parse(ip_str); + int prefix_len = prefix_len_parse(prefix_string, + 8 * ip_address_length(AF_INET)); + + free(ip_str); + + return ip_to_prefix(&ip, prefix_len); +} + +struct ip_prefix ipv6_prefix_parse(const char *prefix_string) +{ + char *ip_str = copy_prefix_address(prefix_string); + struct ip_address ip = ipv6_parse(ip_str); + int prefix_len = prefix_len_parse(prefix_string, + 8 * ip_address_length(AF_INET6)); + + free(ip_str); + + return ip_to_prefix(&ip, prefix_len); +} + +const char *ip_prefix_to_string(struct ip_prefix *prefix, char *buffer) +{ + char ip_str[ADDR_STR_LEN]; + int bytes = 0; + + memset(ip_str, 0, sizeof(ip_str)); + ip_to_string(&prefix->ip, ip_str); + + if (strlen(ip_str) + strlen("/128") + 1 > ADDR_STR_LEN) + die("address prefix would overflow buffer!"); + + bytes = snprintf(buffer, ADDR_STR_LEN, "%s/%d", + ip_str, prefix->prefix_len); + if (bytes >= ADDR_STR_LEN) + die("address prefix overflowed buffer!"); + + return buffer; +} diff --git a/test/packetdrill/ip_prefix.h b/test/packetdrill/ip_prefix.h new file mode 100644 index 0000000..0b82260 --- /dev/null +++ b/test/packetdrill/ip_prefix.h @@ -0,0 +1,69 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Types and operations for IPv4 and IPv6 address prefixes. + */ + +#ifndef __IP_PREFIX_H__ +#define __IP_PREFIX_H__ + +#include "types.h" + +#include "ip_address.h" + +/* IPv4 or IPv6 address prefix. */ +struct ip_prefix { + struct ip_address ip; + int prefix_len; /* prefix length in bits */ +}; + +static inline void ip_prefix_reset(struct ip_prefix *prefix) +{ + memset(prefix, 0, sizeof(*prefix)); +} + +/* Parse a human-readable IPv4 prefix and return it. Print an error + * to stderr and exit if there is an error parsing the prefix. + */ +extern struct ip_prefix ipv4_prefix_parse(const char *prefix_string); + +/* Parse a human-readable IPv6 prefix and return it. Print an error + * to stderr and exit if there is an error parsing the prefix. + */ +extern struct ip_prefix ipv6_prefix_parse(const char *prefix_string); + +/* Fill in the given prefix using the first 'prefix_len' bits of the + * given IP address, zeroing out bits beyond the prefix length. + */ +extern struct ip_prefix ip_to_prefix(const struct ip_address *ip, + int prefix_len); + +/* Zero the bits beyond the prefix length. */ +void ip_prefix_normalize(struct ip_prefix *prefix); + +/* Print a human-readable representation of the given IP prefix in the + * given buffer, which must be at least ADDR_STR_LEN bytes long. + * Returns a pointer to the given buffer. 
+ */ +extern const char *ip_prefix_to_string(struct ip_prefix *prefix, + char *buffer); + +#endif /* __IP_PREFIX_H__ */ diff --git a/test/packetdrill/ipv6.h b/test/packetdrill/ipv6.h new file mode 100644 index 0000000..07a0964 --- /dev/null +++ b/test/packetdrill/ipv6.h @@ -0,0 +1,92 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Our own IPv6 header declarations, so we have something that's + * portable and somewhat more readable than a typical system header + * file. 
+ */ + +#ifndef __IPV6_HEADERS_H__ +#define __IPV6_HEADERS_H__ + +#include "types.h" + +#include + +struct ipv6 { +#if __BYTE_ORDER == __LITTLE_ENDIAN + __u8 traffic_class_hi:4, + version:4; + __u8 flow_label_hi:4, + traffic_class_lo:4; + __u16 flow_label_lo; +#elif __BYTE_ORDER == __BIG_ENDIAN + __u8 version:4, + traffic_class_hi:4; + __u8 traffic_class_lo:4, + flow_label_hi:4; + __u16 flow_label_lo; +#else +# error "Please fix endianness defines" +#endif + + __be16 payload_len; + __u8 next_header; + __u8 hop_limit; + + struct in6_addr src_ip; + struct in6_addr dst_ip; +}; + +#ifdef linux +#define IPV6_HOPLIMIT 52 +#define IPV6_TCLASS 67 +#endif /* linux */ + +static inline u8 ipv6_tos_byte(const struct ipv6 *ipv6) +{ + return (ipv6->traffic_class_hi << 4) | ipv6->traffic_class_lo; +} + +static inline u32 ipv6_flow_label(const struct ipv6 *ipv6) +{ + return (ntohs(ipv6->flow_label_lo)) | (ipv6->flow_label_hi << 16); +} + +static inline u8 ipv6_hoplimit_byte(const struct ipv6 *ipv6) +{ + return ipv6->hop_limit; +} + +/* The following struct declaration is needed for the IPv6 ioctls + * SIOCSIFADDR and SIOCDIFADDR that add and delete IPv6 addresses from + * a network interface. We have to declare our own version here + * because this struct is only available in /usr/include/linux/ipv6.h, + * but that .h file has kernel IPv6 declarations that conflict with + * standard user-space IPv6 declarations. + */ +struct in6_ifreq { + struct in6_addr ifr6_addr; + __u32 ifr6_prefixlen; + int ifr6_ifindex; +}; + +#endif /* __IPV6_HEADERS_H__ */ diff --git a/test/packetdrill/lexer.l b/test/packetdrill/lexer.l new file mode 100644 index 0000000..7d063d3 --- /dev/null +++ b/test/packetdrill/lexer.l @@ -0,0 +1,280 @@ +%{ +/* + * Copyright 2013 Google Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * This is the specification for the lexical scanner for the packetdrill + * script language. It is processed by the flex lexical scanner + * generator. + * + * For full documentation see: http://flex.sourceforge.net/manual/ + * + * Here is a quick and dirty tutorial on flex: + * + * A flex lexical scanner specification is basically a list of rules, + * where each rule is a regular expressions for a lexical token to + * match, followed by a C fragment to execute when the scanner sees + * that pattern. + * + * The lexer feeds a stream of terminal symbols up to this parser, + * passing up a FOO token for each "return FOO" in the lexer spec. The + * lexer specifies what value to pass up to the parser by setting a + * yylval.fooval field, where fooval is a field in the %union in the + * .y file. + * + * TODO: detect overflow in numeric literals. + */ + +#include "types.h" + +#include +#include +#include +#include "script.h" +#include "tcp_options.h" +#include "parse.h" +#include "config.h" + +/* This include of the bison-generated .h file must go last so that we + * can first include all of the declarations on which it depends. 
+ */ +#include "parser.h" + +/* Suppress flex's generation of an uncalled static input() function, which + * leads to a compiler warning: + * warning: ‘input’ defined but not used + */ +#define YY_NO_INPUT + +/* Copy the string name "foo" after the "--" of a "--foo" option. */ +static char *option(const char *s) +{ + const int dash_dash_len = 2; + return strndup(s + dash_dash_len, strlen(s) - dash_dash_len); +} + +/* Copy the string inside a quoted string. */ +static char *quoted(const char *s) +{ + const int delim_len = 1; + return strndup(s + delim_len, strlen(s) - 2*delim_len); +} + +/* Check to see if the word in yytext is a user-defined symbol, and if so then + * return its value. Otherwise return the word itself. + */ +int word(void) +{ + char *word = yytext; + char *value = NULL; + + /* Look in symbol table for matching user-defined symbol->value map. */ + value = definition_get(in_config->defines, word); + if (value) { + if (value[0] == '"') { + yylval.string = quoted(value); /* SYM="val" */ + return STRING; + } else if (value[0] == '`') { + yylval.string = quoted(value); /* SYM=`val` */ + return BACK_QUOTED; + } else { + yylval.string = strdup(value); /* SYM=val */ + return WORD; + } + } + /* A literal word (e.g. system call name or socket option name). */ + yylval.string = strdup(word); + return WORD; +} + +/* Copy the code inside a code snippet that is enclosed in %{ }% after + * first stripping the space and tab characters from either end of the + * snippet. We strip leading and trailing whitespace for Python users + * to remain sane, since Python is sensitive to whitespace. 
To summarize, + * given an input %{<space><code><space>}% we return: <code> + */ +static char *code(const char *s) +{ + const int delim_len = sizeof("%{")-1; + + const char *start = s + delim_len; + while ((*start == ' ') || (*start == '\t')) + ++start; + + const char *end = s + (strlen(s) - 1) - delim_len; + while ((*end == ' ') || (*end == '\t')) + --end; + + const int code_len = end - start + 1; + return strndup(start, code_len); +} + +/* Convert the hex string s, prefixed by "0x", to a 64-bit integer value. */ +static s64 hextol(const char *s) +{ + return strtoll(s + 2, NULL, 16); +} + +%} + +%{ +#define YY_USER_ACTION yylloc.first_line = yylloc.last_line = yylineno; +%} +%option yylineno +%option nounput + +/* A regexp for C++ comments: */ +cpp_comment \/\/[^\n]*\n + +/* Here is a summary of the regexp for C comments: + * open-comment + * any number of: + * (non-stars) or (star then non-slash) + * close comment + */ +c_comment \/\*(([^*])|(\*[^\/]))*\*\/ + +/* The regexp for code snippets is analogous to that for C comments. + * Here is a summary of the regexp for code snippets: + * %{ + * any number of: + * (non-}) or (} then non-%) + * }% + */ +code \%\{(([^}])|(\}[^\%]))*\}\% + +/* IPv4: a regular expression for an IPv4 address */ +ipv4_addr [0-9]+[.][0-9]+[.][0-9]+[.][0-9]+ + +/* IPv6: a regular expression for an IPv6 address. The complexity is + * unfortunate, but we can't use a super-simple approach because TCP + * sequence number ranges like 1:1001 can look like IPv6 addresses if + * we use a naive approach.
+ */ +seg [0-9a-fA-F]{1,4} +v0 [:][:] +v1 ({seg}[:]){7,7}{seg} +v2 ({seg}[:]){1,7}[:] +v3 ({seg}[:]){1,6}[:]{seg} +v4 ({seg}[:]){1,5}([:]{seg}){1,2} +v5 ({seg}[:]){1,4}([:]{seg}){1,3} +v6 ({seg}[:]){1,3}([:]{seg}){1,4} +v7 ({seg}[:]){1,2}([:]{seg}){1,5} +v8 {seg}[:](([:]{seg}){1,6}) +v9 [:]([:]{seg}){1,7} +/* IPv4-mapped IPv6 address: */ +v10 [:][:]ffff[:]{ipv4_addr} +/* IPv4-translated IPv6 address: */ +v11 [:][:]ffff[:](0){1,4}[:]{ipv4_addr} +/* IPv4-embedded IPv6 addresses: */ +v12 ({seg}[:]){1,4}[:]{ipv4_addr} +ipv6_addr ({v0}|{v1}|{v2}|{v3}|{v4}|{v5}|{v6}|{v7}|{v8}|{v9}|{v10}|{v11}|{v12}) + +%% +sa_family return SA_FAMILY; +sin_port return SIN_PORT; +sin_addr return SIN_ADDR; +msg_name return MSG_NAME; +msg_iov return MSG_IOV; +msg_flags return MSG_FLAGS; +msg_control return MSG_CONTROL; +cmsg_data return CMSG_DATA; +cmsg_level return CMSG_LEVEL; +cmsg_type return CMSG_TYPE; +ee_errno return EE_ERRNO; +ee_origin return EE_ORIGIN; +ee_type return EE_TYPE; +ee_code return EE_CODE; +ee_info return EE_INFO; +ee_data return EE_DATA; +scm_sec return SCM_SEC; +scm_nsec return SCM_NSEC; +fd return FD; +u32 return U32; +u64 return U64; +ptr return PTR; +events return EVENTS; +revents return REVENTS; +onoff return ONOFF; +linger return LINGER; +htons return _HTONS_; +ipv4 return IPV4; +ipv6 return IPV6; +icmp return ICMP; +udp return UDP; +GREv0 return GRE; +gre return GRE; +raw return RAW; +sum return SUM; +off return OFF; +key return KEY; +seq return SEQ; +none return NONE; +checksum return CHECKSUM; +sequence# return SEQUENCE; +present return PRESENT; +mpls return MPLS; +label return LABEL; +tc return TC; +ttl return TTL; +inet_addr return INET_ADDR; +inet6_addr return INET6_ADDR; +ack return ACK; +eol return EOL; +ecr return ECR; +mss return MSS; +mtu return MTU; +nop return NOP; +sack return SACK; +sackOK return SACKOK; +md5 return MD5; +TS return TIMESTAMP; +FO return FAST_OPEN; +FOEXP return FAST_OPEN_EXP; +tos return TOS; +flowlabel return FLOWLABEL; +flags 
return FLAGS; +Flags return FLAGS; +val return VAL; +win return WIN; +urg return URG; +wscale return WSCALE; +ect01 return ECT01; +ect0 return ECT0; +ect1 return ECT1; +noecn return NO_ECN; +ce return CE; +id return ID; +[.][.][.] return ELLIPSIS; +--[a-zA-Z0-9_]+ yylval.string = option(yytext); return OPTION; +[-]?[0-9]*[.][0-9]+ yylval.floating = atof(yytext); return FLOAT; +[-]?[0-9]+ yylval.integer = atoll(yytext); return INTEGER; +0x[0-9a-fA-F]+ yylval.integer = hextol(yytext); return HEX_INTEGER; +[a-zA-Z0-9_]+ return word(); +\"(\\.|[^"])*\" yylval.string = quoted(yytext); return STRING; +\`(\\.|[^`])*\` yylval.string = quoted(yytext); return BACK_QUOTED; +[^ \t\n] return (int) yytext[0]; +[ \t\n]+ /* ignore whitespace */; +{cpp_comment} /* ignore C++-style comment */; +{c_comment} /* ignore C-style comment */; +{code} yylval.string = code(yytext); return CODE; +{ipv4_addr} yylval.string = strdup(yytext); return IPV4_ADDR; +{ipv6_addr} yylval.string = strdup(yytext); return IPV6_ADDR; +%% diff --git a/test/packetdrill/link_layer.c b/test/packetdrill/link_layer.c new file mode 100644 index 0000000..d6e85a4 --- /dev/null +++ b/test/packetdrill/link_layer.c @@ -0,0 +1,104 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Link-layer utilities. + */ + +#include "link_layer.h" + +#include +#include + +#include "logging.h" + +#ifdef linux + +#include +#include + +#include "wrap.h" + +void get_hw_address(const char *name, struct ether_addr *hw_address, + enum ip_version_t ip_version) +{ + u8 *m = NULL; + struct ifreq ifr; + int fd; + + DEBUGP("get_hw_address for device %s\n", name); + + fd = wrap_socket(ip_version, SOCK_DGRAM); + + /* Discover the index of the interface. */ + snprintf(ifr.ifr_name, IFNAMSIZ, "%s", name); + if (ioctl(fd, SIOCGIFINDEX, &ifr) < 0) + die_perror("ioctl SIOCGIFINDEX"); + + /* Get hardware address for the interface. */ + if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0) + die_perror("ioctl SIOCGIFHWADDR"); + + m = (u8 *)&ifr.ifr_addr.sa_data; + DEBUGP("%s HWaddr: %02x:%02x:%02x:%02x:%02x:%02x\n", + name, m[0], m[1], m[2], m[3], m[4], m[5]); + memcpy(hw_address, m, sizeof(*hw_address)); + + if (close(fd)) + die_perror("close"); +} + +#else + +#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) +#include +#include +#include +#include +#include +#endif /* defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) */ + +/* Non-Linux variant: find the Ethernet (AF_LINK) address of the named + * device via getifaddrs() and copy it into *hw_address, or die. + * ip_version is accepted only so that this definition matches the + * prototype in link_layer.h; it is not used here. + */ +void get_hw_address(const char *name, struct ether_addr *hw_address, + enum ip_version_t ip_version) +{ + struct ifaddrs *ifaddrs_list, *ifaddr; + + (void)ip_version; + + DEBUGP("get_hw_address for device %s\n", name); + + if (getifaddrs(&ifaddrs_list) < 0) + die_perror("getifaddrs"); + + for (ifaddr = ifaddrs_list; ifaddr != NULL; ifaddr = ifaddr->ifa_next) { + /* getifaddrs() may return entries whose ifa_addr is NULL. */ + if (ifaddr->ifa_addr == NULL) + continue; + if (strcmp(name, ifaddr->ifa_name) == 0 && + ifaddr->ifa_addr->sa_family == AF_LINK) { + struct sockaddr_dl *sdl; + sdl = (struct sockaddr_dl *)ifaddr->ifa_addr; + if (sdl->sdl_type == IFT_ETHER) { + memcpy(hw_address, LLADDR(sdl), + sizeof(*hw_address)); + freeifaddrs(ifaddrs_list); + return; + } + } + } + + die("unable to find hw address for %s\n", name); +} + +#endif diff --git a/test/packetdrill/link_layer.h b/test/packetdrill/link_layer.h new file mode
100644 index 0000000..e5812bf --- /dev/null +++ b/test/packetdrill/link_layer.h @@ -0,0 +1,38 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Link-layer utilities. + */ + +#ifndef __LINK_LAYER_H__ +#define __LINK_LAYER_H__ + +#include "types.h" + +#include "ethernet.h" + +struct config; + +/* Get the link layer address for the device with the given name, or die. */ +void get_hw_address(const char *name, struct ether_addr *hw_address, + enum ip_version_t ip_version); + +#endif /* __LINK_LAYER_H__ */ diff --git a/test/packetdrill/logging.c b/test/packetdrill/logging.c new file mode 100644 index 0000000..730add4 --- /dev/null +++ b/test/packetdrill/logging.c @@ -0,0 +1,51 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Logging and output functions. + */ + +#include "run.h" +#include "system.h" + +#include +#include + +extern void __attribute__((noreturn)) die(char *format, ...) +{ + va_list ap; + + va_start(ap, format); + vfprintf(stderr, format, ap); + va_end(ap); + + run_cleanup_command(); + + exit(EXIT_FAILURE); +} + +void __attribute__((noreturn)) die_perror(char *message) +{ + perror(message); + + run_cleanup_command(); + + exit(EXIT_FAILURE); +} diff --git a/test/packetdrill/logging.h b/test/packetdrill/logging.h new file mode 100644 index 0000000..d961c9d --- /dev/null +++ b/test/packetdrill/logging.h @@ -0,0 +1,46 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Logging and output functions. + */ + +#ifndef __LOGGING_H__ +#define __LOGGING_H__ + +#include "types.h" + +/* Enable this to get debug logging. 
*/
+#define DEBUG_LOGGING 0
+
+/* Use a gcc variadic macro to conditionally compile debug printing.
+ *
+ * The expansion is wrapped in do { ... } while (0) so that "DEBUGP(...);"
+ * is exactly one statement. A bare "if (...) { ... }" expansion breaks
+ * when the macro is used as the unbraced body of an if/else: the trailing
+ * semicolon ends the outer if, and any following else is orphaned or
+ * binds to the wrong if.
+ */
+#define DEBUGP(...)					\
+	do {						\
+		if (DEBUG_LOGGING) {			\
+			fprintf(stdout, __VA_ARGS__);	\
+			fflush(stdout);			\
+		}					\
+	} while (0)
+
+/* Log the message to stderr and then exit with a failure status code. */
+extern void __attribute__((noreturn)) die(char *format, ...);
+
+/* Call perror() with message and then exit with a failure status code. */
+extern void __attribute__((noreturn)) die_perror(char *message);
+
+#endif /* __LOGGING_H__ */
diff --git a/test/packetdrill/mpls.h b/test/packetdrill/mpls.h
new file mode 100644
index 0000000..b536437
--- /dev/null
+++ b/test/packetdrill/mpls.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+/*
+ * Author: ncardwell@google.com (Neal Cardwell)
+ *
+ * Our own MPLS header declarations, so we have something that's
+ * portable and somewhat more readable than a typical system header
+ * file.
+ *
+ * We cannot include the kernel's MPLS .h files because this tool tries
+ * to compile and work for basically any Linux/BSD kernel version. So
+ * we declare our own version of various MPLS-related definitions here.
+ */ + +#ifndef __MPLS_HEADERS_H__ +#define __MPLS_HEADERS_H__ + +#include +#include "types.h" + +/* On-the-wire MPLS "label stack entry", per RFC 3032 and RFC 5462. */ +struct mpls { + __be32 entry; +}; + +/* Bit-shifting macros to access MPLS fields (the label straddles byte + * boundaries so there's no simple/clean way to use bit fields). + */ +#define MPLS_LABEL_MASK 0xfffff000 /* label */ +#define MPLS_LABEL_SHIFT 12 +#define MPLS_TC_MASK 0x00000e00 /* traffic class */ +#define MPLS_TC_SHIFT 9 +#define MPLS_STACK_MASK 0x00000100 /* is stack bottom? */ +#define MPLS_STACK_SHIFT 8 +#define MPLS_TTL_MASK 0x000000ff /* time to live */ +#define MPLS_TTL_SHIFT 0 + +/* Return the label from an MPLS label stack entry. */ +static inline u32 mpls_entry_label(const struct mpls *mpls) +{ + return (ntohl(mpls->entry) & MPLS_LABEL_MASK) >> MPLS_LABEL_SHIFT; +} + +/* Return the traffic class from an MPLS label stack entry. */ +static inline u8 mpls_entry_tc(const struct mpls *mpls) +{ + return (ntohl(mpls->entry) & MPLS_TC_MASK) >> MPLS_TC_SHIFT; +} + +/* Return the "is stack bottom?" bit from an MPLS label stack entry. */ +static inline bool mpls_entry_stack(const struct mpls *mpls) +{ + return (ntohl(mpls->entry) & MPLS_STACK_MASK) >> MPLS_STACK_SHIFT; +} + +/* Return the TTL from an MPLS label stack entry. */ +static inline u8 mpls_entry_ttl(const struct mpls *mpls) +{ + return (ntohl(mpls->entry) & MPLS_TTL_MASK) >> MPLS_TTL_SHIFT; +} + +/* Fill in an MPLS label stack entry with the given field values. */ +static inline void mpls_entry_set(u32 label, u8 traffic_class, + bool is_stack_bottom, u8 ttl, + struct mpls *mpls) +{ + mpls->entry = htonl((label << MPLS_LABEL_SHIFT) | + (traffic_class << MPLS_TC_SHIFT) | + (is_stack_bottom << MPLS_STACK_SHIFT) | + (ttl << MPLS_TTL_SHIFT)); +} + +/* Parse-time representation of an MPLS label stack entry. 
*/ +#define MPLS_STACK_MAX_ENTRIES 6 /* maximum number of label entries */ +struct mpls_stack { + struct mpls entries[MPLS_STACK_MAX_ENTRIES]; + int length; /* number of MPLS label stack entries */ +}; + +/* Allocate and initialize a new MPLS label stack as empty. */ +static inline struct mpls_stack *mpls_stack_new(void) +{ + return calloc(1, sizeof(struct mpls_stack)); +} + +/* Appends the given MPLS label stack entry to the given stack. Returns + * STATUS_OK on success, or STATUS_ERR on error (if the label stack is full). + */ +static inline int mpls_stack_append(struct mpls_stack *stack, struct mpls mpls) +{ + if (stack->length >= ARRAY_SIZE(stack->entries)) + return STATUS_ERR; + stack->entries[stack->length++] = mpls; + return STATUS_OK; +} + +#endif /* __MPLS_HEADERS_H__ */ diff --git a/test/packetdrill/mpls_packet.c b/test/packetdrill/mpls_packet.c new file mode 100644 index 0000000..2448681 --- /dev/null +++ b/test/packetdrill/mpls_packet.c @@ -0,0 +1,77 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Implementation for module for formatting MPLS packets. 
+ */ + +#include "mpls_packet.h" + +#include "mpls.h" + +int new_mpls_stack_entry(s64 label, s64 traffic_class, + bool is_stack_bottom, s64 ttl, + struct mpls *mpls, char **error) +{ + if ((label < 0) || (label >= (1<<20))) { + asprintf(error, "MPLS label out of range for 20 bits"); + return STATUS_ERR; + } + + if ((traffic_class < 0) || (traffic_class >= (1<<3))) { + asprintf(error, "MPLS traffic_class out of range for 3 bits"); + return STATUS_ERR; + } + + if ((ttl < 0) || (ttl >= (1<<8))) { + asprintf(error, "MPLS ttl out of range for 8 bits"); + return STATUS_ERR; + } + + mpls_entry_set(label, traffic_class, is_stack_bottom, ttl, mpls); + return STATUS_OK; +} + +int mpls_header_append(struct packet *packet, struct mpls_stack *mpls_stack, + char **error) +{ + struct header *header; + int mpls_bytes = mpls_stack->length * sizeof(struct mpls); + + header = packet_append_header(packet, HEADER_MPLS, mpls_bytes); + if (header == NULL) { + asprintf(error, "too many headers"); + return STATUS_ERR; + } + + memcpy(header->h.mpls, mpls_stack->entries, mpls_bytes); + + return STATUS_OK; +} + +int mpls_header_finish(struct packet *packet, + struct header *header, struct header *next_inner) +{ + int mpls_bytes = header->header_bytes + next_inner->total_bytes; + + header->total_bytes = mpls_bytes; + + return STATUS_OK; +} diff --git a/test/packetdrill/mpls_packet.h b/test/packetdrill/mpls_packet.h new file mode 100644 index 0000000..16079c0 --- /dev/null +++ b/test/packetdrill/mpls_packet.h @@ -0,0 +1,57 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Interface for module for formatting MPLS packets. + */ + +#ifndef __MPLS_PACKET_H__ +#define __MPLS_PACKET_H__ + +#include "types.h" + +#include "mpls.h" +#include "packet.h" + +/* Fill in the given MPLS label stack entry with the given field + * values, validating that actual parameter value fits inside the + * width of the field on the wire. On success, return STATUS_OK; on + * error return STATUS_ERR and fill in a malloc-allocated error + * message in *error. + */ +extern int new_mpls_stack_entry(s64 label, s64 traffic_class, + bool is_stack_bottom, s64 ttl, + struct mpls *mpls, char **error); + +/* Append an MPLS header to the end of the given packet. On success, + * return STATUS_OK; on error return STATUS_ERR and fill in a + * malloc-allocated error message in *error. + */ +extern int mpls_header_append(struct packet *packet, + struct mpls_stack *mpls_stack, + char **error); + +/* Finalize the MPLS header by filling in all necessary fields that + * were not filled in at parse time. + */ +extern int mpls_header_finish(struct packet *packet, + struct header *header, struct header *next_inner); + +#endif /* __MPLS_PACKET_H__ */ diff --git a/test/packetdrill/net_utils.c b/test/packetdrill/net_utils.c new file mode 100644 index 0000000..1b59d64 --- /dev/null +++ b/test/packetdrill/net_utils.c @@ -0,0 +1,172 @@ +/* + * Copyright 2013 Google Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Implementation for various network utilities. + */ + +#include "net_utils.h" + +#include +#include +#include + +#include "logging.h" + +static void verbose_system(const char *command) +{ + int result; + + DEBUGP("running: '%s'\n", command); + result = system(command); + DEBUGP("result: %d\n", result); + if (result != 0) + DEBUGP("error executing command '%s'\n", command); +} + +/* Configure a local IPv4 address and netmask for the device */ +static void net_add_ipv4_address(const char *dev_name, + const struct ip_address *ip, + int prefix_len) +{ + char *command = NULL; + char ip_string[ADDR_STR_LEN]; + + ip_to_string(ip, ip_string); + +#ifdef linux + asprintf(&command, "ip addr add %s/%d dev %s > /dev/null 2>&1", + ip_string, prefix_len, dev_name); +#endif +#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) + asprintf(&command, "/sbin/ifconfig %s %s/%d alias", + dev_name, ip_string, prefix_len); +#endif /* defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) */ + + verbose_system(command); + free(command); +} + +/* Configure a local IPv6 address and prefix length for the device */ +static void net_add_ipv6_address(const char *dev_name, + 
const struct ip_address *ip, + int prefix_len) +{ + char *command = NULL; + char ip_string[ADDR_STR_LEN]; + + ip_to_string(ip, ip_string); + +#ifdef linux + + asprintf(&command, "ip addr add %s/%d dev %s > /dev/null 2>&1", + ip_string, prefix_len, dev_name); +#endif +#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) + + asprintf(&command, "/sbin/ifconfig %s inet6 %s/%d", + dev_name, ip_string, prefix_len); +#endif /* defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) */ + + verbose_system(command); + free(command); + + /* Wait for IPv6 duplicate address detection to converge, + * so that this address no longer shows as "tentative". + * e.g. "ip addr show" shows: + * inet6 fd3d:fa7b:d17d::36/48 scope global tentative + */ +#ifdef linux + if (!strstr(dev_name, "tun")) + sleep(2); +#endif +#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) + sleep(3); +#endif +} + +void net_add_dev_address(const char *dev_name, + const struct ip_address *ip, + int prefix_len) +{ + switch (ip->address_family) { + case AF_INET: + net_add_ipv4_address(dev_name, ip, prefix_len); + break; + case AF_INET6: + net_add_ipv6_address(dev_name, ip, prefix_len); + break; + default: + assert(!"bad family"); + break; + } +} + +void net_del_dev_address(const char *dev_name, + const struct ip_address *ip, + int prefix_len) +{ + char *command = NULL; + char ip_string[ADDR_STR_LEN]; + + ip_to_string(ip, ip_string); + +#ifdef linux + asprintf(&command, "ip addr del %s/%d dev %s > /dev/null 2>&1", + ip_string, prefix_len, dev_name); +#endif +#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) + asprintf(&command, "/sbin/ifconfig %s %s %s/%d -alias", + dev_name, + ip->address_family == AF_INET6 ? 
"inet6" : "", + ip_string, prefix_len); +#endif /* defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) */ + + verbose_system(command); + free(command); +} + +/* In general we want to avoid configuring a new IP address on an + * interface, because we do not want to pay the latency penaly + * (e.g. it takes about one second for IPv6 duplicate address + * detection). So if we find the IP configured the correct local + * network device, then we're done, and we short-circuit and return + * immediately. Otherwise remove the address from the current device + * and add it on the newly-requested device. + */ +void net_setup_dev_address(const char *dev_name, + const struct ip_address *ip, + int prefix_len) +{ + char cur_dev_name[IFNAMSIZ]; + + bool found = get_ip_device(ip, cur_dev_name); + + DEBUGP("net_setup_dev_address: found: %d\n", found); + + if (found && strcmp(cur_dev_name, dev_name) == 0) { + DEBUGP("net_setup_dev_address: found on correct device\n"); + return; + } + + if (found) + net_del_dev_address(cur_dev_name, ip, prefix_len); + net_add_dev_address(dev_name, ip, prefix_len); +} diff --git a/test/packetdrill/net_utils.h b/test/packetdrill/net_utils.h new file mode 100644 index 0000000..bdc1009 --- /dev/null +++ b/test/packetdrill/net_utils.h @@ -0,0 +1,56 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Interface for various network utilities related to configuring IP + * addresses for network devices. + */ + +#ifndef __NET_UTILS_H__ +#define __NET_UTILS_H__ + +#include "types.h" + +#include "ip_address.h" + +/* Add the given IP address, with the given subnet/prefix length, + * to the given device. + */ +extern void net_add_dev_address(const char *dev_name, + const struct ip_address *ip, + int prefix_len); + +/* Delete the given IP address, with the given subnet/prefix length, + * from the given device. + */ +extern void net_del_dev_address(const char *dev_name, + const struct ip_address *ip, + int prefix_len); + +/* See if the given IP address, with the given subnet/prefix length, + * is already on the given device. If so, return without doing + * anything. If not, delete it from any device it's currently on, and + * add it to the given network device. + */ +extern void net_setup_dev_address(const char *dev_name, + const struct ip_address *ip, + int prefix_len); + +#endif /* __NET_UTILS_H__ */ diff --git a/test/packetdrill/netdev.c b/test/packetdrill/netdev.c new file mode 100644 index 0000000..7734709 --- /dev/null +++ b/test/packetdrill/netdev.c @@ -0,0 +1,502 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Implementation for a "virtual network device" module to + * inject packets into the kernel and read packets leaving the kernel. + */ + +#include "netdev.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) +#include +#endif /* defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) */ + +#include "assert.h" +#include "ip.h" +#include "ipv6.h" +#include "logging.h" +#include "net_utils.h" +#include "packet.h" +#include "packet_parser.h" +#include "packet_socket.h" +#include "tcp.h" +#include "tun.h" +#include "wrap.h" + +/* Internal private state for the netdev for purely local tests. */ +struct local_netdev { + struct netdev netdev; /* "inherit" from netdev */ + + char *name; /* malloc-ed copy of interface name (owned) */ + int tun_fd; /* tun for sending/receiving packets */ + int control_fd; /* fd for configuration of tun interface */ + int index; /* interface index from if_nametoindex */ + struct packet_socket *psock; /* for sniffing packets (owned) */ +}; + +struct netdev_ops local_netdev_ops; + +/* "Downcast" an abstract netdev to our local flavor. */ +static inline struct local_netdev *to_local_netdev(struct netdev *netdev) +{ + return (struct local_netdev *)netdev; +} + +/* Clean up any old tun device state that might be lying around from + * previous tests. NetBSD the kernel does not automatically tear down + * unreferenced tun devices and routes referencing those routes. 
+ */ +static void cleanup_old_device(struct config *config, + struct local_netdev *netdev) +{ +#if defined(__NetBSD__) + char *cleanup_command = NULL; + int result; + + asprintf(&cleanup_command, + "/sbin/ifconfig %s down delete > /dev/null 2>&1", + TUN_DEV); + DEBUGP("running: '%s'\n", cleanup_command); + result = system(cleanup_command); + DEBUGP("result: %d\n", result); + free(cleanup_command); +#endif /* defined(__NetBSD__) */ +} + +/* Check that the remote IP is actually remote. It must be to ensure + * that test packets will pass into our tun device. + */ +static void check_remote_address(struct config *config, + struct local_netdev *netdev) +{ + if (is_ip_local(&config->live_remote_ip)) { + die("error: live_remote_ip %s is not remote\n", + config->live_remote_ip_string); + } +} + +/* Make sure config->live_local_ip is not configured on any devices. + * This is only used for anyip tests. + */ +static void check_local_anyip(struct config *config) +{ + if (is_ip_local(&config->live_local_ip)) { + die("error: live_local_ip %s is not remote for anyip\n", + config->live_local_ip_string); + } +} + +/* Create a tun device for the lifetime of this test. */ +static void create_device(struct config *config, struct local_netdev *netdev) +{ + /* Open the tun device, which "clones" it for our purposes. */ + int tun_fd; +#ifdef linux + int nb = 0; + +loop: + if (++nb > 10) + die_perror("open tun device"); +#endif + tun_fd = open(TUN_PATH, O_RDWR); + if (tun_fd < 0) + die_perror("open tun device"); + + netdev->tun_fd = tun_fd; + +#ifdef linux + /* Create the device. Since we do not specify a device name, the + * kernel will try to allocate the "next" device of the specified + * type. This device will disappear when we are done. 
+ */ + struct ifreq ifr; + memset(&ifr, 0, sizeof(ifr)); + ifr.ifr_flags = IFF_TUN | IFF_NO_PI | IFF_VNET_HDR; + int status = ioctl(netdev->tun_fd, TUNSETIFF, (void *)&ifr); + if (status < 0) + die_perror("TUNSETIFF"); + + /* Our tests rely on using tun0. + * We might change this in the future, by passing a variable filled + * with tunnel name. In the mean time, wait a bit that tun0 gets free. + */ + if (strcmp(ifr.ifr_name, "tun0")) { + close(tun_fd); + usleep(100000); + goto loop; + } + netdev->name = strdup(ifr.ifr_name); +#endif + +#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) + const int mode = IFF_BROADCAST | IFF_MULTICAST; + if (ioctl(netdev->tun_fd, TUNSIFMODE, &mode, sizeof(mode)) < 0) + die_perror("TUNSIFMODE"); + + netdev->name = strdup(TUN_DEV); +#endif /* defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) */ + +#if defined(__FreeBSD__) || defined(__NetBSD__) + /* On FreeBSD and NetBSD we need to explicitly ask to be able + * to prepend the address family when injecting tun packets. + * OpenBSD presumes we are doing this, even without the ioctl. + */ + const int header = 1; + if (ioctl(netdev->tun_fd, TUNSIFHEAD, &header, sizeof(header)) < 0) + die_perror("TUNSIFHEAD"); +#endif /* defined(__FreeBSD__) || defined(__NetBSD__) */ + + DEBUGP("tun name: '%s'\n", netdev->name); + + netdev->index = if_nametoindex(netdev->name); + if (netdev->index == 0) + die_perror("if_nametoindex"); + + DEBUGP("tun index: '%d'\n", netdev->index); + + if (config->speed != TUN_DRIVER_SPEED_CUR) { + char *command; + asprintf(&command, "ethtool -s %s speed %u autoneg off", + netdev->name, config->speed); + if (system(command) < 0) + die("Error executing %s\n", command); + free(command); + + /* Need to bring interface down and up so the interface speed + * will be copied to the link_speed field. This field is + * used by TCP's cwnd bound. 
*/
+		asprintf(&command, "ifconfig %s down; sleep 1; ifconfig %s up; "
+			 "sleep 1", netdev->name, netdev->name);
+		if (system(command) < 0)
+			die("Error executing %s\n", command);
+		free(command);
+	}
+
+	if (config->mtu != TUN_DRIVER_DEFAULT_MTU) {
+		char *command;
+		asprintf(&command, "ifconfig %s mtu %d",
+			 netdev->name, config->mtu);
+		if (system(command) < 0)
+			die("Error executing %s\n", command);
+		free(command);
+	}
+
+	/* Open a socket we can use to configure the tun interface. */
+	netdev->control_fd = wrap_socket(config->ip_version, SOCK_DGRAM);
+}
+
+/* Set the offload flags to be like a typical ethernet device */
+static void set_device_offload_flags(struct local_netdev *netdev)
+{
+#ifdef linux
+	const u32 offload =
+		TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | TUN_F_TSO_ECN;
+	if (ioctl(netdev->tun_fd, TUNSETOFFLOAD, offload) != 0)
+		die_perror("TUNSETOFFLOAD");
+#endif
+}
+
+/* Bring up the device: read the current interface flags through the
+ * control socket, then set IFF_UP | IFF_RUNNING. Dies if either ioctl
+ * fails.
+ */
+static void bring_up_device(struct local_netdev *netdev)
+{
+	struct ifreq ifr;
+	memset(&ifr, 0, sizeof(ifr));
+	/* ifr is zero-filled, so copying at most IFNAMSIZ - 1 bytes
+	 * guarantees that ifr_name stays NUL-terminated.
+	 */
+	strncpy(ifr.ifr_name, netdev->name, IFNAMSIZ - 1);
+	if (ioctl(netdev->control_fd, SIOCGIFFLAGS, &ifr) < 0)
+		die_perror("SIOCGIFFLAGS");
+	ifr.ifr_flags |= IFF_UP | IFF_RUNNING;
+	if (ioctl(netdev->control_fd, SIOCSIFFLAGS, &ifr) < 0)
+		die_perror("SIOCSIFFLAGS");
+}
+
+/* Route traffic destined for our remote IP through this device.
+ * In anyip environment, we don't use the gateway IP.
+ * Dies if the route command cannot be built, cannot be run, is
+ * terminated abnormally, or exits with a non-zero status.
+ */
+static void route_traffic_to_device(struct config *config,
+				    struct local_netdev *netdev)
+{
+	char *route_command = NULL;
+#ifdef linux
+	asprintf(&route_command,
+		 "ip -%d route del %s > /dev/null 2>&1 ; "
+		 "ip -%d route add %s dev %s %s%s > /dev/null 2>&1",
+		 (config->wire_protocol == AF_INET) ? 4 : 6,
+		 config->live_remote_prefix_string,
+		 (config->wire_protocol == AF_INET) ? 4 : 6,
+		 config->live_remote_prefix_string,
+		 netdev->name,
+		 config->is_anyip ? "" : "via ",
+		 config->is_anyip ? "" :
+		 config->live_gateway_ip_string);
+#endif
+#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
+	if (config->wire_protocol == AF_INET) {
+		asprintf(&route_command,
+			 "route delete %s > /dev/null 2>&1 ; "
+			 "route add %s %s > /dev/null",
+			 config->live_remote_prefix_string,
+			 config->live_remote_prefix_string,
+			 config->live_gateway_ip_string);
+	} else if (config->wire_protocol == AF_INET6) {
+		asprintf(&route_command,
+			 "route delete -inet6 %s > /dev/null 2>&1 ; "
+#if defined(__FreeBSD__)
+			 "route add -inet6 %s -interface tun0 %s > /dev/null",
+#elif defined(__OpenBSD__) || defined(__NetBSD__)
+			 "route add -inet6 %s %s > /dev/null",
+#endif
+			 config->live_remote_prefix_string,
+			 config->live_remote_prefix_string,
+			 config->live_gateway_ip_string);
+	} else {
+		assert(!"bad wire protocol");
+	}
+#endif /* defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) */
+	/* route_command is still NULL if no platform branch above built a
+	 * command. system(NULL) only reports whether a shell exists, so
+	 * it must not be mistaken for a successful route change.
+	 */
+	if (route_command == NULL)
+		die("error: no route command for this platform\n");
+
+	int result = system(route_command);
+	/* WEXITSTATUS() is only meaningful when the child exited normally. */
+	if ((result == -1) || !WIFEXITED(result) ||
+	    (WEXITSTATUS(result) != 0)) {
+		die("error executing route command '%s'\n",
+		    route_command);
+	}
+	free(route_command);
+}
+
+/* Allocate and set up a netdev for purely local tests: create and bring
+ * up the tun device, configure the local address (unless anyip), route
+ * the remote prefix through the device, and open the sniffing socket.
+ * Dies on allocation failure.
+ */
+struct netdev *local_netdev_new(struct config *config)
+{
+	struct local_netdev *netdev = calloc(1, sizeof(struct local_netdev));
+
+	if (netdev == NULL)
+		die_perror("calloc");
+
+	/* Mark both fds as not yet open: local_netdev_free() closes any
+	 * fd that is >= 0, and calloc() would otherwise leave them at 0.
+	 */
+	netdev->tun_fd = -1;
+	netdev->control_fd = -1;
+
+	netdev->netdev.ops = &local_netdev_ops;
+
+	cleanup_old_device(config, netdev);
+
+	check_remote_address(config, netdev);
+	create_device(config, netdev);
+	set_device_offload_flags(netdev);
+	bring_up_device(netdev);
+
+	if (config->is_anyip)
+		check_local_anyip(config);
+	else
+		net_setup_dev_address(netdev->name,
+				      &config->live_local_ip,
+				      config->live_prefix_len);
+
+	route_traffic_to_device(config, netdev);
+	netdev->psock = packet_socket_new(netdev->name);
+
+	return (struct netdev *)netdev;
+}
+
+static void local_netdev_free(struct netdev *a_netdev)
+{
+	struct local_netdev *netdev = to_local_netdev(a_netdev);
+
+	if (netdev->psock)
+		packet_socket_free(netdev->psock);
+	if (netdev->tun_fd >= 0)
+		close(netdev->tun_fd);
+	if
(netdev->control_fd >= 0) + close(netdev->control_fd); + if (netdev->name != NULL) + free(netdev->name); + memset(netdev, 0, sizeof(*netdev)); /* paranoia to help catch bugs */ + free(netdev); +} + +#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) +/* According to `man 4 tun` on OpenBSD: "Each packet read or written + * is prefixed with a tunnel header consisting of a 4-byte network + * byte order integer containing the address family in the case of + * layer 3 tunneling." Similarly, on FreeBSD and NetBSD one must use + * ioctl(TUNSIFHEAD) and prepend an address family, in order to be + * able to send IPv6 packets (otherwise FreeBSD and NetBSD assume the + * packets are IPv4). + */ +static void bsd_tun_write(struct local_netdev *netdev, + struct packet *packet) +{ + int address_family = htonl(packet_address_family(packet)); + struct iovec vector[2] = { + { &address_family, sizeof(address_family) }, + { packet_start(packet), packet->ip_bytes } + }; + + if (writev(netdev->tun_fd, vector, ARRAY_SIZE(vector)) < 0) + die_perror("BSD tun write()"); +} +#endif /* defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) */ + +#ifdef linux +#include + +static void linux_tun_write(struct local_netdev *netdev, + struct packet *packet) +{ + struct virtio_net_hdr gso = { 0 }; + struct iovec vector[2] = { + { &gso, sizeof(gso) }, + { packet_start(packet), packet->ip_bytes } + }; + + if (packet->tcp && packet->mss) { + if (packet->ipv4) + gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV4; + else + gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV6; + gso.gso_size = packet->mss; + } + if (writev(netdev->tun_fd, vector, ARRAY_SIZE(vector)) < 0) + die_perror("Linux tun write()"); +} +#endif /* linux */ + +static int local_netdev_send(struct netdev *a_netdev, + struct packet *packet) +{ + struct local_netdev *netdev = to_local_netdev(a_netdev); + + assert(packet->ip_bytes > 0); + /* We do IPv4 and IPv6 */ + assert(packet->ipv4 || packet->ipv6); + /* We only do TCP and 
ICMP */
+	assert(packet->tcp || packet->udp || packet->icmpv4 || packet->icmpv6);
+
+	DEBUGP("local_netdev_send\n");
+
+#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
+	bsd_tun_write(netdev, packet);
+#endif /* defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) */
+
+#ifdef linux
+	linux_tun_write(netdev, packet);
+#endif /* linux */
+
+	return STATUS_OK;
+}
+
+/* Read the given number of packets out of the tun device. We read
+ * these packets so that the kernel can exercise its normal code paths
+ * for packet transmit completion, since this code path may feed back
+ * to TCP behavior; e.g., see the Linux patch "tcp: avoid retransmits
+ * of TCP packets hanging in host queues". We don't actually need the
+ * packet contents, but on Linux we need to read at least 1
+ * byte of packet data to consume the packet.
+ * After we added IFF_VNET_HDR attribute to the linux tun device,
+ * we expect to receive a virtio_net_hdr at the beginning.
+ *
+ * A read() interrupted by a signal (EINTR) is retried rather than
+ * counted, so exactly num_packets successful reads are performed.
+ * Any other read error is fatal.
+ */
+static void local_netdev_read_queue(struct local_netdev *netdev,
+				    int num_packets)
+{
+#ifdef linux
+	char buf[sizeof(struct virtio_net_hdr) + 1];
+#else
+	char buf[1];
+#endif
+	int i = 0, in_bytes = 0;
+
+	for (i = 0; i < num_packets; ++i) {
+		/* Retry the same packet if a signal interrupted the read. */
+		do {
+			in_bytes = read(netdev->tun_fd, buf, sizeof(buf));
+		} while (in_bytes < 0 && errno == EINTR);
+
+		if (in_bytes < 0)
+			die_perror("tun read()");
+
+		assert(in_bytes <= (int)sizeof(buf));
+	}
+}
+
+static int local_netdev_receive(struct netdev *a_netdev,
+				struct packet **packet, char **error)
+{
+	struct local_netdev *netdev = to_local_netdev(a_netdev);
+	int status = STATUS_ERR;
+	int num_packets = 0;
+
+	DEBUGP("local_netdev_receive\n");
+
+	status = netdev_receive_loop(netdev->psock, PACKET_LAYER_3_IP,
+				     DIRECTION_OUTBOUND, packet, &num_packets,
+				     error);
+	local_netdev_read_queue(netdev, num_packets);
+	return status;
+}
+
+/* Keep sniffing packets on psock until one parses successfully.
+ *
+ * On entry *packet must be NULL. On STATUS_OK, *packet points to the
+ * newly allocated, parsed packet, which the caller must release with
+ * packet_free(). On STATUS_ERR (parse result PACKET_BAD), *packet is
+ * NULL. *num_packets is set to the number of packets received from the
+ * socket, including ones that were skipped because they did not parse.
+ */
+int netdev_receive_loop(struct packet_socket *psock,
+			enum packet_layer_t layer,
+			enum direction_t direction,
+			struct packet **packet,
+			int *num_packets,
+			char **error)
+{
+	assert(*packet == NULL);	/* should be no packet yet */
+
+	*num_packets = 0;
+	while (1) {
+		int in_bytes = 0;
+		enum packet_parse_result_t result;
+
+		*packet = packet_new(PACKET_READ_BYTES);
+
+		/* Sniff the next outbound packet from the kernel under test. */
+		if (packet_socket_receive(psock, direction, *packet, &in_bytes)) {
+			/* Nothing usable was received: release this buffer
+			 * before the next iteration allocates a new one, so
+			 * that it is not leaked.
+			 */
+			packet_free(*packet);
+			*packet = NULL;
+			continue;
+		}
+
+		++*num_packets;
+		result = parse_packet(*packet, in_bytes, layer, error);
+
+		if (result == PACKET_OK)
+			return STATUS_OK;
+
+		packet_free(*packet);
+		*packet = NULL;
+
+		if (result == PACKET_BAD)
+			return STATUS_ERR;
+
+		DEBUGP("parse_result:%d; error parsing packet: %s\n",
+		       result, *error);
+	}
+
+	assert(!"should not be reached");
+	return STATUS_ERR;	/* not reached */
+}
+
+struct netdev_ops local_netdev_ops = {
+	.free = local_netdev_free,
+	.send = local_netdev_send,
+	.receive = local_netdev_receive,
+};
diff --git a/test/packetdrill/netdev.h b/test/packetdrill/netdev.h
new file mode 100644
index 0000000..c69c138
--- /dev/null
+++ b/test/packetdrill/netdev.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Interface for a "virtual network device" module to inject packets + * into the kernel and sniff packets leaving the kernel. + */ + +#ifndef __PACKET_NETDEV_H__ +#define __PACKET_NETDEV_H__ + +#include "types.h" + +#include "config.h" +#include "packet.h" +#include "packet_parser.h" +#include "packet_socket.h" + +struct netdev_ops; + +/* A C-style poor-man's "pure virtual" netdev. */ +struct netdev { + struct netdev_ops *ops; /* C-style vtable pointer */ +}; + +struct netdev_ops { + /* Tear down a netdev and free up the resources it has allocated. */ + void (*free)(struct netdev *netdev); + + /* Inject a raw TCP/IP packet into the kernel. */ + int (*send)(struct netdev *netdev, + struct packet *packet); + + /* Sniff the next TCP/IP packet leaving the kernel and return a + * pointer to the newly-allocated packet. Caller must free the packet + * with packet_free(). + */ + int (*receive)(struct netdev *netdev, + struct packet **packet, char **error); +}; + + +/* Tear down a netdev and free up the resources it has allocated. */ +static inline void netdev_free(struct netdev *netdev) +{ + netdev->ops->free(netdev); +} + +/* Inject a raw TCP/IP packet into the kernel. */ +static inline int netdev_send(struct netdev *netdev, + struct packet *packet) +{ + return netdev->ops->send(netdev, packet); +} + +/* Sniff the next TCP/IP packet leaving the kernel and return a + * pointer to the newly-allocated packet. Caller must free the packet + * with packet_free(). + */ +static inline int netdev_receive(struct netdev *netdev, + struct packet **packet, + char **error) +{ + return netdev->ops->receive(netdev, packet, error); +} + + +/* Keep sniffing packets leaving the kernel until we see one we know + * about and can parse. Return a pointer to the newly-allocated + * packet. Caller must free the packet with packet_free(). 
+ */ +extern int netdev_receive_loop(struct packet_socket *psock, + enum packet_layer_t layer, + enum direction_t direction, + struct packet **packet, + int *num_packets, + char **error); + +/* Allocate and return a new netdev for purely local tests. */ +extern struct netdev *local_netdev_new(struct config *config); + +#endif /* __PACKET_NETDEV_H__ */ diff --git a/test/packetdrill/open_memstream.c b/test/packetdrill/open_memstream.c new file mode 100644 index 0000000..9114091 --- /dev/null +++ b/test/packetdrill/open_memstream.c @@ -0,0 +1,142 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * FreeBSD does not have open_memstream(), so we roll our own minimalist + * implementation here. + */ + +#include "types.h" + +#ifndef HAVE_OPEN_MEMSTREAM + +#include "assert.h" +#include "open_memstream.h" +#include +#include +#include +#include + +/* Our internal state for the memstream. 
*/ +struct mem_stream { + char **buf; /* pointer to the output buffer pointer */ + size_t *sizeloc; /* pointer to the output final buffer size */ + + size_t buf_size; /* currently allocated size of buffer */ + size_t offset; /* current write offset */ +}; + +#define INITIAL_BUF_SIZE 1024 + +/* Grow buffer, if needed, to write "write_bytes" bytes at the current + * offset. We also have to take into account the extra '\0' that we + * maintain just past the end. Returns 0 on success, or -1 on failure. + */ +static int mem_stream_grow(struct mem_stream *stream, int write_bytes) +{ + char *new_buf = NULL; + size_t new_size = 0; + size_t needed_bytes = 0; + + needed_bytes = stream->offset + write_bytes + 1; + if (needed_bytes <= stream->buf_size) + return 0; + + if (stream->buf_size == 0) + new_size = INITIAL_BUF_SIZE; + else + new_size = 2 * stream->buf_size; + + if (new_size < needed_bytes) + new_size = needed_bytes; + + new_buf = (char *) realloc(*stream->buf, new_size); + if (new_buf == NULL) + return -1; + + *stream->buf = new_buf; + stream->buf_size = new_size; + + return 0; +} + +/* Write the given data to our memstream, expanding our buffer if we + * need to. Per the specification in the Linux man pages, "A null byte + * is maintained at the end of the buffer. This byte is not included + * in the size value stored at sizeloc." + * Returns write_bytes on success, or -1 if the buffer cannot be grown. + */ +static int write_memstream(void *cookie, const char *buf, int write_bytes) +{ + struct mem_stream *stream = (struct mem_stream *) cookie; + + if (mem_stream_grow(stream, write_bytes) < 0) + return -1; + + memcpy(*stream->buf + stream->offset, buf, write_bytes); + stream->offset += write_bytes; + + *(*stream->buf + stream->offset) = '\0'; + + *stream->sizeloc = stream->offset; /* size does not include '\0' */ + + return write_bytes; +} + +/* Clean up: free our bookkeeping state only. The output buffer that + * *stream->buf points to is not freed here. + */ +static int close_memstream(void *cookie) +{ + struct mem_stream *stream = (struct mem_stream *) cookie; + + free(stream); + + return 0; +} + +/* Create a memstream.
*/ +FILE *open_memstream(char **ptr, size_t *sizeloc) +{ + FILE *f; + struct mem_stream *stream; + + if (ptr == NULL || sizeloc == NULL) { + errno = EINVAL; + return NULL; + } + + stream = (struct mem_stream *) calloc(1, sizeof(struct mem_stream)); + if (stream == NULL) + return NULL; + + f = funopen(stream, NULL, write_memstream, NULL, close_memstream); + if (f == NULL) { + free(stream); + return NULL; + } + + *ptr = NULL; + *sizeloc = 0; + + stream->buf = ptr; + stream->sizeloc = sizeloc; + + return f; +} + +#endif /* HAVE_OPEN_MEMSTREAM */ diff --git a/test/packetdrill/open_memstream.h b/test/packetdrill/open_memstream.h new file mode 100644 index 0000000..bf23705 --- /dev/null +++ b/test/packetdrill/open_memstream.h @@ -0,0 +1,37 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * FreeBSD does not have open_memstream(), so we roll our own minimalist + * implementation here. 
+ */ + +#ifndef __OPEN_MEMSTREAM_H__ +#define __OPEN_MEMSTREAM_H__ + +#ifndef HAVE_OPEN_MEMSTREAM + +#include + +FILE *open_memstream(char **ptr, size_t *sizeloc); + +#endif /*HAVE_OPEN_MEMSTREAM*/ + +#endif /* __OPEN_MEMSTREAM_H__ */ diff --git a/test/packetdrill/packet.c b/test/packetdrill/packet.c new file mode 100644 index 0000000..d2d792a --- /dev/null +++ b/test/packetdrill/packet.c @@ -0,0 +1,327 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Implementation for a representation of TCP/IP packets. + * Packets are represented in their wire format. + */ + +#include "packet.h" + +#include +#include +#include "assert.h" +#include "ethernet.h" +#include "gre_packet.h" +#include "ip_packet.h" +#include "logging.h" +#include "mpls_packet.h" + + +/* Info for all types of header we support. 
*/ +struct header_type_info header_types[HEADER_NUM_TYPES] = { + { "NONE", 0, 0, NULL }, + { "IPV4", IPPROTO_IPIP, ETHERTYPE_IP, ipv4_header_finish }, + { "IPV6", IPPROTO_IPV6, ETHERTYPE_IPV6, ipv6_header_finish }, + { "GRE", IPPROTO_GRE, 0, gre_header_finish }, + { "MPLS", 0, ETHERTYPE_MPLS_UC, mpls_header_finish }, + { "TCP", IPPROTO_TCP, 0, NULL }, + { "UDP", IPPROTO_UDP, 0, NULL }, + { "ICMPV4", IPPROTO_ICMP, 0, NULL }, + { "ICMPV6", IPPROTO_ICMPV6, 0, NULL }, +}; + +/* Allocate a zeroed packet with a data buffer of buffer_bytes bytes. + * Allocation failure is reported through die_perror() rather than + * being dereferenced as a NULL pointer. + */ +struct packet *packet_new(u32 buffer_bytes) +{ + struct packet *packet = calloc(1, sizeof(struct packet)); + + if (packet == NULL) + die_perror("calloc"); + packet->buffer = malloc(buffer_bytes); + /* malloc(0) may legitimately return NULL, so only a non-empty + * request counts as a failure. + */ + if (packet->buffer == NULL && buffer_bytes > 0) + die_perror("malloc"); + packet->buffer_bytes = buffer_bytes; + return packet; +} + +/* Free the packet's data buffer and then the packet itself. */ +void packet_free(struct packet *packet) +{ + free(packet->buffer); + memset(packet, 0, sizeof(*packet)); /* paranoia to help catch bugs */ + free(packet); +} + +/* Allocate an empty list node (no packet, no successor). Allocation + * failure is reported through die_perror(). + */ +struct packet_list *packet_list_new(void) +{ + struct packet_list *list = calloc(1, sizeof(struct packet_list)); + + if (list == NULL) + die_perror("calloc"); + list->packet = NULL; + list->next = NULL; + return list; +} + +/* Free every node of the list, along with any packet a node holds. */ +void packet_list_free(struct packet_list *list) +{ + while (list != NULL) { + struct packet_list *dead_list = list; + if (list->packet) + packet_free(list->packet); + list = list->next; + free(dead_list); + } +} + +/* Count the leading headers[] entries before the first HEADER_NONE. */ +int packet_header_count(const struct packet *packet) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(packet->headers); ++i) { + if (packet->headers[i].type == HEADER_NONE) + break; + } + return i; +} + +/* Copy any header info from old_packet to new_packet.
*/ +static void packet_copy_headers(struct packet *new_packet, + struct packet *old_packet, + int bytes_headroom) +{ + int i; + u8 *base = new_packet->buffer + bytes_headroom; + + for (i = 0; i < ARRAY_SIZE(old_packet->headers); ++i) { + struct header *old_header = &old_packet->headers[i]; + struct header *new_header = &new_packet->headers[i]; + int offset = 0; + + if (old_header->type == HEADER_NONE) + break; + /* Keep each header at the same offset from the new base. */ + offset = old_header->h.ptr - old_packet->buffer; + new_header->h.ptr = base + offset; + new_header->header_bytes = old_header->header_bytes; + new_header->total_bytes = old_header->total_bytes; + new_header->type = old_header->type; + } +} + +/* Append metadata for a new header of header_bytes bytes after the data + * already in the packet. Returns the new header slot, or NULL if all + * PACKET_MAX_HEADERS slots are used or the header would not fit in the + * packet buffer. + */ +struct header *packet_append_header(struct packet *packet, + enum header_t header_type, + int header_bytes) +{ + struct header *header = NULL; + int num_headers = packet_header_count(packet); + int packet_bytes; + + assert(num_headers <= PACKET_MAX_HEADERS); + if (num_headers == PACKET_MAX_HEADERS) + return NULL; + + header = &packet->headers[num_headers]; + + /* The header is placed after the layer-2 header and the IP bytes + * so far, so all three lengths must fit inside the buffer. + */ + if (packet->l2_header_bytes + packet->ip_bytes + header_bytes > + packet->buffer_bytes) + return NULL; + packet_bytes = packet->l2_header_bytes + packet->ip_bytes; + header->h.ptr = packet->buffer + packet_bytes; + packet->ip_bytes += header_bytes; + + header->type = header_type; + header->header_bytes = header_bytes; + header->total_bytes = 0; + return header; +} + +/* Map a pointer to a packet offset from an old base to a new base. */ +static void *offset_ptr(u8 *old_base, u8* new_base, void *old_ptr) +{ + u8 *old = (u8*)old_ptr; + + return (old == NULL) ?
NULL : (new_base + (old - old_base)); +} + +static void packet_duplicate_info(struct packet *packet, + struct packet *old_packet, + int bytes_headroom, + int extra_payload) +{ + u8 *old_base = old_packet->buffer; + u8 *new_base = packet->buffer + bytes_headroom; + + packet->ip_bytes = old_packet->ip_bytes + extra_payload; + packet->direction = old_packet->direction; + packet->time_usecs = old_packet->time_usecs; + packet->flags = old_packet->flags; + packet->tos_chk = old_packet->tos_chk; + + packet_copy_headers(packet, old_packet, bytes_headroom); + + /* Set up layer 3 header pointer. */ + packet->ipv4 = offset_ptr(old_base, new_base, old_packet->ipv4); + packet->ipv6 = offset_ptr(old_base, new_base, old_packet->ipv6); + packet->tcp = offset_ptr(old_base, new_base, old_packet->tcp); + packet->udp = offset_ptr(old_base, new_base, old_packet->udp); + packet->icmpv4 = offset_ptr(old_base, new_base, old_packet->icmpv4); + packet->icmpv6 = offset_ptr(old_base, new_base, old_packet->icmpv6); + + packet->tcp_ts_val = offset_ptr(old_base, new_base, + old_packet->tcp_ts_val); + packet->tcp_ts_ecr = offset_ptr(old_base, new_base, + old_packet->tcp_ts_ecr); + packet->echoed_header = old_packet->echoed_header; +} + +/* Make a copy of the given old packet, but in the new copy reserve the + * given number of bytes of headroom at the start of the packet->buffer. + * This empty headroom can later be filled with outer packet headers. + * A slow but simple model. + */ +static struct packet *packet_copy_with_headroom(struct packet *old_packet, + int bytes_headroom) +{ + /* Allocate a new packet and copy link layer header and IP datagram. 
*/ + const int bytes_used = packet_end(old_packet) - old_packet->buffer; + assert(bytes_used >= 0); + assert(bytes_used <= 128*1024); + struct packet *packet = packet_new(bytes_headroom + bytes_used); + u8 *old_base = old_packet->buffer; + u8 *new_base = packet->buffer + bytes_headroom; + + memcpy(new_base, old_base, bytes_used); + + packet_duplicate_info(packet, old_packet, bytes_headroom, 0); + + return packet; +} + +struct packet *packet_copy(struct packet *old_packet) +{ + return packet_copy_with_headroom(old_packet, 0); +} + +/* Finalize all the headers once we know what's inside inner layers. */ +static void packet_finish_encapsulation_headers(struct packet *packet) +{ + int i; + struct header *header = NULL, *next = NULL; + + /* Proceed from inner to outer. */ + for (i = ARRAY_SIZE(packet->headers) - 1; i >= 0; --i, next = header) { + struct header_type_info *type_info = NULL; + + header = &packet->headers[i]; + if (header->type == HEADER_NONE) + continue; + + type_info = header_type_info(header->type); + if (type_info->finish != NULL) + type_info->finish(packet, header, next); + } +} + +struct packet *packet_encapsulate(struct packet *outer, struct packet *inner) +{ + struct packet *packet = NULL; + const int outer_headers = packet_header_count(outer); + const int inner_headers = packet_header_count(inner); + + assert(outer_headers + inner_headers <= PACKET_MAX_HEADERS); + + /* Copy the inner packet bits and header metadata. */ + packet = packet_copy_with_headroom(inner, outer->ip_bytes); + + /* Copy over the bits in the outer headers. */ + memcpy(packet->buffer, outer->buffer, outer->ip_bytes); + + /* Move the inner header metadata to make room for the outer. */ + memmove(packet->headers + outer_headers, packet->headers + 0, + inner_headers * sizeof(struct header)); + + /* Copy over the metadata about the outer headers. 
*/ + packet_copy_headers(packet, outer, 0); + + assert(packet_header_count(packet) == outer_headers + inner_headers); + + packet_finish_encapsulation_headers(packet); + + packet->ip_bytes = outer->ip_bytes + inner->ip_bytes; + + return packet; +} + +struct header_type_info *header_type_info(enum header_t header_type) +{ + assert(header_type > HEADER_NONE); + assert(header_type < HEADER_NUM_TYPES); + assert(ARRAY_SIZE(header_types) == HEADER_NUM_TYPES); + return &header_types[header_type]; +} + +/* Aggregate a list of input packets into a single output packet. */ +struct packet *aggregate_packets(const struct packet_list *head, + const struct packet_list *tail, + int payload_size) +{ + int i; + /* Copy the headers from the last source packet. */ + struct packet *first_packet = head->packet; + struct packet *last_packet = tail->packet; + struct packet *old_packet = last_packet; + /* Allocate a new packet that can accommodate the combined payload */ + int extra_payload = payload_size - packet_payload_len(old_packet); + int headers_len = packet_payload(old_packet) - old_packet->buffer; + int old_packet_size = packet_end(old_packet) - old_packet->buffer; + struct packet *packet = packet_new(old_packet_size + extra_payload); + + u8 *old_base = old_packet->buffer; + u8 *new_base = packet->buffer; + u8 *iter_base = new_base + headers_len; + + DEBUGP("aggregate_packets with combined payload size of %d bytes\n", + payload_size); + memcpy(new_base, old_base, headers_len); + + /* Copy the payload from all the source packets. */ + do { + memcpy(iter_base, packet_payload(head->packet), + packet_payload_len(head->packet)); + iter_base += packet_payload_len(head->packet); + head = head->next; + } while (head != NULL); + + packet_duplicate_info(packet, old_packet, 0, extra_payload); + + /* Adjust header bytes information to account for the larger payload. 
*/ + for (i = 0; i < ARRAY_SIZE(packet->headers); ++i) { + struct header *new_header = &packet->headers[i]; + + if (new_header->type == HEADER_NONE) + break; + new_header->total_bytes += extra_payload; + DEBUGP("%s header starts at %p\n", + header_type_info(new_header->type)->name, + new_header->h.ptr); + /* For TCP header, we must copy the seq number and the cwr flag + * from the first packet. + */ + if (new_header->type == HEADER_TCP) { + assert(packet->tcp != NULL); + assert(first_packet->tcp != NULL); + packet->tcp->seq = first_packet->tcp->seq; + packet->tcp->cwr = first_packet->tcp->cwr; + } + } + packet_finish_encapsulation_headers(packet); + + return packet; +} diff --git a/test/packetdrill/packet.h b/test/packetdrill/packet.h new file mode 100644 index 0000000..aa41104 --- /dev/null +++ b/test/packetdrill/packet.h @@ -0,0 +1,425 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Interface and type declarations for a representation of TCP/IP packets. + * Packets are represented in their wire format. 
+ */ + +#ifndef __PACKET_H__ +#define __PACKET_H__ + +#include "types.h" + +#include +#include "assert.h" +#include "gre.h" +#include "header.h" +#include "icmp.h" +#include "icmpv6.h" +#include "ip.h" +#include "ipv6.h" +#include "tcp.h" +#include "udp.h" +#include "unaligned.h" + +/* The data offset field is 4 bits, and specifies the length of the TCP header, + * including options, in 32-bit words. + */ +#define MAX_TCP_HEADER_BYTES (15*4) + +#define MAX_TCP_DATAGRAM_BYTES (64*1024) /* for sanity-checking */ +#define MAX_UDP_DATAGRAM_BYTES (64*1024) /* for sanity-checking */ + +/* We allow reading pretty big packets, since some interface MTUs can + * be pretty big (the Linux loopback MTU, for example, is typically + * around 16KB). + */ +static const int PACKET_READ_BYTES = 64 * 1024; + +/* Maximum number of headers. */ +#define PACKET_MAX_HEADERS 6 + +/* Maximum number of bytes of headers. */ +#define PACKET_MAX_HEADER_BYTES 256 + +/* TCP/UDP/IPv4 packet, including IPv4 header, TCP/UDP header, and data. There + * may also be a link layer header between the 'buffer' and 'ip' + * pointers, but we typically ignore that. The 'buffer_bytes' field + * gives the total space in the buffer, which may be bigger than the + * actual amount occupied by the packet data. + */ +struct packet { + u8 *buffer; /* data buffer: full contents of packet */ + u32 buffer_bytes; /* bytes of space in data buffer */ + u32 l2_header_bytes; /* bytes in outer hardware/layer-2 header */ + u32 ip_bytes; /* bytes in outermost IP hdrs/payload */ + enum direction_t direction; /* direction packet is traveling */ + + /* Metadata about all the headers in the packet, including all + * layers of encapsulation, from outer to inner, starting from + * the outermost IP header at headers[0]. + */ + struct header headers[PACKET_MAX_HEADERS]; + + /* The following pointers point into the 'buffer' area. Each + * pointer may be NULL if there is no header of that type + * present in the packet. 
In each case these are pointers to + * the innermost header of that kind, since that is where most + * of the interesting TCP/UDP/IP action is. + */ + + /* Layer 3 */ + struct ipv4 *ipv4; /* start of IPv4 header, if present */ + struct ipv6 *ipv6; /* start of IPv6 header, if present */ + + /* Layer 4 */ + struct tcp *tcp; /* start of TCP header, if present */ + struct udp *udp; /* start of UDP header, if present */ + struct icmpv4 *icmpv4; /* start of ICMPv4 header, if present */ + struct icmpv6 *icmpv6; /* start of ICMPv6 header, if present */ + bool echoed_header; /* icmp payload is an echoed header? + This is for TCP/UDP */ + + + s64 time_usecs; /* wall time of receive/send if non-zero */ + + u32 flags; /* various meta-flags */ +#define FLAG_WIN_NOCHECK 0x1 /* don't check TCP receive window */ +#define FLAG_OPTIONS_NOCHECK 0x2 /* don't check TCP options */ + + enum tos_chk_t tos_chk; /* how to treat the TOS byte of a packet */ + + __be32 *tcp_ts_val; /* location of TCP timestamp val, or NULL */ + __be32 *tcp_ts_ecr; /* location of TCP timestamp ecr, or NULL */ + int mss; +}; + +/* A simple list of packets. */ +struct packet_list { + struct packet *packet; /* the packet content */ + struct packet_list *next; /* link to next element, or NULL if last */ +}; + +/* Allocate a packet_list and initialize its fields to NULL. */ +extern struct packet_list *packet_list_new(void); + +/* Free an entire packet list. */ +extern void packet_list_free(struct packet_list *list); + +/* Allocate and initialize a packet. */ +extern struct packet *packet_new(u32 buffer_length); + +/* Free all the memory used by the packet. */ +extern void packet_free(struct packet *packet); + +/* Create a packet that is a copy of the contents of the given packet. */ +extern struct packet *packet_copy(struct packet *old_packet); + +/* Return the number of headers in the given packet. 
*/ +extern int packet_header_count(const struct packet *packet); + +/* Attempt to append a new header to the given packet. Return a + * pointer to the new header metadata, or NULL if we can't add the + * header. + */ +extern struct header *packet_append_header(struct packet *packet, + enum header_t header_type, + int header_bytes); + +/* Return a newly-allocated packet that is a copy of the given inner packet + * but with the given outer packet prepended. + */ +extern struct packet *packet_encapsulate(struct packet *outer, + struct packet *inner); + +/* Aggregate a list of packets into a new packet carrying the combined + * payload and return the newly allocated packet. The head and tail parameters + * point to the first and the last packet, respectively, in the input list. + * payload_size is the payload size for the aggregated packet, equal to the + * summed payload across all the packets in the list. + * The source packets were previously checked to have compatible headers. Copy + * the headers from the last source packet, and update the length fields in all + * the headers to match the combined payload. + */ +extern struct packet *aggregate_packets(const struct packet_list *head, + const struct packet_list *tail, + int payload_size); + +/* Encapsulate a packet and free the original outer and inner packets. */ +static inline struct packet *packet_encapsulate_and_free(struct packet *outer, + struct packet *inner) +{ + struct packet *packet = packet_encapsulate(outer, inner); + packet_free(outer); + packet_free(inner); + return packet; +} + +/* Return the direction in which the given packet is traveling. */ +static inline enum direction_t packet_direction(const struct packet *packet) +{ + return packet->direction; +} + +/* Convenience accessors for peeking around in the packet... */ + +/* Return the address family corresponding to the packet protocol. 
*/ +static inline int packet_address_family(const struct packet *packet) +{ + if (packet->ipv4 != NULL) + return AF_INET; + if (packet->ipv6 != NULL) + return AF_INET6; + return AF_UNSPEC; +} + +/* Return a pointer to the first byte of the outermost IP header. */ +static inline u8 *packet_start(const struct packet *packet) +{ + u8 *start = packet->headers[0].h.ptr; + assert(start != NULL); + return start; +} + +/* Return a pointer to the first byte of the innermost IP header. */ +static inline u8 *ip_start(struct packet *packet) +{ + if (packet->ipv4 != NULL) + return (u8 *)packet->ipv4; + if (packet->ipv6 != NULL) + return (u8 *)packet->ipv6; + assert(!"bad address family"); + return 0; +} + + +/* Return the length in bytes of the IP header for packets of the + * given address family, assuming no IP options. An unsupported + * address family trips the assertion; 0 is returned after it so that + * no path falls off the end of this non-void function. + */ +static inline int ip_header_min_len(int address_family) +{ + if (address_family == AF_INET) + return sizeof(struct ipv4); + else if (address_family == AF_INET6) + return sizeof(struct ipv6); + else + assert(!"bad ip_version in config"); + return 0; +} + +/* Return the layer4 protocol of the packet. */ +static inline int packet_ip_protocol(const struct packet *packet) +{ + if (packet->ipv4 != NULL) + return packet->ipv4->protocol; + if (packet->ipv6 != NULL) + return packet->ipv6->next_header; + assert(!"no valid IP header"); + return 0; +} + +/* Return the length of an optionless TCP or UDP header. */ +static inline int layer4_header_len(int protocol) +{ + if (protocol == IPPROTO_TCP) + return sizeof(struct tcp); + if (protocol == IPPROTO_UDP) + return sizeof(struct udp); + assert(!"bad protocol"); + return 0; +} + +/* Return the length of the TCP header, including options. */ +static inline int packet_tcp_header_len(const struct packet *packet) +{ + assert(packet->tcp); + return packet->tcp->doff * sizeof(u32); +} + +/* Return the length of the UDP header.
*/ +static inline int packet_udp_header_len(const struct packet *packet) +{ + assert(packet->udp); + return sizeof(struct udp); +} + +/* Return the length of the ICMPv4 header. */ +static inline int packet_icmpv4_header_len(const struct packet *packet) +{ + assert(packet->icmpv4); + return sizeof(struct icmpv4); +} + +/* Return the length of the ICMPv6 header. */ +static inline int packet_icmpv6_header_len(const struct packet *packet) +{ + assert(packet->icmpv6); + return sizeof(struct icmpv6); +} + +/* Return the length of the TCP options. */ +static inline int packet_tcp_options_len(const struct packet *packet) +{ + assert(packet->tcp); + return packet_tcp_header_len(packet) - sizeof(*(packet->tcp)); +} + +/* Return a pointer to the TCP options. */ +static inline u8 *packet_tcp_options(struct packet *packet) +{ + assert(packet->tcp); + return (u8 *) (packet->tcp + 1); +} + +static inline u32 packet_tcp_ts_val(const struct packet *packet) +{ + return get_unaligned_be32(packet->tcp_ts_val); +} + +static inline u32 packet_tcp_ts_ecr(const struct packet *packet) +{ + return get_unaligned_be32(packet->tcp_ts_ecr); +} + +static inline void packet_set_tcp_ts_val(struct packet *packet, u32 ts_val) +{ + put_unaligned_be32(ts_val, packet->tcp_ts_val); +} + +static inline void packet_set_tcp_ts_ecr(struct packet *packet, u32 ts_ecr) +{ + put_unaligned_be32(ts_ecr, packet->tcp_ts_ecr); +} + +/* Return a pointer to the TCP/UDP data payload. 
*/ +static inline u8 *packet_payload(const struct packet *packet) +{ + if (packet->tcp) + return ((u8 *) packet->tcp) + packet_tcp_header_len(packet); + if (packet->udp) + return ((u8 *) packet->udp) + packet_udp_header_len(packet); + if (packet->icmpv4) + return ((u8 *) packet->icmpv4) + packet_icmpv4_header_len(packet); + if (packet->icmpv6) + return ((u8 *) packet->icmpv6) + packet_icmpv6_header_len(packet); + + assert(!"no valid payload; not TCP or UDP or ICMP!?"); + return NULL; +} + +/* Return a pointer to the byte beyond the end of the packet. */ +static inline u8 *packet_end(const struct packet *packet) +{ + return packet_start(packet) + packet->ip_bytes; +} + +/* Return the length of the TCP/UDP payload. */ +static inline int packet_payload_len(const struct packet *packet) +{ + return packet_end(packet) - packet_payload(packet); +} + +/* Return the location of the IP header echoed by an ICMP message. */ +static inline u8 *packet_echoed_ip_header(struct packet *packet) +{ + if (packet->icmpv4 != NULL) + return (u8 *)(packet->icmpv4 + 1); + if (packet->icmpv6 != NULL) + return (u8 *)(packet->icmpv6 + 1); + assert(!"no valid icmp header"); + return NULL; +} + +/* Return the location of the IPv4 header echoed by an ICMP message, or NULL. */ +static inline struct ipv4 *packet_echoed_ipv4_header(struct packet *packet) +{ + return (struct ipv4 *)((packet->icmpv4 != NULL) ? + (packet->icmpv4 + 1) : NULL); +} + +/* Return the location of the IPv6 header echoed by an ICMP message, or NULL. */ +static inline struct ipv6 *packet_echoed_ipv6_header(struct packet *packet) +{ + return (struct ipv6 *)((packet->icmpv6 != NULL) ? + (packet->icmpv6 + 1) : NULL); +} + +/* Return the length in bytes of the IP header echoed by an ICMP message. + * For now we do not generate any IP options for echoed IP headers. 
+ */ +static inline int packet_echoed_ip_header_len(struct packet *packet) +{ + if (packet->icmpv4 != NULL) + return sizeof(struct ipv4); + if (packet->icmpv6 != NULL) + return sizeof(struct ipv6); + assert(!"no valid icmp header"); + return 0; +} + +/* Return the layer4 protocol of the packet echoed inside an ICMP packet. */ +static inline int packet_echoed_ip_protocol(struct packet *packet) +{ + if (packet->icmpv4 != NULL) + return packet_echoed_ipv4_header(packet)->protocol; + if (packet->icmpv6 != NULL) + return packet_echoed_ipv6_header(packet)->next_header; + assert(!"no valid icmp header"); + return 0; +} + +/* Return the location of the TCP or UDP header echoed by an ICMP message. */ +static inline u8 *packet_echoed_layer4_header(struct packet *packet) +{ + u8 *echoed_ip = packet_echoed_ip_header(packet); + int ip_header_len = packet_echoed_ip_header_len(packet); + return echoed_ip + ip_header_len; +} + +/* Return the location of the TCP header echoed by an ICMP message. */ +static inline struct tcp *packet_echoed_tcp_header(struct packet *packet) +{ + if (packet_echoed_ip_protocol(packet) == IPPROTO_TCP) + return (struct tcp *)(packet_echoed_layer4_header(packet)); + return NULL; +} + +/* Return the location of the UDP header echoed by an ICMP message. */ +static inline struct udp *packet_echoed_udp_header(struct packet *packet) +{ + if (packet_echoed_ip_protocol(packet) == IPPROTO_UDP) + return (struct udp *)(packet_echoed_layer4_header(packet)); + return NULL; +} + +/* Return the location of the TCP sequence number echoed by an ICMP message. */ +static inline u32 *packet_echoed_tcp_seq(struct packet *packet) +{ + struct tcp *echoed_tcp = packet_echoed_tcp_header(packet); + assert(echoed_tcp); + u32 *seq = &(echoed_tcp->seq); + /* Check that the seq field is actually in the space we + * reserved for the echoed prefix of the TCP header. 
+ */ + assert((char *) (seq + 1) <= (char *) echoed_tcp + ICMP_ECHO_BYTES); + return seq; +} + +#endif /* __PACKET_H__ */ diff --git a/test/packetdrill/packet_checksum.c b/test/packetdrill/packet_checksum.c new file mode 100644 index 0000000..d5164b3 --- /dev/null +++ b/test/packetdrill/packet_checksum.c @@ -0,0 +1,116 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Implementation for a module to checksum TCP/IP packets. + */ + +#include "packet_checksum.h" + +#include "checksum.h" +#include "icmp.h" +#include "icmpv6.h" +#include "ip.h" +#include "ipv6.h" +#include "tcp.h" + +static void checksum_ipv4_packet(struct packet *packet) +{ + struct ipv4 *ipv4 = packet->ipv4; + + /* Fill in IPv4 header checksum. */ + ipv4->check = 0; + ipv4->check = ipv4_checksum(ipv4, ipv4_header_len(ipv4)); + assert(packet->ip_bytes >= ntohs(ipv4->tot_len)); + + /* Find the length of layer 4 header, options, and payload. */ + const int l4_bytes = ntohs(ipv4->tot_len) - ipv4_header_len(ipv4); + assert(l4_bytes > 0); + + /* Fill in IPv4-based layer 4 checksum. 
*/ + if (packet->tcp != NULL) { + struct tcp *tcp = packet->tcp; + tcp->check = 0; + tcp->check = tcp_udp_v4_checksum(ipv4->src_ip, + ipv4->dst_ip, + IPPROTO_TCP, tcp, l4_bytes); + } else if (packet->udp != NULL) { + struct udp *udp = packet->udp; + udp->check = 0; + udp->check = tcp_udp_v4_checksum(ipv4->src_ip, + ipv4->dst_ip, + IPPROTO_UDP, udp, l4_bytes); + } else if (packet->icmpv4 != NULL) { + struct icmpv4 *icmpv4 = packet->icmpv4; + icmpv4->checksum = 0; + icmpv4->checksum = ipv4_checksum(icmpv4, l4_bytes); + } else { + assert(!"not TCP or ICMP"); + } +} + +static void checksum_ipv6_packet(struct packet *packet) +{ + struct ipv6 *ipv6 = packet->ipv6; + + /* IPv6 has no header checksum. */ + /* For now we do not support IPv6 extension headers. */ + assert(packet->ip_bytes >= sizeof(*ipv6) + ntohs(ipv6->payload_len)); + + /* Find the length of layer 4 header, options, and payload. */ + const int l4_bytes = ntohs(ipv6->payload_len); + assert(l4_bytes > 0); + + /* Fill in IPv6-based layer 4 checksum. */ + if (packet->tcp != NULL) { + struct tcp *tcp = packet->tcp; + tcp->check = 0; + tcp->check = tcp_udp_v6_checksum(&ipv6->src_ip, + &ipv6->dst_ip, + IPPROTO_TCP, tcp, l4_bytes); + } else if (packet->udp != NULL) { + struct udp *udp = packet->udp; + udp->check = 0; + udp->check = tcp_udp_v6_checksum(&ipv6->src_ip, + &ipv6->dst_ip, + IPPROTO_UDP, udp, l4_bytes); + } else if (packet->icmpv6 != NULL) { + /* IPv6 ICMP has a pseudo-header checksum, like TCP. 
*/ + struct icmpv6 *icmpv6 = packet->icmpv6; + icmpv6->checksum = 0; + icmpv6->checksum = + tcp_udp_v6_checksum(&ipv6->src_ip, + &ipv6->dst_ip, + IPPROTO_ICMPV6, icmpv6, l4_bytes); + } else { + assert(!"not TCP or ICMP"); + } +} + +void checksum_packet(struct packet *packet) +{ + int address_family = packet_address_family(packet); + if (address_family == AF_INET) + return checksum_ipv4_packet(packet); + else if (address_family == AF_INET6) + return checksum_ipv6_packet(packet); + else + assert(!"bad ip version"); +} diff --git a/test/packetdrill/packet_checksum.h b/test/packetdrill/packet_checksum.h new file mode 100644 index 0000000..2c87df3 --- /dev/null +++ b/test/packetdrill/packet_checksum.h @@ -0,0 +1,33 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Interface for a module to checksum TCP/IP packets. + */ + +#ifndef __PACKET_CHECKSUM_H__ +#define __PACKET_CHECKSUM_H__ + +#include "packet.h" + +/* Fill in layer 3 and layer 4 checksums for the given input 'packet'. 
*/ +extern void checksum_packet(struct packet *packet); + +#endif /* __PACKET_CHECKSUM_H__ */ diff --git a/test/packetdrill/packet_parser.c b/test/packetdrill/packet_parser.c new file mode 100644 index 0000000..f593233 --- /dev/null +++ b/test/packetdrill/packet_parser.c @@ -0,0 +1,625 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Implementation for a module to parse TCP/IP packets. 
+ */ + +#include "packet_parser.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "assert.h" +#include "checksum.h" +#include "ethernet.h" +#include "gre.h" +#include "ip.h" +#include "ip_address.h" +#include "logging.h" +#include "packet.h" +#include "tcp.h" + +static int parse_ipv4(struct packet *packet, u8 *header_start, u8 *packet_end, + char **error); +static int parse_ipv6(struct packet *packet, u8 *header_start, u8 *packet_end, + char **error); +static int parse_mpls(struct packet *packet, u8 *header_start, u8 *packet_end, + char **error); +static int parse_layer3_packet_by_proto(struct packet *packet, + u16 proto, u8 *header_start, + u8 *packet_end, char **error); +static int parse_layer4(struct packet *packet, u8 *header_start, + int layer4_protocol, int layer4_bytes, + u8 *packet_end, char **error); + +static int parse_layer2_packet(struct packet *packet, + u8 *header_start, u8 *packet_end, + char **error) +{ + u8 *p = header_start; + struct ether_header *ether = NULL; + + /* Find Ethernet header */ + if (p + sizeof(*ether) > packet_end) { + asprintf(error, "Ethernet header overflows packet"); + goto error_out; + } + ether = (struct ether_header *)p; + p += sizeof(*ether); + packet->l2_header_bytes = sizeof(*ether); + + return parse_layer3_packet_by_proto(packet, ntohs(ether->ether_type), + p, packet_end, error); + +error_out: + return PACKET_BAD; +} + +static int parse_layer3_packet_by_proto(struct packet *packet, + u16 proto, u8 *header_start, + u8 *packet_end, char **error) +{ + u8 *p = header_start; + + if (proto == ETHERTYPE_IP) { + struct ipv4 *ip = NULL; + + /* Examine IPv4 header. */ + if (p + sizeof(struct ipv4) > packet_end) { + asprintf(error, "IPv4 header overflows packet"); + goto error_out; + } + + /* Look at the IP version number, which is in the first 4 bits + * of both IPv4 and IPv6 packets. 
+ */ + ip = (struct ipv4 *)p; + if (ip->version == 4) { + return parse_ipv4(packet, p, packet_end, error); + } else { + asprintf(error, "Bad IP version (%d) for ETHERTYPE_IP", ip->version); + goto error_out; + } + } else if (proto == ETHERTYPE_IPV6) { + struct ipv6 *ip = NULL; + + /* Examine IPv6 header. */ + if (p + sizeof(struct ipv6) > packet_end) { + asprintf(error, "IPv6 header overflows packet"); + goto error_out; + } + + /* Look at the IP version number, which is in the first 4 bits + * of both IPv4 and IPv6 packets. + */ + ip = (struct ipv6 *)p; + if (ip->version == 6) { + return parse_ipv6(packet, p, packet_end, error); + } else { + asprintf(error, "Bad IP version for ETHERTYPE_IPV6"); + goto error_out; + } + } else if ((proto == ETHERTYPE_MPLS_UC) || + (proto == ETHERTYPE_MPLS_MC)) { + return parse_mpls(packet, p, packet_end, error); + } else { + return PACKET_UNKNOWN_L4; + } + +error_out: + return PACKET_BAD; +} + +static int parse_layer3_packet(struct packet *packet, + u8 *header_start, u8 *packet_end, + char **error) +{ + u8 *p = header_start; + /* Note that packet_end points to the byte beyond the end of packet. */ + struct ipv4 *ip = NULL; + + /* Examine IPv4/IPv6 header. */ + if (p + sizeof(struct ipv4) > packet_end) { + asprintf(error, "IP header overflows packet"); + return PACKET_BAD; + } + + /* Look at the IP version number, which is in the first 4 bits + * of both IPv4 and IPv6 packets. 
+ */ + ip = (struct ipv4 *) (p); + if (ip->version == 4) + return parse_ipv4(packet, p, packet_end, error); + else if (ip->version == 6) + return parse_ipv6(packet, p, packet_end, error); + + asprintf(error, "Unsupported IP version"); + return PACKET_BAD; +} + +int parse_packet(struct packet *packet, int in_bytes, + enum packet_layer_t layer, char **error) +{ + assert(in_bytes <= packet->buffer_bytes); + char *message = NULL; /* human-readable error summary */ + char *hex = NULL; /* hex dump of bad packet */ + enum packet_parse_result_t result = PACKET_BAD; + u8 *header_start = packet->buffer; + /* packet_end points to the byte beyond the end of packet. */ + u8 *packet_end = packet->buffer + in_bytes; + + if (layer == PACKET_LAYER_2_ETHERNET) + result = parse_layer2_packet(packet, header_start, packet_end, + error); + else if (layer == PACKET_LAYER_3_IP) + result = parse_layer3_packet(packet, header_start, packet_end, + error); + else + assert(!"bad layer"); + + if (result != PACKET_BAD) + return result; + + /* Error. Add a packet hex dump to the error string we're returning. */ + hex_dump(packet->buffer, in_bytes, &hex); + message = *error; + asprintf(error, "%s: packet of %d bytes:\n%s", message, in_bytes, hex); + free(message); + free(hex); + + return PACKET_BAD; +} + +/* Parse the IPv4 header and the TCP header inside. Return a + * packet_parse_result_t. + * Note that packet_end points to the byte beyond the end of packet. 
+ */ +static int parse_ipv4(struct packet *packet, u8 *header_start, u8 *packet_end, + char **error) +{ + struct header *ip_header = NULL; + u8 *p = header_start; + const bool is_outer = (packet->ip_bytes == 0); + enum packet_parse_result_t result = PACKET_BAD; + struct ipv4 *ipv4 = (struct ipv4 *) (p); + + const int ip_header_bytes = ipv4_header_len(ipv4); + assert(ip_header_bytes >= 0); + if (ip_header_bytes < sizeof(*ipv4)) { + asprintf(error, "IP header too short"); + goto error_out; + } + if (p + ip_header_bytes > packet_end) { + asprintf(error, "Full IP header overflows packet"); + goto error_out; + } + const int ip_total_bytes = ntohs(ipv4->tot_len); + + if (p + ip_total_bytes > packet_end) { + asprintf(error, "IP payload overflows packet"); + goto error_out; + } + if (ip_header_bytes > ip_total_bytes) { + asprintf(error, "IP header bigger than datagram"); + goto error_out; + } + if (ntohs(ipv4->frag_off) & IP_MF) { /* more fragments? */ + asprintf(error, "More fragments remaining"); + goto error_out; + } + if (ntohs(ipv4->frag_off) & IP_OFFMASK) { /* fragment offset */ + asprintf(error, "Non-zero fragment offset"); + goto error_out; + } + const u16 checksum = ipv4_checksum(ipv4, ip_header_bytes); + if (checksum != 0) { + asprintf(error, "Bad IP checksum"); + goto error_out; + } + + ip_header = packet_append_header(packet, HEADER_IPV4, ip_header_bytes); + if (ip_header == NULL) { + asprintf(error, "Too many nested headers at IPv4 header"); + goto error_out; + } + ip_header->total_bytes = ip_total_bytes; + + /* Move on to the header inside. */ + p += ip_header_bytes; + assert(p <= packet_end); + + if (DEBUG_LOGGING) { + char src_string[ADDR_STR_LEN]; + char dst_string[ADDR_STR_LEN]; + struct ip_address src_ip, dst_ip; + ip_from_ipv4(&ipv4->src_ip, &src_ip); + ip_from_ipv4(&ipv4->dst_ip, &dst_ip); + DEBUGP("src IP: %s\n", ip_to_string(&src_ip, src_string)); + DEBUGP("dst IP: %s\n", ip_to_string(&dst_ip, dst_string)); + } + + /* Examine the L4 header. 
*/ + const int layer4_bytes = ip_total_bytes - ip_header_bytes; + const int layer4_protocol = ipv4->protocol; + result = parse_layer4(packet, p, layer4_protocol, layer4_bytes, + packet_end, error); + + /* If this is the innermost L3 header then this is the primary. */ + if (!packet->ipv4 && !packet->ipv6) + packet->ipv4 = ipv4; + /* If this is the outermost IP header then this is the packet length. */ + if (is_outer) + packet->ip_bytes = ip_total_bytes; + + return result; + +error_out: + return PACKET_BAD; +} + +/* Parse the IPv6 header and the TCP header inside. We do not + * currently support parsing IPv6 extension headers or any layer 4 + * protocol other than TCP. Return a packet_parse_result_t. + * Note that packet_end points to the byte beyond the end of packet. + */ +static int parse_ipv6(struct packet *packet, u8 *header_start, u8 *packet_end, + char **error) +{ + struct header *ip_header = NULL; + u8 *p = header_start; + const bool is_outer = (packet->ip_bytes == 0); + struct ipv6 *ipv6 = (struct ipv6 *) (p); + enum packet_parse_result_t result = PACKET_BAD; + + /* Check that header fits in sniffed packet. */ + const int ip_header_bytes = sizeof(*ipv6); + if (p + ip_header_bytes > packet_end) { + asprintf(error, "IPv6 header overflows packet"); + goto error_out; + } + + /* Check that payload fits in sniffed packet. */ + const int ip_total_bytes = (ip_header_bytes + + ntohs(ipv6->payload_len)); + + if (p + ip_total_bytes > packet_end) { + asprintf(error, "IPv6 payload overflows packet"); + goto error_out; + } + assert(ip_header_bytes <= ip_total_bytes); + + ip_header = packet_append_header(packet, HEADER_IPV6, ip_header_bytes); + if (ip_header == NULL) { + asprintf(error, "Too many nested headers at IPv6 header"); + goto error_out; + } + ip_header->total_bytes = ip_total_bytes; + + /* Move on to the header inside. 
*/ + p += ip_header_bytes; + assert(p <= packet_end); + + if (DEBUG_LOGGING) { + char src_string[ADDR_STR_LEN]; + char dst_string[ADDR_STR_LEN]; + struct ip_address src_ip, dst_ip; + ip_from_ipv6(&ipv6->src_ip, &src_ip); + ip_from_ipv6(&ipv6->dst_ip, &dst_ip); + DEBUGP("src IP: %s\n", ip_to_string(&src_ip, src_string)); + DEBUGP("dst IP: %s\n", ip_to_string(&dst_ip, dst_string)); + } + + /* Examine the L4 header. */ + const int layer4_bytes = ip_total_bytes - ip_header_bytes; + const int layer4_protocol = ipv6->next_header; + result = parse_layer4(packet, p, layer4_protocol, layer4_bytes, + packet_end, error); + + /* If this is the innermost L3 header then this is the primary. */ + if (!packet->ipv4 && !packet->ipv6) + packet->ipv6 = ipv6; + /* If this is the outermost IP header then this is the packet length. */ + if (is_outer) + packet->ip_bytes = ip_total_bytes; + + return result; + +error_out: + return PACKET_BAD; +} + +/* Parse the TCP header. Return a packet_parse_result_t. */ +static int parse_tcp(struct packet *packet, u8 *layer4_start, int layer4_bytes, + u8 *packet_end, char **error) +{ + struct header *tcp_header = NULL; + u8 *p = layer4_start; + + assert(layer4_bytes >= 0); + if (layer4_bytes < sizeof(struct tcp)) { + asprintf(error, "Truncated TCP header"); + goto error_out; + } + packet->tcp = (struct tcp *) p; + const int tcp_header_len = packet_tcp_header_len(packet); + if (tcp_header_len < sizeof(struct tcp)) { + asprintf(error, "TCP data offset too small"); + goto error_out; + } + if (tcp_header_len > layer4_bytes) { + asprintf(error, "TCP data offset too big"); + goto error_out; + } + + tcp_header = packet_append_header(packet, HEADER_TCP, tcp_header_len); + if (tcp_header == NULL) { + asprintf(error, "Too many nested headers at TCP header"); + goto error_out; + } + tcp_header->total_bytes = layer4_bytes; + + p += layer4_bytes; + assert(p <= packet_end); + + DEBUGP("TCP src port: %d\n", ntohs(packet->tcp->src_port)); + DEBUGP("TCP dst port: 
%d\n", ntohs(packet->tcp->dst_port)); + return PACKET_OK; + +error_out: + return PACKET_BAD; +} + +/* Parse the UDP header. Return a packet_parse_result_t. */ +static int parse_udp(struct packet *packet, u8 *layer4_start, int layer4_bytes, + u8 *packet_end, char **error) +{ + struct header *udp_header = NULL; + u8 *p = layer4_start; + + assert(layer4_bytes >= 0); + if (layer4_bytes < sizeof(struct udp)) { + asprintf(error, "Truncated UDP header"); + goto error_out; + } + packet->udp = (struct udp *) p; + const int udp_len = ntohs(packet->udp->len); + const int udp_header_len = sizeof(struct udp); + if (udp_len < udp_header_len) { + asprintf(error, "UDP datagram length too small for UDP header"); + goto error_out; + } + if (udp_len < layer4_bytes) { + asprintf(error, "UDP datagram length too small"); + goto error_out; + } + if (udp_len > layer4_bytes) { + asprintf(error, "UDP datagram length too big"); + goto error_out; + } + + udp_header = packet_append_header(packet, HEADER_UDP, udp_header_len); + if (udp_header == NULL) { + asprintf(error, "Too many nested headers at UDP header"); + goto error_out; + } + udp_header->total_bytes = layer4_bytes; + + p += layer4_bytes; + assert(p <= packet_end); + + DEBUGP("UDP src port: %d\n", ntohs(packet->udp->src_port)); + DEBUGP("UDP dst port: %d\n", ntohs(packet->udp->dst_port)); + return PACKET_OK; + +error_out: + return PACKET_BAD; +} + +/* Parse the ICMP header. Return a packet_parse_result_t. 
*/ +static int parse_icmpv4(struct packet *packet, u8 *layer4_start, int layer4_bytes, + u8 *packet_end, char **error) +{ + struct header *icmp_header = NULL; + u8 *p = layer4_start; + + assert(layer4_bytes >= 0); + const int icmpv4_len = sizeof(struct icmpv4); + if (layer4_bytes < icmpv4_len) { + asprintf(error, "Truncated ICMPv4 header"); + goto error_out; + } + packet->icmpv4 = (struct icmpv4 *) p; + icmp_header = packet_append_header(packet, HEADER_ICMPV4, icmpv4_len); + + if (icmp_header == NULL) { + asprintf(error, "Too many nested headers at ICMP header"); + goto error_out; + } + icmp_header->total_bytes = layer4_bytes; + + p += layer4_bytes; + assert(p <= packet_end); + + DEBUGP("ICMPv4 type: %d\n", packet->icmpv4->type); + DEBUGP("ICMPv4 code: %d\n", packet->icmpv4->code); + return PACKET_OK; + +error_out: + return PACKET_BAD; +} + +static int parse_icmpv6(struct packet *packet, u8 *layer4_start, int layer4_bytes, + u8 *packet_end, char **error) +{ + struct header *icmp_header = NULL; + u8 *p = layer4_start; + + assert(layer4_bytes >= 0); + const int icmpv6_len = sizeof(struct icmpv6); + if (layer4_bytes < icmpv6_len) { + asprintf(error, "Truncated ICMPv6 header"); + goto error_out; + } + packet->icmpv6 = (struct icmpv6 *) p; + icmp_header = packet_append_header(packet, HEADER_ICMPV6, icmpv6_len); + + if (icmp_header == NULL) { + asprintf(error, "Too many nested headers at ICMP header"); + goto error_out; + } + icmp_header->total_bytes = layer4_bytes; + + p += layer4_bytes; + assert(p <= packet_end); + + DEBUGP("ICMPv6 type: %d\n", packet->icmpv6->type); + DEBUGP("ICMPv6 code: %d\n", packet->icmpv6->code); + return PACKET_OK; + +error_out: + return PACKET_BAD; +} + +/* Parse the GRE header. Return a packet_parse_result_t. 
*/ +static int parse_gre(struct packet *packet, u8 *layer4_start, int layer4_bytes, + u8 *packet_end, char **error) +{ + struct header *gre_header = NULL; + u8 *p = layer4_start; + struct gre *gre = (struct gre *) p; + + assert(layer4_bytes >= 0); + if (layer4_bytes < GRE_MINLEN) { + asprintf(error, "Truncated GRE header"); + goto error_out; + } + if (gre->version != 0) { + asprintf(error, "GRE header has unsupported version number"); + goto error_out; + } + if (gre->has_routing) { + asprintf(error, "GRE header has unsupported routing info"); + goto error_out; + } + const int gre_header_len = gre_len(gre); + if (gre_header_len < GRE_MINLEN) { + asprintf(error, "GRE header length too small for GRE header"); + goto error_out; + } + if (gre_header_len > layer4_bytes) { + asprintf(error, "GRE header length too big"); + goto error_out; + } + + assert(p + layer4_bytes <= packet_end); + + DEBUGP("GRE header len: %d\n", gre_header_len); + + gre_header = packet_append_header(packet, HEADER_GRE, gre_header_len); + if (gre_header == NULL) { + asprintf(error, "Too many nested headers at GRE header"); + goto error_out; + } + gre_header->total_bytes = layer4_bytes; + + p += gre_header_len; + assert(p <= packet_end); + return parse_layer3_packet_by_proto(packet, ntohs(gre->proto), + p, packet_end, error); + +error_out: + return PACKET_BAD; +} + +int parse_mpls(struct packet *packet, u8 *header_start, u8 *packet_end, + char **error) +{ + struct header *mpls_header = NULL; + u8 *p = header_start; + int mpls_header_bytes = 0; + int mpls_total_bytes = packet_end - p; + bool is_stack_bottom = false; + + do { + struct mpls *mpls_entry = (struct mpls *)(p); + + if (p + sizeof(struct mpls) > packet_end) { + asprintf(error, "MPLS stack entry overflows packet"); + goto error_out; + } + + is_stack_bottom = mpls_entry_stack(mpls_entry); + + p += sizeof(struct mpls); + mpls_header_bytes += sizeof(struct mpls); + } while (!is_stack_bottom && p < packet_end); + + assert(mpls_header_bytes <= 
mpls_total_bytes); + + mpls_header = packet_append_header(packet, HEADER_MPLS, + mpls_header_bytes); + if (mpls_header == NULL) { + asprintf(error, "Too many nested headers at MPLS header"); + goto error_out; + } + mpls_header->total_bytes = mpls_total_bytes; + + /* Move on to the header inside the MPLS label stack. */ + assert(p <= packet_end); + return parse_layer3_packet(packet, p, packet_end, error); + +error_out: + return PACKET_BAD; +} + +static int parse_layer4(struct packet *packet, u8 *layer4_start, + int layer4_protocol, int layer4_bytes, + u8 *packet_end, char **error) +{ + if (layer4_protocol == IPPROTO_TCP) { + return parse_tcp(packet, layer4_start, layer4_bytes, packet_end, + error); + } else if (layer4_protocol == IPPROTO_UDP) { + return parse_udp(packet, layer4_start, layer4_bytes, packet_end, + error); + } else if (layer4_protocol == IPPROTO_ICMP) { + return parse_icmpv4(packet, layer4_start, layer4_bytes, packet_end, + error); + } else if (layer4_protocol == IPPROTO_ICMPV6) { + return parse_icmpv6(packet, layer4_start, layer4_bytes, packet_end, + error); + } else if (layer4_protocol == IPPROTO_GRE) { + return parse_gre(packet, layer4_start, layer4_bytes, packet_end, + error); + } else if (layer4_protocol == IPPROTO_IPIP) { + return parse_ipv4(packet, layer4_start, packet_end, error); + } else if (layer4_protocol == IPPROTO_IPV6) { + return parse_ipv6(packet, layer4_start, packet_end, error); + } + return PACKET_UNKNOWN_L4; +} diff --git a/test/packetdrill/packet_parser.h b/test/packetdrill/packet_parser.h new file mode 100644 index 0000000..8bd6512 --- /dev/null +++ b/test/packetdrill/packet_parser.h @@ -0,0 +1,53 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Interface for a module to parse TCP/IP packets. + */ + +#ifndef __PACKET_PARSER_H__ +#define __PACKET_PARSER_H__ + +#include "packet.h" + +/* What layer of headers is at the head of the packet? */ +enum packet_layer_t { + PACKET_LAYER_3_IP = 0, /* no layer 2 headers */ + PACKET_LAYER_2_ETHERNET, /* layer 2 is Ethernet */ +}; + +enum packet_parse_result_t { + PACKET_OK, /* no errors detected */ + PACKET_BAD, /* illegal header */ + PACKET_UNKNOWN_L4, /* not TCP or UDP */ +}; + +/* Given an input packet of length 'in_bytes' stored in the buffer + * whose location is given by the packet's 'buffer' field and whose + * full size is given by the 'buffer_bytes' field, parses the packet + * and fills in packet fields 'ip_bytes', 'ip', and 'tcp'. On success, + * returns PACKET_OK; on error, returns an enum packet_parse_result_t error + * code and fills in *error with a human-readable, malloc-allocated + * error message. + */ +int parse_packet(struct packet *packet, int in_bytes, + enum packet_layer_t layer, char **error); + +#endif /* __PACKET_PARSER_H__ */ diff --git a/test/packetdrill/packet_parser_test.c b/test/packetdrill/packet_parser_test.c new file mode 100644 index 0000000..d0d33d9 --- /dev/null +++ b/test/packetdrill/packet_parser_test.c @@ -0,0 +1,484 @@ +/* + * Copyright 2013 Google Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Test for parsing IP packets. + */ + +#include "assert.h" +#include "packet_parser.h" + +#include +#include + +static void test_parse_tcp_ipv4_packet(void) +{ + /* A TCP/IPv4 packet. */ + u8 data[] = { + /* 192.0.2.1:53055 > 192.168.0.1:8080 + * . 
1:1(0) ack 2202903899 win 257 + * + */ + 0x45, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x00, + 0xff, 0x06, 0x39, 0x11, 0xc0, 0x00, 0x02, 0x01, + 0xc0, 0xa8, 0x00, 0x01, 0xcf, 0x3f, 0x1f, 0x90, + 0x00, 0x00, 0x00, 0x01, 0x83, 0x4d, 0xa5, 0x5b, + 0xa0, 0x10, 0x01, 0x01, 0xdb, 0x2d, 0x00, 0x00, + 0x05, 0x0a, 0x83, 0x4d, 0xab, 0x03, 0x83, 0x4d, + 0xb0, 0xab, 0x08, 0x0a, 0x00, 0x00, 0x01, 0x2c, + 0x60, 0xc2, 0x18, 0x20 + }; + + struct packet *packet = packet_new(sizeof(data)); + + /* Populate and parse a packet */ + memcpy(packet->buffer, data, sizeof(data)); + char *error = NULL; + enum packet_parse_result_t result = + parse_packet(packet, sizeof(data), PACKET_LAYER_3_IP, + &error); + assert(result == PACKET_OK); + assert(error == NULL); + + struct ipv4 *expected_ipv4 = (struct ipv4 *)(packet->buffer); + struct tcp *expected_tcp = (struct tcp *)(expected_ipv4 + 1); + + assert(packet->ip_bytes == sizeof(data)); + assert(packet->ipv4 == expected_ipv4); + assert(packet->ipv6 == NULL); + assert(packet->tcp == expected_tcp); + assert(packet->udp == NULL); + assert(packet->icmpv4 == NULL); + assert(packet->icmpv6 == NULL); + + assert(packet->time_usecs == 0); + assert(packet->flags == 0); + + packet_free(packet); +} + +static void test_parse_tcp_ipv6_packet(void) +{ + /* A TCP/IPv6 packet. 
*/ + u8 data[] = { + /* 2001:db8::1:54242 > fd3d:fa7b:d17d::1:8080 + * S 0:0(0) win 32792 + */ + 0x60, 0x00, 0x00, 0x00, 0x00, 0x20, 0x06, 0xff, + 0x20, 0x01, 0x0d, 0xb8, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, + 0xfd, 0x3d, 0xfa, 0x7b, 0xd1, 0x7d, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, + 0xd3, 0xe2, 0x1f, 0x90, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x80, 0x02, 0x80, 0x18, + 0x06, 0x60, 0x00, 0x00, 0x02, 0x04, 0x03, 0xe8, + 0x04, 0x02, 0x01, 0x01, 0x01, 0x03, 0x03, 0x07, + }; + + struct packet *packet = packet_new(sizeof(data)); + + /* Populate and parse a packet */ + memcpy(packet->buffer, data, sizeof(data)); + char *error = NULL; + enum packet_parse_result_t result = + parse_packet(packet, sizeof(data), PACKET_LAYER_3_IP, + &error); + assert(result == PACKET_OK); + assert(error == NULL); + + struct ipv6 *expected_ipv6 = (struct ipv6 *)(packet->buffer); + struct tcp *expected_tcp = (struct tcp *)(expected_ipv6 + 1); + + assert(packet->ip_bytes == sizeof(data)); + assert(packet->ipv4 == NULL); + assert(packet->ipv6 == expected_ipv6); + assert(packet->tcp == expected_tcp); + assert(packet->udp == NULL); + assert(packet->icmpv4 == NULL); + assert(packet->icmpv6 == NULL); + + assert(packet->time_usecs == 0); + assert(packet->flags == 0); + + packet_free(packet); +} + +static void test_parse_udp_ipv4_packet(void) +{ + /* A UDP/IPv4 packet. 
*/ + u8 data[] = { + /* 192.0.2.1.8080 > 192.168.0.1.57845: UDP, length 4 */ + 0x45, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, + 0xff, 0x11, 0x39, 0x22, 0xc0, 0x00, 0x02, 0x01, + 0xc0, 0xa8, 0x00, 0x01, 0x1f, 0x90, 0xe1, 0xf5, + 0x00, 0x0c, 0x7b, 0xa5, 0x00, 0x00, 0x00, 0x00, + }; + + struct packet *packet = packet_new(sizeof(data)); + + /* Populate and parse a packet */ + memcpy(packet->buffer, data, sizeof(data)); + char *error = NULL; + enum packet_parse_result_t result = + parse_packet(packet, sizeof(data), PACKET_LAYER_3_IP, + &error); + assert(result == PACKET_OK); + assert(error == NULL); + + struct ipv4 *expected_ipv4 = (struct ipv4 *)(packet->buffer); + struct udp *expected_udp = (struct udp *)(expected_ipv4 + 1); + + assert(packet->ip_bytes == sizeof(data)); + assert(packet->ipv4 == expected_ipv4); + assert(packet->ipv6 == NULL); + assert(packet->tcp == NULL); + assert(packet->udp == expected_udp); + assert(packet->icmpv4 == NULL); + assert(packet->icmpv6 == NULL); + + assert(packet->time_usecs == 0); + assert(packet->flags == 0); + + packet_free(packet); +} + + +static void test_parse_udp_ipv6_packet(void) +{ + /* A UDP/IPv6 packet. 
*/ + u8 data[] = { + /* 2001:db8::1.8080 > fd3d:fa7b:d17d::1.51557: UDP, length 4 */ + 0x60, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x11, 0xff, + 0x20, 0x01, 0x0d, 0xb8, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, + 0xfd, 0x3d, 0xfa, 0x7b, 0xd1, 0x7d, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, + 0x1f, 0x90, 0xc9, 0x65, 0x00, 0x0c, 0x1f, 0xee, + 0x00, 0x00, 0x00, 0x00, + }; + + struct packet *packet = packet_new(sizeof(data)); + + /* Populate and parse a packet */ + memcpy(packet->buffer, data, sizeof(data)); + char *error = NULL; + enum packet_parse_result_t result = + parse_packet(packet, sizeof(data), PACKET_LAYER_3_IP, + &error); + assert(result == PACKET_OK); + assert(error == NULL); + + struct ipv6 *expected_ipv6 = (struct ipv6 *)(packet->buffer); + struct udp *expected_udp = (struct udp *)(expected_ipv6 + 1); + + assert(packet->ip_bytes == sizeof(data)); + assert(packet->ipv4 == NULL); + assert(packet->ipv6 == expected_ipv6); + assert(packet->tcp == NULL); + assert(packet->udp == expected_udp); + assert(packet->icmpv4 == NULL); + assert(packet->icmpv6 == NULL); + + assert(packet->time_usecs == 0); + assert(packet->flags == 0); + + packet_free(packet); +} + +static void test_parse_ipv4_gre_ipv4_tcp_packet(void) +{ + u8 *p = NULL; + int i = 0; + + /* An IPv4/GRE/IPv4/TCP packet. */ + u8 data[] = { + /* IP 2.2.2.2 > 1.1.1.1: GREv0, length 48: + IP 192.0.2.1.47078 > 192.168.0.1.8080: + . 
2:6(4) ack 1 win 123 */ + 0x45, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x00, + 0xff, 0x2f, 0xb5, 0x85, 0x02, 0x02, 0x02, 0x02, + 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x08, 0x00, + 0x45, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x00, + 0xff, 0x06, 0x39, 0x21, 0xc0, 0x00, 0x02, 0x01, + 0xc0, 0xa8, 0x00, 0x01, 0xb7, 0xe6, 0x1f, 0x90, + 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, + 0x50, 0x10, 0x00, 0x7b, 0x55, 0x31, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00 + }; + + struct packet *packet = packet_new(sizeof(data)); + + /* Populate and parse a packet */ + memcpy(packet->buffer, data, sizeof(data)); + char *error = NULL; + enum packet_parse_result_t result = + parse_packet(packet, sizeof(data), PACKET_LAYER_3_IP, + &error); + assert(result == PACKET_OK); + assert(error == NULL); + + p = packet->buffer; + i = 0; /* outer most layer, 0 */ + + assert(packet->headers[i].type == HEADER_IPV4); + assert(packet->headers[i].h.ptr == p); + assert(packet->headers[i].header_bytes == sizeof(struct ipv4)); + p += packet->headers[i].header_bytes; + i++; + + assert(packet->headers[i].type == HEADER_GRE); + assert(packet->headers[i].h.ptr == p); + assert(packet->headers[i].header_bytes == GRE_MINLEN); + p += packet->headers[i].header_bytes; + i++; + + struct ipv4 *expected_inner_ipv4 = (struct ipv4 *)p; + assert(packet->headers[i].type == HEADER_IPV4); + assert(packet->headers[i].h.ptr == p); + assert(packet->headers[i].header_bytes == sizeof(struct ipv4)); + p += packet->headers[i].header_bytes; + i++; + + struct tcp *expected_tcp = (struct tcp *)p; + assert(packet->headers[i].type == HEADER_TCP); + assert(packet->headers[i].h.ptr == p); + assert(packet->headers[i].header_bytes == sizeof(struct tcp)); + p += packet->headers[i].header_bytes; + i++; + + assert(packet->headers[i].type == HEADER_NONE); + + assert(packet->ip_bytes == sizeof(data)); + assert(packet->ipv4 == expected_inner_ipv4); + assert(packet->ipv6 == NULL); + assert(packet->tcp == expected_tcp); + assert(packet->udp == NULL); + 
assert(packet->icmpv4 == NULL); + assert(packet->icmpv6 == NULL); + + assert(packet->time_usecs == 0); + assert(packet->flags == 0); + + packet_free(packet); +} + +static void test_parse_ipv4_gre_mpls_ipv4_tcp_packet(void) +{ + u8 *p = NULL; + int i = 0; + + /* An IPv4/GRE/MPLS/IPv4/TCP packet. */ + u8 data[] = { + /* ipv4 192.168.0.1 > 192.0.2.2: gre: + mpls + (label 0, tc 0, ttl 0) + (label 1048575, tc 7, [S], ttl 255): + 192.168.0.1:8080 > 192.0.2.1:56268 + F. 2072102268:2072102268(0) ack 1 win 453 + + */ + + /* IPv4: */ + 0x45, 0x00, 0x00, 0x54, 0x00, 0x00, 0x40, 0x00, + 0x40, 0x2f, 0xb7, 0xcf, 0xc0, 0xa8, 0x00, 0x01, + 0xc0, 0x00, 0x02, 0x02, + /* GRE: */ + 0x00, 0x00, 0x88, 0x47, + /* MPLS: */ + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, + /* IPv4, TCP: */ + 0x45, 0x00, 0x00, 0x34, 0x86, 0x99, 0x40, 0x00, + 0x40, 0x06, 0x31, 0x80, 0xc0, 0xa8, 0x00, 0x01, + 0xc0, 0x00, 0x02, 0x01, 0x1f, 0x90, 0xdb, 0xcc, + 0x7b, 0x81, 0xc5, 0x7c, 0x00, 0x00, 0x00, 0x01, + 0x80, 0x11, 0x01, 0xc5, 0xa6, 0xa6, 0x00, 0x00, + 0x01, 0x01, 0x08, 0x0a, 0x07, 0x02, 0x08, 0x43, + 0x00, 0x00, 0x00, 0x05 + }; + + struct packet *packet = packet_new(sizeof(data)); + + /* Populate and parse a packet */ + memcpy(packet->buffer, data, sizeof(data)); + char *error = NULL; + enum packet_parse_result_t result = + parse_packet(packet, sizeof(data), PACKET_LAYER_3_IP, + &error); + assert(result == PACKET_OK); + assert(error == NULL); + + p = packet->buffer; + i = 0; /* outer most layer, 0 */ + + assert(packet->headers[i].type == HEADER_IPV4); + assert(packet->headers[i].h.ptr == p); + assert(packet->headers[i].header_bytes == sizeof(struct ipv4)); + p += packet->headers[i].header_bytes; + i++; + + assert(packet->headers[i].type == HEADER_GRE); + assert(packet->headers[i].h.ptr == p); + assert(packet->headers[i].header_bytes == GRE_MINLEN); + p += packet->headers[i].header_bytes; + i++; + + assert(packet->headers[i].type == HEADER_MPLS); + assert(packet->headers[i].h.ptr == p); + 
assert(packet->headers[i].header_bytes == 2*sizeof(struct mpls)); + p += packet->headers[i].header_bytes; + i++; + + struct ipv4 *expected_inner_ipv4 = (struct ipv4 *)p; + assert(packet->headers[i].type == HEADER_IPV4); + assert(packet->headers[i].h.ptr == p); + assert(packet->headers[i].header_bytes == sizeof(struct ipv4)); + p += packet->headers[i].header_bytes; + i++; + + struct tcp *expected_tcp = (struct tcp *)p; + assert(packet->headers[i].type == HEADER_TCP); + assert(packet->headers[i].h.ptr == p); + assert(packet->headers[i].header_bytes == + sizeof(struct tcp) + TCPOLEN_TIMESTAMP + 2); /* 2 for 2 NOPs */ + p += packet->headers[i].header_bytes; + i++; + + assert(packet->headers[i].type == HEADER_NONE); + + assert(packet->ip_bytes == sizeof(data)); + assert(packet->ipv4 == expected_inner_ipv4); + assert(packet->ipv6 == NULL); + assert(packet->tcp == expected_tcp); + assert(packet->udp == NULL); + assert(packet->icmpv4 == NULL); + assert(packet->icmpv6 == NULL); + + assert(packet->time_usecs == 0); + assert(packet->flags == 0); + + packet_free(packet); +} + +static void test_parse_icmpv4_packet(void) +{ + /* An ICMPv4 packet. 
*/ + u8 data[] = { + /* 192.168.1.101:0 > 192.168.1.103:0 + * icmpv4 echo request, id 10960, seq 1, length 8 + */ + 0x45, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x40, 0x00, + 0x40, 0x01, 0xb6, 0xc4, 0xc0, 0xa8, 0x01, 0x65, + 0xc0, 0xa8, 0x01, 0x67, 0x08, 0x00, 0xcd, 0x2e, + 0x2a, 0xd0, 0x00, 0x01, + }; + + struct packet *packet = packet_new(sizeof(data)); + + /* Populate and parse a packet */ + memcpy(packet->buffer, data, sizeof(data)); + char *error = NULL; + enum packet_parse_result_t result = + parse_packet(packet, sizeof(data), PACKET_LAYER_3_IP, + &error); + assert(result == PACKET_OK); + assert(error == NULL); + + struct ipv4 *expected_ipv4 = (struct ipv4 *)(packet->buffer); + struct icmpv4 *expected_icmpv4 = (struct icmpv4 *)(expected_ipv4 + 1); + + assert(packet->ip_bytes == sizeof(data)); + assert(packet->ipv4 == expected_ipv4); + assert(packet->ipv6 == NULL); + assert(packet->tcp == NULL); + assert(packet->udp == NULL); + assert(packet->icmpv4 == expected_icmpv4); + assert(packet->icmpv6 == NULL); + + assert(packet->time_usecs == 0); + assert(packet->flags == 0); + + packet_free(packet); +} + +static void test_parse_icmpv6_packet(void) +{ + /* An ICMPv6 packet. 
*/ + u8 data[] = { + /* IP6 fd6b:6bbb:34a1::2 > fd6b:6bbb:34a1::1: ICMP6, + * echo request, seq 1, length 64 + */ + /* IPv6: */ + 0x60, 0x00, 0x00, 0x00, 0x00, 0x40, 0x3a, 0x40, + 0xfd, 0x6b, 0x6b, 0xbb, 0x34, 0xa1, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, + 0xfd, 0x6b, 0x6b, 0xbb, 0x34, 0xa1, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, + /* ICMPv6: Echo Request */ + 0x80, 0x00, 0xb7, 0x44, 0x74, 0x7f, 0x00, 0x01, + 0x08, 0xb7, 0xc9, 0x52, 0x4d, 0x1f, 0x0e, 0x00, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37 + }; + + struct packet *packet = packet_new(sizeof(data)); + + /* Populate and parse a packet */ + memcpy(packet->buffer, data, sizeof(data)); + char *error = NULL; + enum packet_parse_result_t result = + parse_packet(packet, sizeof(data), PACKET_LAYER_3_IP, + &error); + assert(result == PACKET_OK); + assert(error == NULL); + + struct ipv6 *expected_ipv6 = (struct ipv6 *)(packet->buffer); + struct icmpv6 *expected_icmpv6 = (struct icmpv6 *)(expected_ipv6 + 1); + + assert(packet->ip_bytes == sizeof(data)); + assert(packet->ipv4 == NULL); + assert(packet->ipv6 == expected_ipv6); + assert(packet->tcp == NULL); + assert(packet->udp == NULL); + assert(packet->icmpv4 == NULL); + assert(packet->icmpv6 == expected_icmpv6); + + assert(packet->time_usecs == 0); + assert(packet->flags == 0); + + packet_free(packet); +} + +int main(void) +{ + test_parse_tcp_ipv4_packet(); + test_parse_tcp_ipv6_packet(); + test_parse_udp_ipv4_packet(); + test_parse_udp_ipv6_packet(); + test_parse_ipv4_gre_ipv4_tcp_packet(); + test_parse_ipv4_gre_mpls_ipv4_tcp_packet(); + test_parse_icmpv4_packet(); + test_parse_icmpv6_packet(); + + return 0; +} diff --git a/test/packetdrill/packet_socket.h 
b/test/packetdrill/packet_socket.h new file mode 100644 index 0000000..a2defd3 --- /dev/null +++ b/test/packetdrill/packet_socket.h @@ -0,0 +1,69 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Platform-independent API to read and write raw packets. + * + * We allocate and configure things much like tcpdump. We do this so + * we can get timestamps on the outbound packets the kernel sends, to + * verify the correct timing (tun devices do not take timestamps). + */ + +#ifndef __PACKET_SOCKET_H__ +#define __PACKET_SOCKET_H__ + +#include "types.h" + +#include "ethernet.h" +#include "ip_address.h" +#include "packet.h" + +struct packet_socket; + +/* Allocate and initialize a packet socket. */ +extern struct packet_socket *packet_socket_new(const char *device_name); + +/* Free all the memory used by the packet socket. */ +extern void packet_socket_free(struct packet_socket *packet_socket); + +/* Add a filter so we only sniff packets we want. */ +extern void packet_socket_set_filter( + struct packet_socket *psock, + const struct ether_addr *client_ether_addr, + const struct ip_address *client_live_ip); + +/* Send the given packet using writev. 
Return STATUS_OK on success, + * or STATUS_ERR if writev returns an error. + */ +extern int packet_socket_writev(struct packet_socket *psock, + const struct iovec *iov, int iovcnt); + +/* Do a blocking sniff of the next packet going over the given device + * in the given direction, fill in the given packet with the sniffed + * packet info, and return the number of bytes in the packet in + * *in_bytes. If we successfully read a matching packet, return + * STATUS_OK; else return STATUS_ERR (in which case the caller can + * retry). + */ +extern int packet_socket_receive(struct packet_socket *psock, + enum direction_t direction, + struct packet *packet, int *in_bytes); + +#endif /* __PACKET_SOCKET_H__ */ diff --git a/test/packetdrill/packet_socket_linux.c b/test/packetdrill/packet_socket_linux.c new file mode 100644 index 0000000..a1f49e2 --- /dev/null +++ b/test/packetdrill/packet_socket_linux.c @@ -0,0 +1,280 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * API to read and write raw packets implemented using Linux packet socket. 
+ */ + +#include "packet_socket.h" + +#include +#include +#include +#include +#include +#include +#include + +#ifdef linux + +#include +#include + +#include "assert.h" +#include "ethernet.h" +#include "logging.h" + +/* Number of bytes to buffer in the packet socket we use for sniffing. */ +static const int PACKET_SOCKET_RCVBUF_BYTES = 2*1024*1024; + +struct packet_socket { + int packet_fd; /* socket for sending, sniffing timestamped packets */ + char *name; /* malloc-allocated copy of interface name */ + int index; /* interface index from if_nametoindex */ +}; + +/* Set the receive buffer for a socket to the given size in bytes. */ +static void set_receive_buffer_size(int fd, int bytes) +{ + if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &bytes, sizeof(bytes)) < 0) + die_perror("setsockopt SOL_SOCKET SO_RCVBUF"); +} + +/* Bind the packet socket with the given fd to the given interface. */ +static void bind_to_interface(int fd, int interface_index) +{ + struct sockaddr_ll sll; + memset(&sll, 0, sizeof(sll)); + sll.sll_family = AF_PACKET; + sll.sll_ifindex = interface_index; + sll.sll_protocol = htons(ETH_P_ALL); + + if (bind(fd, (struct sockaddr *)&sll, sizeof(sll)) < 0) + die_perror("bind packet socket"); +} + +/* Allocate and configure a packet socket just like the one tcpdump + * uses. We do this so we can get timestamps on the outbound packets + * the kernel sends, to verify the correct timing (tun devices do not + * take timestamps). To reduce CPU load and filtering complexity, we + * bind the socket to a single device so we only receive packets for + * that device. 
+ */ +static void packet_socket_setup(struct packet_socket *psock) +{ + struct timeval tv; + + psock->packet_fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); + if (psock->packet_fd < 0) + die_perror("socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL))"); + + psock->index = if_nametoindex(psock->name); + if (psock->index == 0) + die_perror("if_nametoindex"); + DEBUGP("device index: %s -> %d\n", psock->name, psock->index); + + bind_to_interface(psock->packet_fd, psock->index); + + set_receive_buffer_size(psock->packet_fd, PACKET_SOCKET_RCVBUF_BYTES); + + /* Pay the non-trivial latency cost to enable timestamps now, before + * the test starts, to avoid significant delays in the middle of tests. + */ + ioctl(psock->packet_fd, SIOCGSTAMP, &tv); +} + +/* Add a filter so we only sniff packets we want. */ +void packet_socket_set_filter(struct packet_socket *psock, + const struct ether_addr *client_ether_addr, + const struct ip_address *client_live_ip) +{ + const u8 *client_ether = client_ether_addr->ether_addr_octet; + + struct sock_fprog bpfcode; + struct sock_filter bpf_ipv4_src[] = { + /* this filter works for ethernet interfaces: */ + /* tcpdump -p -n -s 0 -i lo -dd + * "ether src 11:22:33:44:55:66 and ip src 1.2.3.4" + */ + { 0x20, 0, 0, 0x00000008 }, + { 0x15, 0, 7, 0x33445566 }, /* ether: 33:44:55:66 */ + { 0x28, 0, 0, 0x00000006 }, + { 0x15, 0, 5, 0x00001122 }, /* ether: 11:22 */ + { 0x28, 0, 0, 0x0000000c }, + { 0x15, 0, 3, 0x00000800 }, + { 0x20, 0, 0, 0x0000001a }, + { 0x15, 0, 1, 0x01020304 }, /* IPv4: 1.2.3.4 */ + { 0x6, 0, 0, 0x0000ffff }, + { 0x6, 0, 0, 0x00000000 }, + }; + struct sock_filter bpf_ipv6_src[] = { + /* this filter works for ethernet interfaces: */ + /* tcpdump -p -n -s 0 -i lo -dd + * "ether src 11:22:33:44:55:66 and ip6 src 1:2:3:4:5:6:7:8" */ + { 0x20, 0, 0, 0x00000008 }, + { 0x15, 0, 13, 0x33445566 }, /* ether: 33:44:55:66 */ + { 0x28, 0, 0, 0x00000006 }, + { 0x15, 0, 11, 0x00001122 }, /* ether: 11:22 */ + { 0x28, 0, 0, 0x0000000c }, + { 0x15, 
0, 9, 0x000086dd }, + { 0x20, 0, 0, 0x00000016 }, + { 0x15, 0, 7, 0x00010002 }, /* IPv6: 1:2 */ + { 0x20, 0, 0, 0x0000001a }, + { 0x15, 0, 5, 0x00030004 }, /* IPv6: 3:4 */ + { 0x20, 0, 0, 0x0000001e }, + { 0x15, 0, 3, 0x00050006 }, /* IPv6: 5:6 */ + { 0x20, 0, 0, 0x00000022 }, + { 0x15, 0, 1, 0x00070008 }, /* IPv6: 7:8 */ + { 0x6, 0, 0, 0x0000ffff }, + { 0x6, 0, 0, 0x00000000 }, + }; + + if (client_live_ip->address_family == AF_INET) { + /* Fill in the client-side IPv4 address to look for. */ + bpf_ipv4_src[7].k = ntohl(client_live_ip->ip.v4.s_addr); + + bpfcode.len = ARRAY_SIZE(bpf_ipv4_src); + bpfcode.filter = bpf_ipv4_src; + } else if (client_live_ip->address_family == AF_INET6) { + /* Fill in the client-side IPv6 address to look for, one 32-bit word per compare instruction. */ + bpf_ipv6_src[7].k = ntohl(client_live_ip->ip.v6.s6_addr32[0]); + bpf_ipv6_src[9].k = ntohl(client_live_ip->ip.v6.s6_addr32[1]); + bpf_ipv6_src[11].k = ntohl(client_live_ip->ip.v6.s6_addr32[2]); + bpf_ipv6_src[13].k = ntohl(client_live_ip->ip.v6.s6_addr32[3]); + + bpfcode.len = ARRAY_SIZE(bpf_ipv6_src); + bpfcode.filter = bpf_ipv6_src; + } else { + assert(!"bad address family"); + } + + /* Fill in the client-side ethernet address to look for: + * filter[1] receives octets 2-5 and filter[3] receives octets 0-1 of the selected program. + * NOTE(review): client_ether[2] << 24 is presumably evaluated as int after promotion, so an octet >= 0x80 shifts into the sign bit -- confirm this is acceptable or cast first. + */ + bpfcode.filter[1].k = ((client_ether[2] << 24) | + (client_ether[3] << 16) | + (client_ether[4] << 8) | + (client_ether[5])); + bpfcode.filter[3].k = ((client_ether[0] << 8) | + (client_ether[1])); + + if (DEBUG_LOGGING) { + int i; + DEBUGP("filter constants:\n"); + for (i = 0; i < bpfcode.len; ++i) + DEBUGP("0x%x\n", bpfcode.filter[i].k); + } + + /* Attach the filter. 
*/ + if (setsockopt(psock->packet_fd, SOL_SOCKET, SO_ATTACH_FILTER, + &bpfcode, sizeof(bpfcode)) < 0) { + die_perror("setsockopt SOL_SOCKET, SO_ATTACH_FILTER"); + } +} + +/* Allocate a zeroed packet_socket, copy device_name into it and run packet_socket_setup() on it. */ struct packet_socket *packet_socket_new(const char *device_name) +{ + /* NOTE(review): the calloc() and strdup() results are used without a NULL check. */ struct packet_socket *psock = calloc(1, sizeof(struct packet_socket)); + + psock->name = strdup(device_name); + psock->packet_fd = -1; + + packet_socket_setup(psock); + + return psock; +} + +/* Close the fd if one is open, free the name copy, then zero and free the struct. */ void packet_socket_free(struct packet_socket *psock) +{ + if (psock->packet_fd >= 0) + close(psock->packet_fd); + + if (psock->name != NULL) + free(psock->name); + + memset(psock, 0, sizeof(*psock)); /* paranoia to catch bugs */ + free(psock); +} + +/* Send iov with a single writev() call; only a negative return is reported as STATUS_ERR. */ int packet_socket_writev(struct packet_socket *psock, + const struct iovec *iov, int iovcnt) +{ + if (writev(psock->packet_fd, iov, iovcnt) < 0) { + perror("writev"); + return STATUS_ERR; + } + return STATUS_OK; +} + +/* Read one packet with recvfrom(); returns STATUS_ERR on EINTR, on a direction mismatch, or for a packet from another interface index. */ int packet_socket_receive(struct packet_socket *psock, + enum direction_t direction, + struct packet *packet, int *in_bytes) +{ + struct sockaddr_ll from; + memset(&from, 0, sizeof(from)); + socklen_t from_len = sizeof(from); + + /* Read the packet out of our kernel packet socket buffer. */ + *in_bytes = recvfrom(psock->packet_fd, + packet->buffer, packet->buffer_bytes, 0, + (struct sockaddr *)&from, &from_len); + assert(*in_bytes <= packet->buffer_bytes); + if (*in_bytes < 0) { + if (errno == EINTR) { + DEBUGP("EINTR\n"); + return STATUS_ERR; + } else { + die_perror("packet socket recvfrom()"); + } + } + + /* We only want packets our kernel is sending out. */ + if (direction == DIRECTION_OUTBOUND && + from.sll_pkttype != PACKET_OUTGOING) { + DEBUGP("not outbound\n"); + return STATUS_ERR; + } + if (direction == DIRECTION_INBOUND && + from.sll_pkttype != PACKET_HOST) { + DEBUGP("not inbound\n"); + return STATUS_ERR; + } + + /* We only want packets on our tun device. 
The kernel + * can put packets for other devices in our receive + * buffer before we bind the packet socket to the tun + * device. + */ + if (from.sll_ifindex != psock->index) { + DEBUGP("not correct index\n"); + return STATUS_ERR; + } + + /* Get the time at which the kernel sniffed the packet. */ + struct timeval tv; + if (ioctl(psock->packet_fd, SIOCGSTAMP, &tv) < 0) + die_perror("SIOCGSTAMP"); + packet->time_usecs = timeval_to_usecs(&tv); + DEBUGP("sniffed packet sent at %u.%u = %lld\n", + (u32)tv.tv_sec, (u32)tv.tv_usec, + packet->time_usecs); + + return STATUS_OK; +} + +#endif /* linux */ diff --git a/test/packetdrill/packet_socket_pcap.c b/test/packetdrill/packet_socket_pcap.c new file mode 100644 index 0000000..bedef71 --- /dev/null +++ b/test/packetdrill/packet_socket_pcap.c @@ -0,0 +1,290 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * API to read and write raw packets implemented using pcap. 
+ */ + +#include "packet_socket.h" + +#include +#include +#include +#include +#include +#include +#include + +#ifdef USE_LIBPCAP + +#if defined(__FreeBSD__) +#include +#elif defined(__OpenBSD__) || defined(__NetBSD__) +#include +#endif + +#include "assert.h" +#include "ethernet.h" +#include "logging.h" + +struct packet_socket { + char *name; /* malloc-allocated copy of interface name */ + + pcap_t *pcap; /* handle for sending, sniffing timestamped packets */ + char pcap_error[PCAP_ERRBUF_SIZE]; /* for libpcap errors */ + int pcap_offset; /* offset of packet data in pcap buffer */ +}; + +#if defined(__OpenBSD__) +#include +/* Convert a bpf_timeval to microseconds. */ +static inline s64 bpf_timeval_to_usecs(const struct bpf_timeval *tv) +{ + return ((s64)tv->tv_sec) * 1000000LL + (s64)tv->tv_usec; +} +#endif /* defined(__OpenBSD__) */ + +/* Call pcap_perror() and then exit with a failure status code. + * NOTE(review): packet_socket_setup() passes a NULL handle here when + * pcap_create() fails; the text for that failure is written to + * psock->pcap_error, so pcap_perror() presumably cannot report it -- confirm. + */ +extern void die_pcap_perror(pcap_t *pcap, char *message) +{ + pcap_perror(pcap, message); + + exit(EXIT_FAILURE); +} + +static void packet_socket_setup(struct packet_socket *psock) +{ + int data_link = -1, bpf_fd = -1, val = -1; + + DEBUGP("calling pcap_create() with %s\n", psock->name); + psock->pcap = pcap_create(psock->name, psock->pcap_error); + if (psock->pcap == NULL) + die_pcap_perror(psock->pcap, "pcap_create"); + + if (pcap_set_snaplen(psock->pcap, PACKET_READ_BYTES) != 0) + die_pcap_perror(psock->pcap, "pcap_set_snaplen"); + + if (pcap_activate(psock->pcap) != 0) + die_pcap_perror(psock->pcap, + "pcap_activate " + "(OpenBSD: another process (tcpdump?) " + "using bpf0?)"); + + bpf_fd = pcap_get_selectable_fd(psock->pcap); + if (bpf_fd < 0) + die_pcap_perror(psock->pcap, "pcap_get_selectable_fd"); + + /* By default libpcap with BPF waits until a read buffer fills + * up before returning any packets. We use BIOCIMMEDIATE to + * force the BPF device to return the first packet + * immediately. 
+ */ + val = 1; + if (ioctl(bpf_fd, BIOCIMMEDIATE, &val) < 0) + die_perror("ioctl BIOCIMMEDIATE on bpf fd"); + + /* Find data link type. */ + data_link = pcap_datalink(psock->pcap); + DEBUGP("data_link: %d\n", data_link); + + /* Based on the data_link type, calculate the offset of the + * packet data in the buffer: DLT_LOOP and DLT_NULL use an + * offset of 4 bytes, DLT_EN10MB, DLT_SLIP and DLT_RAW use 0, + * and any other type is fatal. + */ + switch (data_link) { + case DLT_EN10MB: + psock->pcap_offset = 0; + break; + case DLT_LOOP: + case DLT_NULL: + psock->pcap_offset = 4; + break; + case DLT_SLIP: + case DLT_RAW: + psock->pcap_offset = 0; + break; + default: + die("Unknown data_link type %d\n", data_link); + break; + } +} + +/* Add a filter so we only sniff packets we want: frames whose + * ethernet source is client_ether_addr and whose ip or ip6 source + * is client_live_ip, expressed as a pcap filter string. + * NOTE(review): the asprintf() return value is not checked, so + * filter_str is used even if the allocation failed. + */ +void packet_socket_set_filter(struct packet_socket *psock, + const struct ether_addr *client_ether_addr, + const struct ip_address *client_live_ip) +{ + const u8 *client_ether = client_ether_addr->ether_addr_octet; + struct bpf_program bpf_code; + char *filter_str = NULL; + char client_live_ip_string[ADDR_STR_LEN]; + + ip_to_string(client_live_ip, client_live_ip_string); + + asprintf(&filter_str, + "ether src %02x:%02x:%02x:%02x:%02x:%02x and %s src %s", + client_ether[0], + client_ether[1], + client_ether[2], + client_ether[3], + client_ether[4], + client_ether[5], + client_live_ip->address_family == AF_INET6 ? 
"ip6" : "ip", + client_live_ip_string); + + DEBUGP("setting BPF filter: %s\n", filter_str); + + if (pcap_compile(psock->pcap, &bpf_code, filter_str, 1, 0) != 0) + die_pcap_perror(psock->pcap, "pcap_compile"); + + if (pcap_setfilter(psock->pcap, &bpf_code) != 0) + die_pcap_perror(psock->pcap, "pcap_setfilter"); + + pcap_freecode(&bpf_code); + free(filter_str); +} + +struct packet_socket *packet_socket_new(const char *device_name) +{ + struct packet_socket *psock = calloc(1, sizeof(struct packet_socket)); + + psock->name = strdup(device_name); + + packet_socket_setup(psock); + + return psock; +} + +void packet_socket_free(struct packet_socket *psock) +{ + if (psock->name != NULL) + free(psock->name); + + pcap_close(psock->pcap); + + memset(psock, 0, sizeof(*psock)); /* paranoia to catch bugs*/ + free(psock); +} + +int packet_socket_writev(struct packet_socket *psock, + const struct iovec *iov, int iovcnt) +{ + /* Copy the ethernet header and IP datagram into a single buffer, + * since that's all the pcap API supports. TODO: optimize this. + */ + + u8 *buf = NULL, *p = NULL; + int len = 0, i = 0; + + /* Calculate how much space we need. */ + for (i = 0; i < iovcnt; ++i) + len += iov[i].iov_len; + + buf = malloc(len); + + /* Copy into the linear buffer. 
*/ + p = buf; + for (i = 0; i < iovcnt; ++i) { + memcpy(p, iov[i].iov_base, iov[i].iov_len); + p += iov[i].iov_len; + } + + DEBUGP("calling pcap_inject with %d bytes\n", len); + + if (pcap_inject(psock->pcap, buf, len) != len) + die_pcap_perror(psock->pcap, "pcap_inject"); + + free(buf); + return STATUS_OK; +} + +int packet_socket_receive(struct packet_socket *psock, + enum direction_t direction, + struct packet *packet, int *in_bytes) +{ + int status = 0; + struct pcap_pkthdr *pkt_header = NULL; + const u8 *pkt_data = NULL; + + DEBUGP("calling pcap_next_ex()\n"); + + /* Something about the way we're doing BIOCIMMEDIATE + * causes libpcap to return 0 if there's no packet + * yet, which forces us to spin in this loop until + * there's a packet available. If, on the other hand, + * we hack libpcap itself to enable its internal + * BIOCIMMEDIATE code path that it currently only uses + * for AIX, then we don't have to spin + * here. TODO(ncardwell): fix this. + */ + while (1) { + status = pcap_next_ex(psock->pcap, &pkt_header, + &pkt_data); + if (status == 1) + break; /* got a packet */ + else if (status == 0) + return STATUS_ERR; /* no packet yet */ + else if (status == -1) + die_pcap_perror(psock->pcap, "pcap_next_ex"); + else if (status == -2) + die("pcap_next_ex: EOF in save file?!\n"); + else + die("pcap_next_ex: status: %d\n", status); + } + + DEBUGP("time: %u . 
%u\n", + (u32)pkt_header->ts.tv_sec, + (u32)pkt_header->ts.tv_usec); + +#if defined(__FreeBSD__) || defined(__NetBSD__) + packet->time_usecs = timeval_to_usecs(&pkt_header->ts); +#elif defined(__OpenBSD__) + packet->time_usecs = bpf_timeval_to_usecs(&pkt_header->ts); +#else + packet->time_usecs = implement_me("implement me for your platform"); +#endif /* defined(__OpenBSD__) */ + + DEBUGP("time_usecs= %llu\n", packet->time_usecs); + + DEBUGP("pcap_next_ex: caplen:%u len:%u offset:%d\n", + pkt_header->caplen, pkt_header->len, psock->pcap_offset); + + if (DEBUG_LOGGING) { + /* Dump a hex dump of packet sniffed by pcap. */ + char *hex = NULL; + hex_dump(pkt_data, pkt_header->caplen, &hex); + DEBUGP("pkt from pcap:\n%s\n", hex); + free(hex); + } + + if (pkt_header->caplen != pkt_header->len) { + die("libpcap unable to capture full packet: " + "caplen %u != len %u\n", + pkt_header->caplen, pkt_header->len); + } + assert(pkt_header->len <= packet->buffer_bytes); + + assert(pkt_header->len > psock->pcap_offset); + *in_bytes = pkt_header->len - psock->pcap_offset; + memcpy(packet->buffer, pkt_data + psock->pcap_offset, *in_bytes); + + return STATUS_OK; +} + +#endif /* USE_LIBPCAP */ diff --git a/test/packetdrill/packet_to_string.c b/test/packetdrill/packet_to_string.c new file mode 100644 index 0000000..1fd90b2 --- /dev/null +++ b/test/packetdrill/packet_to_string.c @@ -0,0 +1,303 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Implementation for generating human-readable representations of IP + * packets. + */ + +#include "packet_to_string.h" + +#include +#include "socket.h" +#include "tcp_options_to_string.h" + +static void endpoints_to_string(FILE *s, const struct packet *packet) +{ + char src_string[ADDR_STR_LEN]; + char dst_string[ADDR_STR_LEN]; + struct tuple tuple; + + get_packet_tuple(packet, &tuple); + + fprintf(s, "%s:%u > %s:%u", + ip_to_string(&tuple.src.ip, src_string), ntohs(tuple.src.port), + ip_to_string(&tuple.dst.ip, dst_string), ntohs(tuple.dst.port)); +} + +static void packet_buffer_to_string(FILE *s, struct packet *packet) +{ + char *hex = NULL; + hex_dump(packet->buffer, packet_end(packet) - packet->buffer, &hex); + fputc('\n', s); + fprintf(s, "%s", hex); + free(hex); +} + +static int ipv4_header_to_string(FILE *s, struct packet *packet, int layer, + enum dump_format_t format, char **error) +{ + char src_string[ADDR_STR_LEN]; + char dst_string[ADDR_STR_LEN]; + struct ip_address src_ip, dst_ip; + const struct ipv4 *ipv4 = packet->headers[layer].h.ipv4; + + ip_from_ipv4(&ipv4->src_ip, &src_ip); + ip_from_ipv4(&ipv4->dst_ip, &dst_ip); + + fprintf(s, "ipv4 %s > %s: ", + ip_to_string(&src_ip, src_string), + ip_to_string(&dst_ip, dst_string)); + + return STATUS_OK; +} + +static int ipv6_header_to_string(FILE *s, struct packet *packet, int layer, + enum dump_format_t format, char **error) +{ + char src_string[ADDR_STR_LEN]; + char dst_string[ADDR_STR_LEN]; + struct ip_address src_ip, dst_ip; + const struct ipv6 *ipv6 = packet->headers[layer].h.ipv6; + + ip_from_ipv6(&ipv6->src_ip, &src_ip); + ip_from_ipv6(&ipv6->dst_ip, &dst_ip); + + fprintf(s, "ipv6 %s > %s: ", + ip_to_string(&src_ip, 
src_string), + ip_to_string(&dst_ip, dst_string)); + + return STATUS_OK; +} + +static int gre_header_to_string(FILE *s, struct packet *packet, int layer, + enum dump_format_t format, char **error) +{ + const struct gre *gre = packet->headers[layer].h.gre; + int i = 0; + + fprintf(s, "gre flags 0x%x proto 0x%04x", + ntohs(gre->flags), + ntohs(gre->proto)); + + if (gre->has_checksum || gre->has_routing) { + fprintf(s, " sum 0x%x off 0x%x", + ntohs(gre->be16[0]), + ntohs(gre->be16[1])); + i++; + } + + if (gre->has_key) { + fprintf(s, " key 0x%x", ntohl(gre->be32[i])); + i++; + } + + if (gre->has_seq) { + fprintf(s, " seq 0x%x", ntohl(gre->be32[i])); + i++; + } + + fprintf(s, ": "); + return STATUS_OK; +} + +static int mpls_header_to_string(FILE *s, struct packet *packet, int layer, + enum dump_format_t format, char **error) +{ + struct header *header = &packet->headers[layer]; + int num_entries = header->header_bytes / sizeof(struct mpls); + int i = 0; + + fprintf(s, "mpls"); + + for (i = 0; i < num_entries; ++i) { + const struct mpls *mpls = header->h.mpls + i; + + fprintf(s, " (label %u, tc %u,%s ttl %u)", + mpls_entry_label(mpls), + mpls_entry_tc(mpls), + mpls_entry_stack(mpls) ? " [S]," : "", + mpls_entry_ttl(mpls)); + } + + fprintf(s, ": "); + return STATUS_OK; +} + +/* Print a string representation of the TCP packet: + * direction opt_ip_info flags seq ack window tcp_options + */ +static int tcp_packet_to_string(FILE *s, struct packet *packet, + enum dump_format_t format, char **error) +{ + int result = STATUS_OK; /* return value */ + + if ((format == DUMP_FULL) || (format == DUMP_VERBOSE)) { + endpoints_to_string(s, packet); + fputc(' ', s); + } + + + /* We print flags in the same order as tcpdump 4.1.1. 
*/ + if (packet->tcp->fin) + fputc('F', s); + if (packet->tcp->syn) + fputc('S', s); + if (packet->tcp->rst) + fputc('R', s); + if (packet->tcp->psh) + fputc('P', s); + if (packet->tcp->ack) + fputc('.', s); + if (packet->tcp->urg) + fputc('U', s); + if (packet->tcp->ece) + fputc('E', s); /* ECN *E*cho sent (ECN) */ + if (packet->tcp->cwr) + fputc('W', s); /* Congestion *W*indow reduced (ECN) */ + + fprintf(s, " %u:%u(%u) ", + ntohl(packet->tcp->seq), + ntohl(packet->tcp->seq) + packet_payload_len(packet), + packet_payload_len(packet)); + + if (packet->tcp->ack) + fprintf(s, "ack %u ", ntohl(packet->tcp->ack_seq)); + + if (!(packet->flags & FLAG_WIN_NOCHECK)) + fprintf(s, "win %u ", ntohs(packet->tcp->window)); + + if (packet_tcp_options_len(packet) > 0) { + char *tcp_options = NULL; + if (tcp_options_to_string(packet, &tcp_options, error)) + result = STATUS_ERR; + else + fprintf(s, "<%s>", tcp_options); + free(tcp_options); + } + + if (format == DUMP_VERBOSE) + packet_buffer_to_string(s, packet); + + return result; +} + +static int udp_packet_to_string(FILE *s, struct packet *packet, + enum dump_format_t format, char **error) +{ + int result = STATUS_OK; /* return value */ + + if ((format == DUMP_FULL) || (format == DUMP_VERBOSE)) { + endpoints_to_string(s, packet); + fputc(' ', s); + } + + fprintf(s, "udp (%u)", packet_payload_len(packet)); + + if (format == DUMP_VERBOSE) + packet_buffer_to_string(s, packet); + + return result; +} + +static int icmpv4_packet_to_string(FILE *s, struct packet *packet, + enum dump_format_t format, char **error) +{ + fprintf(s, "icmpv4"); + /* TODO(ncardwell): print type, code; use tables from icmp_packet.c */ + return STATUS_OK; +} + +static int icmpv6_packet_to_string(FILE *s, struct packet *packet, + enum dump_format_t format, char **error) +{ + fprintf(s, "icmpv6"); + /* TODO(ncardwell): print type, code; use tables from icmp_packet.c */ + return STATUS_OK; +} + +typedef int (*header_to_string_func)(FILE *s, struct packet 
*packet, int layer, + enum dump_format_t format, char **error); + +static int encap_header_to_string(FILE *s, struct packet *packet, int layer, + enum dump_format_t format, char **error) +{ + header_to_string_func printers[HEADER_NUM_TYPES] = { + [HEADER_IPV4] = ipv4_header_to_string, + [HEADER_IPV6] = ipv6_header_to_string, + [HEADER_GRE] = gre_header_to_string, + [HEADER_MPLS] = mpls_header_to_string, + }; + header_to_string_func printer = NULL; + enum header_t type = packet->headers[layer].type; + + assert(type > HEADER_NONE); + assert(type < HEADER_NUM_TYPES); + printer = printers[type]; + assert(printer != NULL); + return printer(s, packet, layer, format, error); +} + + +int packet_to_string(struct packet *packet, + enum dump_format_t format, + char **ascii_string, char **error) +{ + assert(packet != NULL); + int result = STATUS_ERR; /* return value */ + size_t size = 0; + FILE *s = open_memstream(ascii_string, &size); /* output string */ + int i; + int header_count = packet_header_count(packet); + + /* Print any encapsulation headers preceding layer 3 and 4 headers. 
*/ + for (i = 0; i < header_count - 2; ++i) { + if (packet->headers[i].type == HEADER_NONE) + break; + if (encap_header_to_string(s, packet, i, format, error)) + goto out; + } + + if ((packet->ipv4 == NULL) && (packet->ipv6 == NULL)) { + fprintf(s, "[NO IP HEADER]"); + } else { + if (packet->tcp != NULL) { + if (tcp_packet_to_string(s, packet, format, error)) + goto out; + } else if (packet->udp != NULL) { + if (udp_packet_to_string(s, packet, format, error)) + goto out; + } else if (packet->icmpv4 != NULL) { + if (icmpv4_packet_to_string(s, packet, format, error)) + goto out; + } else if (packet->icmpv6 != NULL) { + if (icmpv6_packet_to_string(s, packet, format, error)) + goto out; + } else { + fprintf(s, "[NO TCP OR ICMP HEADER]"); + } + } + + result = STATUS_OK; + +out: + fclose(s); + return result; +} diff --git a/test/packetdrill/packet_to_string.h b/test/packetdrill/packet_to_string.h new file mode 100644 index 0000000..462a4f9 --- /dev/null +++ b/test/packetdrill/packet_to_string.h @@ -0,0 +1,44 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Interface for generating human-readable representations of IP packets. 
+ */ + +#ifndef __PACKET_TO_STRING_H__ +#define __PACKET_TO_STRING_H__ + +#include "packet.h" + +enum dump_format_t { + DUMP_SHORT, /* brief format used in scripts */ + DUMP_FULL, /* add local and remote address and port */ + DUMP_VERBOSE, /* add hex dump */ +}; + +/* Returns in *ascii_string a human-readable representation of the + * packet 'packet'. Returns STATUS_OK on success; on failure returns + * STATUS_ERR and sets error message. + */ +extern int packet_to_string(struct packet *packet, + enum dump_format_t format, + char **ascii_string, char **error); + +#endif /* __PACKET_TO_STRING_H__ */ diff --git a/test/packetdrill/packet_to_string_test.c b/test/packetdrill/packet_to_string_test.c new file mode 100644 index 0000000..814cddb --- /dev/null +++ b/test/packetdrill/packet_to_string_test.c @@ -0,0 +1,301 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Test for generating human-readable representations of IP packets. + */ + +#include "packet_to_string.h" + +#include +#include +#include "assert.h" +#include "packet_parser.h" + +static void test_tcp_ipv4_packet_to_string(void) +{ + /* An IPv4/GRE/IPv4/TCP packet. 
*/ + u8 data[] = { + /* IPv4: */ + 0x45, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x00, + 0xff, 0x2f, 0xb5, 0x75, 0x02, 0x02, 0x02, 0x02, + 0x01, 0x01, 0x01, 0x01, + /* GRE: */ + 0x00, 0x00, 0x08, 0x00, + /* IPv4, TCP: */ + 0x45, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x00, + 0xff, 0x06, 0x39, 0x11, 0xc0, 0x00, 0x02, 0x01, + 0xc0, 0xa8, 0x00, 0x01, 0xcf, 0x3f, 0x1f, 0x90, + 0x00, 0x00, 0x00, 0x01, 0x83, 0x4d, 0xa5, 0x5b, + 0xa0, 0x10, 0x01, 0x01, 0xdb, 0x2d, 0x00, 0x00, + 0x05, 0x0a, 0x83, 0x4d, 0xab, 0x03, 0x83, 0x4d, + 0xb0, 0xab, 0x08, 0x0a, 0x00, 0x00, 0x01, 0x2c, + 0x60, 0xc2, 0x18, 0x20 + }; + + struct packet *packet = packet_new(sizeof(data)); + + /* Populate and parse a packet */ + memcpy(packet->buffer, data, sizeof(data)); + char *error = NULL; + enum packet_parse_result_t result = + parse_packet(packet, sizeof(data), PACKET_LAYER_3_IP, + &error); + assert(result == PACKET_OK); + assert(error == NULL); + + int status = 0; + char *dump = NULL, *expected = NULL; + + /* Test a DUMP_SHORT dump */ + status = packet_to_string(packet, DUMP_SHORT, &dump, &error); + assert(status == STATUS_OK); + assert(error == NULL); + printf("dump = '%s'\n", dump); + expected = + "ipv4 2.2.2.2 > 1.1.1.1: gre flags 0x0 proto 0x0800: " + ". 1:1(0) ack 2202903899 win 257 " + ""; + assert(strcmp(dump, expected) == 0); + free(dump); + + /* Test a DUMP_FULL dump */ + status = packet_to_string(packet, DUMP_FULL, &dump, &error); + assert(status == STATUS_OK); + assert(error == NULL); + printf("dump = '%s'\n", dump); + expected = + "ipv4 2.2.2.2 > 1.1.1.1: gre flags 0x0 proto 0x0800: " + "192.0.2.1:53055 > 192.168.0.1:8080 " + ". 
1:1(0) ack 2202903899 win 257 " + ""; + assert(strcmp(dump, expected) == 0); + free(dump); + + /* Test a DUMP_VERBOSE dump */ + status = packet_to_string(packet, DUMP_VERBOSE, &dump, &error); + assert(status == STATUS_OK); + assert(error == NULL); + printf("dump = '%s'\n", dump); + expected = + "ipv4 2.2.2.2 > 1.1.1.1: gre flags 0x0 proto 0x0800: " + "192.0.2.1:53055 > 192.168.0.1:8080 " + ". 1:1(0) ack 2202903899 win 257 " + "" + "\n" + "0x0000: 45 00 00 54 00 00 00 00 ff 2f b5 75 02 02 02 02 " "\n" + "0x0010: 01 01 01 01 00 00 08 00 45 00 00 3c 00 00 00 00 " "\n" + "0x0020: ff 06 39 11 c0 00 02 01 c0 a8 00 01 cf 3f 1f 90 " "\n" + "0x0030: 00 00 00 01 83 4d a5 5b a0 10 01 01 db 2d 00 00 " "\n" + "0x0040: 05 0a 83 4d ab 03 83 4d b0 ab 08 0a 00 00 01 2c " "\n" + "0x0050: 60 c2 18 20 " "\n"; + assert(strcmp(dump, expected) == 0); + free(dump); + + packet_free(packet); +} + +static void test_tcp_ipv6_packet_to_string(void) +{ + /* An IPv6/GRE/TCP/IPv6 packet. */ + u8 data[] = { + /* IPv6: */ + 0x60, 0x00, 0x00, 0x00, 0x00, 0x4c, 0x2f, 0xff, + 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x22, 0x22, + 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x11, + /* GRE: */ + 0x00, 0x00, 0x86, 0xdd, + /* IPv6, TCP: */ + 0x60, 0x00, 0x00, 0x00, 0x00, 0x20, 0x06, 0xff, + 0x20, 0x01, 0x0d, 0xb8, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, + 0xfd, 0x3d, 0xfa, 0x7b, 0xd1, 0x7d, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, + 0xd3, 0xe2, 0x1f, 0x90, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x80, 0x02, 0x80, 0x18, + 0x06, 0x60, 0x00, 0x00, 0x02, 0x04, 0x03, 0xe8, + 0x04, 0x02, 0x01, 0x01, 0x01, 0x03, 0x03, 0x07, + }; + + struct packet *packet = packet_new(sizeof(data)); + + /* Populate and parse a packet */ + memcpy(packet->buffer, data, sizeof(data)); + char *error = NULL; + enum packet_parse_result_t result = + parse_packet(packet, 
sizeof(data), PACKET_LAYER_3_IP, + &error); + assert(result == PACKET_OK); + assert(error == NULL); + + int status = 0; + char *dump = NULL, *expected = NULL; + + /* Test a DUMP_SHORT dump */ + status = packet_to_string(packet, DUMP_SHORT, &dump, &error); + assert(status == STATUS_OK); + assert(error == NULL); + printf("dump = '%s'\n", dump); + expected = + "ipv6 2::2222 > 1::1111: gre flags 0x0 proto 0x86dd: " + "S 0:0(0) win 32792 "; + assert(strcmp(dump, expected) == 0); + free(dump); + + /* Test a DUMP_FULL dump */ + status = packet_to_string(packet, DUMP_FULL, &dump, &error); + assert(status == STATUS_OK); + assert(error == NULL); + printf("dump = '%s'\n", dump); + expected = + "ipv6 2::2222 > 1::1111: gre flags 0x0 proto 0x86dd: " + "2001:db8::1:54242 > fd3d:fa7b:d17d::1:8080 " + "S 0:0(0) win 32792 "; + assert(strcmp(dump, expected) == 0); + free(dump); + + /* Test a DUMP_VERBOSE dump */ + status = packet_to_string(packet, DUMP_VERBOSE, &dump, &error); + assert(status == STATUS_OK); + assert(error == NULL); + printf("dump = '%s'\n", dump); + expected = + "ipv6 2::2222 > 1::1111: gre flags 0x0 proto 0x86dd: " + "2001:db8::1:54242 > fd3d:fa7b:d17d::1:8080 " + "S 0:0(0) win 32792 \n" + "0x0000: 60 00 00 00 00 4c 2f ff 00 02 00 00 00 00 00 00 " "\n" + "0x0010: 00 00 00 00 00 00 22 22 00 01 00 00 00 00 00 00 " "\n" + "0x0020: 00 00 00 00 00 00 11 11 00 00 86 dd 60 00 00 00 " "\n" + "0x0030: 00 20 06 ff 20 01 0d b8 00 00 00 00 00 00 00 00 " "\n" + "0x0040: 00 00 00 01 fd 3d fa 7b d1 7d 00 00 00 00 00 00 " "\n" + "0x0050: 00 00 00 01 d3 e2 1f 90 00 00 00 00 00 00 00 00 " "\n" + "0x0060: 80 02 80 18 06 60 00 00 02 04 03 e8 04 02 01 01 " "\n" + "0x0070: 01 03 03 07 " "\n"; + assert(strcmp(dump, expected) == 0); + free(dump); + + packet_free(packet); +} + +static void test_gre_mpls_tcp_ipv4_packet_to_string(void) +{ + /* An IPv4/GRE/MPLS/IPv4/TCP packet. 
*/ + u8 data[] = { + /* IPv4: */ + 0x45, 0x00, 0x00, 0x54, 0x00, 0x00, 0x40, 0x00, + 0x40, 0x2f, 0xb7, 0xcf, 0xc0, 0xa8, 0x00, 0x01, + 0xc0, 0x00, 0x02, 0x02, + /* GRE: */ + 0x00, 0x00, 0x88, 0x47, + /* MPLS: */ + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, + /* IPv4, TCP: */ + 0x45, 0x00, 0x00, 0x34, 0x86, 0x99, 0x40, 0x00, + 0x40, 0x06, 0x31, 0x80, 0xc0, 0xa8, 0x00, 0x01, + 0xc0, 0x00, 0x02, 0x01, 0x1f, 0x90, 0xdb, 0xcc, + 0x7b, 0x81, 0xc5, 0x7c, 0x00, 0x00, 0x00, 0x01, + 0x80, 0x11, 0x01, 0xc5, 0xa6, 0xa6, 0x00, 0x00, + 0x01, 0x01, 0x08, 0x0a, 0x07, 0x02, 0x08, 0x43, + 0x00, 0x00, 0x00, 0x05 + }; + + struct packet *packet = packet_new(sizeof(data)); + + /* Populate and parse a packet */ + memcpy(packet->buffer, data, sizeof(data)); + char *error = NULL; + enum packet_parse_result_t result = + parse_packet(packet, sizeof(data), PACKET_LAYER_3_IP, + &error); + assert(result == PACKET_OK); + assert(error == NULL); + + int status = 0; + char *dump = NULL, *expected = NULL; + + /* Test a DUMP_FULL dump */ + status = packet_to_string(packet, DUMP_FULL, &dump, &error); + assert(status == STATUS_OK); + assert(error == NULL); + printf("dump = '%s'\n", dump); + expected = + "ipv4 192.168.0.1 > 192.0.2.2: gre flags 0x0 proto 0x8847: " + "mpls (label 0, tc 0, ttl 0) " + "(label 1048575, tc 7, [S], ttl 255): " + "192.168.0.1:8080 > 192.0.2.1:56268 " + "F. 2072102268:2072102268(0) ack 1 win 453 " + ""; + assert(strcmp(dump, expected) == 0); + free(dump); +} + +static void test_tcp_md5_option_to_string(void) +{ + /* An IPv4/TCP packet. 
*/ + u8 data[] = { + /* IPv4, TCP: */ + 0x45, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x00, + 0xff, 0x06, 0x89, 0x56, 0xc0, 0x00, 0x02, 0x01, + 0xc0, 0xa8, 0xaf, 0xbb, 0x8a, 0x6f, 0x1f, 0x90, + 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, + 0xa0, 0x02, 0x01, 0x00, 0x36, 0x14, 0x00, 0x00, + 0x13, 0x12, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, + 0x0e, 0x0f, 0x01, 0x01 + }; + + struct packet *packet = packet_new(sizeof(data)); + + /* Populate and parse a packet */ + memcpy(packet->buffer, data, sizeof(data)); + char *error = NULL; + enum packet_parse_result_t result = + parse_packet(packet, sizeof(data), PACKET_LAYER_3_IP, + &error); + assert(result == PACKET_OK); + assert(error == NULL); + + int status = 0; + char *dump = NULL, *expected = NULL; + + /* Test a DUMP_SHORT dump */ + status = packet_to_string(packet, DUMP_SHORT, &dump, &error); + assert(status == STATUS_OK); + assert(error == NULL); + printf("dump = '%s'\n", dump); + expected = + "S 1:1(0) win 256 " + ""; + assert(strcmp(dump, expected) == 0); + free(dump); + + packet_free(packet); +} + +int main(void) +{ + test_tcp_ipv4_packet_to_string(); + test_tcp_ipv6_packet_to_string(); + test_gre_mpls_tcp_ipv4_packet_to_string(); + test_tcp_md5_option_to_string(); + return 0; +} diff --git a/test/packetdrill/packetdrill.c b/test/packetdrill/packetdrill.c new file mode 100644 index 0000000..4afa038 --- /dev/null +++ b/test/packetdrill/packetdrill.c @@ -0,0 +1,113 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * This is the main() for the packetdrill TCP testing tool. + */ + +#include "types.h" + +#include +#include +#include +#include +#include +#include "assert.h" +#include "config.h" +#include "parse.h" +#include "run.h" +#include "script.h" +#include "system.h" +#include "wire_server.h" + +static void run_init_scripts(struct config *config) +{ + char *cp1, *cp2, *scripts, *error; + + if (config->init_scripts == NULL) + return; + + cp1 = scripts = strdup(config->init_scripts); + while (*cp1 != 0) { + cp2 = strstr(cp1, ","); + if (cp2 != NULL) + *cp2 = 0; + if (safe_system(cp1, &error)) { + die("%s: error executing init script '%s': %s\n", + config->script_path, cp1, error); + } + if (cp2 == NULL) + break; + else + cp1 = cp2 + 1; + } + free(scripts); +} + +int main(int argc, char *argv[]) +{ + struct config config; + set_default_config(&config); + /* Get command line options and list of test scripts. */ + char **arg = parse_command_line_options(argc, argv, &config); + + /* If we're running as a server, just listen for connections forever. */ + if (config.is_wire_server) { + if (*arg != NULL) { + fprintf(stderr, + "error: do not pass script paths to " + "the wire server on command line\n"); + show_usage(); + exit(EXIT_FAILURE); + } + + run_wire_server(&config); + return 0; + } + + /* Ensure that there is at least one script path, to avoid + * confusion between the lack of output caused by "all tests + * passing" and "no tests listed on command line". + */ + if (*arg == NULL) { + fprintf(stderr, "error: missing script path\n"); + show_usage(); + exit(EXIT_FAILURE); + } + + /* Parse and run each script on the command line. 
*/ + for (; *arg != NULL; ++arg) { + struct script script; + const char *script_path = *arg; + + if (parse_script_and_set_config(argc, argv, &config, &script, + script_path, NULL)) + exit(EXIT_FAILURE); + + /* If --dry_run, then don't actually execute the script. */ + if (config.dry_run) + continue; + + run_init_scripts(&config); + run_script(&config, &script); + } + + return 0; +} diff --git a/test/packetdrill/packetdrill.h b/test/packetdrill/packetdrill.h new file mode 100644 index 0000000..2fb6b58 --- /dev/null +++ b/test/packetdrill/packetdrill.h @@ -0,0 +1,108 @@ +/* + * Copyright 2015 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: xiaoj@google.com (Xiao Jia) + * + * Interface for packetdrill. + * + * To be tested against as a shared object (*.so) file, implement this + * interface, export a function "packetdrill_interface_init", and + * initialize the interface struct passed in with your own functions. 
+ */ + +#ifndef __PACKETDRILL_H__ +#define __PACKETDRILL_H__ + +#include +#include +#include +#include +#include +#include +#include + +struct packetdrill_interface { + void *userdata; + void (*free)(void *userdata); + int (*socket)(void *userdata, int domain, int type, int protocol); + int (*bind)(void *userdata, int sockfd, const struct sockaddr *addr, + socklen_t addrlen); + int (*listen)(void *userdata, int sockfd, int backlog); + int (*accept)(void *userdata, int sockfd, struct sockaddr *addr, + socklen_t *addrlen); + int (*connect)(void *userdata, int sockfd, const struct sockaddr *addr, + socklen_t addrlen); + ssize_t (*read)(void *userdata, int fd, void *buf, size_t count); + ssize_t (*readv)(void *userdata, int fd, const struct iovec *iov, + int iovcnt); + ssize_t (*recv)(void *userdata, int sockfd, void *buf, size_t len, + int flags); + ssize_t (*recvfrom)(void *userdata, int sockfd, void *buf, size_t len, + int flags, struct sockaddr *src_addr, + socklen_t *addrlen); + ssize_t (*recvmsg)(void *userdata, int sockfd, struct msghdr *msg, + int flags); + ssize_t (*write)(void *userdata, int fd, const void *buf, size_t count); + ssize_t (*writev)(void *userdata, int fd, const struct iovec *iov, + int iovcnt); + ssize_t (*send)(void *userdata, int sockfd, const void *buf, size_t len, + int flags); + ssize_t (*sendto)(void *userdata, int sockfd, const void *buf, + size_t len, int flags, + const struct sockaddr *dest_addr, socklen_t addrlen); + ssize_t (*sendmsg)(void *userdata, int sockfd, const struct msghdr *msg, + int flags); + int (*fcntl)(void *userdata, int fd, int cmd, ...); + int (*ioctl)(void *userdata, int fd, unsigned long request, ...); + int (*close)(void *userdata, int fd); + int (*shutdown)(void *userdata, int sockfd, int how); + int (*getsockopt)(void *userdata, int sockfd, int level, int optname, + void *optval, socklen_t *optlen); + int (*setsockopt)(void *userdata, int sockfd, int level, int optname, + const void *optval, socklen_t optlen); + 
int (*poll)(void *userdata, struct pollfd *fds, nfds_t nfds, + int timeout); + /* Send @count bytes of data starting from @buf to the TCP stack. + * Return 0 on success or -1 on error. */ + int (*netdev_send)(void *userdata, const void *buf, size_t count); + /* Sniff the next packet leaving the TCP stack. + * Put packet data in @buf. @count is passed in as the buffer size. + * The actual number of bytes received should be put in @count. + * Set @count to 0 if received nothing. + * Set @time_usecs to the receive timestamp. + * Return 0 on success or -1 on error. */ + int (*netdev_receive)(void *userdata, void *buf, size_t *count, + long long *time_usecs); + int (*usleep)(void *userdata, useconds_t usec); + int (*gettimeofday)(void *userdata, struct timeval *tv, + struct timezone *tz); + int (*epoll_create)(void *userdata, int size); + int (*epoll_ctl)(void *userdata, int epfd, int op, int fd, + struct epoll_event *event); + int (*epoll_wait)(void *userdata, int epfd, struct epoll_event *events, + int maxevents, int timeout); + int (*pipe)(void *userdata, int pipefd[2]); + int (*splice)(void *userdata, int fd_in, loff_t *off_in, int fd_out, + loff_t *off_out, size_t len, unsigned int flags); +}; + +typedef void (*packetdrill_interface_init_t)(const char *flags, + struct packetdrill_interface *); + +#endif /* __PACKETDRILL_H__ */ diff --git a/test/packetdrill/parse.h b/test/packetdrill/parse.h new file mode 100644 index 0000000..3ac8eae --- /dev/null +++ b/test/packetdrill/parse.h @@ -0,0 +1,62 @@ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * Interface for a module to parse test scripts. + */ + +#ifndef __PARSER_H__ +#define __PARSER_H__ + +#include "types.h" + +#include "assert.h" +#include "config.h" +#include "script.h" + +/* Copy the script contents into our single linear buffer. */ +extern void copy_script(const char *script_buffer, + struct script *script); + +/* Read the script file into a single linear buffer. */ +extern void read_script(const char *script_path, + struct script *script); + +/* The public, top-level call to parse a test script. It first parses the + * internal linear script buffer and then fills in the + * 'script' object with the internal representation of the + * script. Uses the given 'config' object to look up configuration + * info needed during parsing (such as whether packets are IPv4 or + * IPv6). Passes the given 'callback_invocation' when calling back to + * parse_and_finalize_config() after parsing all in-script + * options. + * + * Returns STATUS_OK on success; on failure returns STATUS_ERR. The + * implementation for this function is in the bison parser file + * parser.y. + */ +extern int parse_script(struct config *config, + struct script *script, + struct invocation *callback_invocation); + +/* Config for lexing and parsing. 
*/ +extern struct config *in_config; + +#endif /* __PARSER_H__ */ diff --git a/test/packetdrill/parser.y b/test/packetdrill/parser.y new file mode 100644 index 0000000..70219bd --- /dev/null +++ b/test/packetdrill/parser.y @@ -0,0 +1,1739 @@ +%{ +/* + * Copyright 2013 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: ncardwell@google.com (Neal Cardwell) + * + * This is the parser for the packetdrill script language. It is + * processed by the bison parser generator. + * + * For full documentation see: http://www.gnu.org/software/bison/manual/ + * + * Here is a quick and dirty tutorial on bison: + * + * A bison parser specification is basically a BNF grammar for the + * language you are parsing. Each rule specifies a nonterminal symbol + * on the left-hand side and a sequence of terminal symbols (lexical + * tokens) and/or nonterminal symbols on the right-hand side that can + * "reduce" to the symbol on the left hand side. When the parser sees + * the sequence of symbols on the right where it "wants" to see a + * nonterminal on the left, the rule fires, executing the semantic + * action code in curly {} braces as it reduces the right hand side to + * the left hand side.
+ * + * The semantic action code for a rule produces an output, which it + * can reference using the $$ token. The set of possible types + * returned in output expressions is given in the %union section of + * the .y file. The specific type of the output for a terminal or + * nonterminal symbol (corresponding to a field in the %union) is + * given by the %type directive in the .y file. The action code can + * access the outputs of the symbols on the right hand side by using + * the notation $1 for the first symbol, $2 for the second symbol, and + * so on. + * + * The lexer (generated by flex from lexer.l) feeds a stream of + * terminal symbols up to this parser. Parser semantic actions can + * access the lexer output for a terminal symbol with the same + * notation they use for nonterminals. + * + * Here's an example rule with its semantic action in {} braces: + * + * tcp_option + * ... + * | MSS INTEGER { + * $$ = tcp_option_new(...); + * ... + * $$->data.mss.bytes = htons($2); + * } + * + * This rule basically says: + * + * When the parser wants to see a tcp_option, if it sees an MSS from + * the lexer followed by an INTEGER from the lexer then run the + * action code that (a) stores in the output $$ a pointer to a + * struct tcp_option object, and then (b) stores in that object the + * value of the INTEGER token (accessed with $2). + * + */ + +/* The first part of the .y file consists of C code that bison copies + * directly into the top of the .c file it generates. 
+ */ + +#include "types.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "gre_packet.h" +#include "ip.h" +#include "ip_packet.h" +#include "icmp_packet.h" +#include "logging.h" +#include "mpls.h" +#include "mpls_packet.h" +#include "tcp_packet.h" +#include "udp_packet.h" +#include "parse.h" +#include "script.h" +#include "tcp.h" +#include "tcp_options.h" + +/* This include of the bison-generated .h file must go last so that we + * can first include all of the declarations on which it depends. + */ +#include "parser.h" + +/* Change this YYDEBUG to 1 to get verbose debug output for parsing: */ +#define YYDEBUG 0 +#if YYDEBUG +extern int yydebug; +#endif + +extern FILE *yyin; +extern int yylineno; +extern char *yytext; +extern int yylex(void); +extern int yyparse(void); +extern int yywrap(void); +extern const char *cleanup_cmd; + +/* This mutex guards all parser global variables declared in this file. */ +pthread_mutex_t parser_mutex = PTHREAD_MUTEX_INITIALIZER; + +/* The input to the parser: the path name of the script file to parse. */ +static const char* current_script_path = NULL; + +/* The starting line number of the input script statement that we're + * currently parsing. This may be different than yylineno if bison had + * to look ahead and lexically scan a token on the following line to + * decide that the current statement is done. + */ +static int current_script_line = -1; + +/* + * We uses this object to look up configuration info needed during + * parsing (such as whether packets are IPv4 or IPv6). + */ +struct config *in_config = NULL; + +/* The output of the parser: an output script containing + * 1) a linked list of options + * 2) a linked list of events + */ +static struct script *out_script = NULL; + +/* The test invocation to pass back to parse_and_finalize_config(). */ +struct invocation *invocation; + +/* Copy the script contents into our single linear buffer. 
*/
+void copy_script(const char *script_buffer, struct script *script)
+{
+	DEBUGP("copy_script\n");
+
+	/* Release the text the script held before, then keep a private
+	 * heap copy of the caller's string along with its length.
+	 */
+	free(script->buffer);
+	script->length = strlen(script_buffer);
+	script->buffer = strdup(script_buffer);
+	assert(script->buffer != NULL);
+
+	DEBUGP("copy_script: %d bytes\n", script->length);
+}
+
+/* Load the file at 'script_path' into script->buffer and record the
+ * number of bytes read in script->length. Nothing is read when the
+ * script already has a buffer. stat(), open() and read() failures are
+ * reported through die(); a close() failure through die_perror().
+ */
+void read_script(const char *script_path, struct script *script)
+{
+	int capacity = 0;	/* bytes allocated on the latest attempt */
+
+	DEBUGP("read_script(%s)\n", script_path);
+
+	/* Each pass sizes a buffer from stat() and reads the file once;
+	 * a pass that fills the buffer completely is thrown away and
+	 * repeated with a bigger buffer.
+	 */
+	while (!script->buffer) {
+		struct stat st;
+		int file_bytes;
+		int fd;
+
+		if (stat(script_path, &st))
+			die("parse error: stat() of script file '%s': %s\n",
+			    script_path, strerror(errno));
+
+		/* Ask for one byte more than the largest size seen so
+		 * far, so that a full buffer reveals the file grew.
+		 */
+		file_bytes = (int)st.st_size;
+		if (file_bytes > capacity)
+			capacity = file_bytes;
+		capacity += 1;
+
+		script->buffer = malloc(capacity);
+		assert(script->buffer != NULL);
+
+		fd = open(script_path, O_RDONLY);
+		if (fd < 0)
+			die("parse error opening script file '%s': %s\n",
+			    script_path, strerror(errno));
+
+		script->length = read(fd, script->buffer, capacity);
+		if (script->length < 0)
+			die("parse error reading script file '%s': %s\n",
+			    script_path, strerror(errno));
+
+		if (script->length == capacity) {
+			/* The buffer filled up, so the file was probably
+			 * appended to after stat(); discard this attempt.
+			 */
+			free(script->buffer);
+			script->buffer = NULL;
+			script->length = 0;
+		}
+
+		if (close(fd) != 0)
+			die_perror("close");
+	}
+
+	DEBUGP("read_script: %d bytes\n", script->length);
+}
+
+
+/* The public entry point for the script parser. Parses the script
+ * text held in the script's linear buffer and fills in the script
+ * object with the parsed representation.
+ *
+ * The text that gets parsed is the in-memory copy in script->buffer
+ * (script->length bytes), opened as the yyin stream with fmemopen().
+ * While yyparse() runs, 'config', 'script' and 'callback_invocation'
+ * are published through the file-scope variables in_config,
+ * out_script and invocation. Returns STATUS_OK if yyparse() returned
+ * 0 and STATUS_ERR otherwise.
+ */
+int parse_script(struct config *config,
+		 struct script *script,
+		 struct invocation *callback_invocation)
+{
+	/* This bison-generated parser is not multi-thread safe, so we
+	 * have a lock to prevent more than one thread using the
+	 * parser at the same time. This is useful in the wire server
+	 * context, where in general we may have more than one test
+	 * thread running at the same time.
+	 *
+	 * NOTE(review): pthread_mutex_lock()/pthread_mutex_unlock()
+	 * report failure through their return value; confirm that
+	 * die_perror() does not depend on errno for these two calls.
+	 */
+	if (pthread_mutex_lock(&parser_mutex) != 0)
+		die_perror("pthread_mutex_lock");
+
+#if YYDEBUG
+	yydebug = 1;
+#endif
+
+	/* Now parse the script from our buffer. */
+	yyin = fmemopen(script->buffer, script->length, "r");
+	if (yyin == NULL)
+		die_perror("fmemopen: parse error opening script buffer");
+
+	current_script_path = config->script_path;	/* path printed in error messages */
+	in_config = config;
+	out_script = script;
+	invocation = callback_invocation;
+
+	/* We have to reset the line number here since the wire server
+	 * can do more than one yyparse().
+	 */
+	yylineno = 1;
+
+	int result = yyparse();	/* invoke bison-generated parser */
+	current_script_path = NULL;	/* cleared once parsing has finished */
+
+	if (fclose(yyin))
+		die_perror("fclose: error closing script buffer");
+
+	/* Unlock parser. */
+	if (pthread_mutex_unlock(&parser_mutex) != 0)
+		die_perror("pthread_mutex_unlock");
+
+	return result ? STATUS_ERR : STATUS_OK;
+}
+
+/* Bison emits code to call this method when there's a parse-time error.
+ * We print the line number and the error message.
+ *
+ * The message goes to stderr, prefixed with current_script_path and
+ * yylineno, and quotes the current lexer text (yytext).
+ */
+static void yyerror(const char *message)
+{
+	fprintf(stderr, "%s:%d: parse error at '%s': %s\n",
+		current_script_path, yylineno, yytext, message);
+}
+
+/* After we finish parsing each line of a script, we analyze the
+ * semantics of the line. If we encounter an error then we print the
+ * error message to stderr and exit with an error.
+ */ +static void semantic_error(const char* message) +{ + assert(current_script_line >= 0); + die("%s:%d: semantic error: %s\n", + current_script_path, current_script_line, message); +} + +/* This standard callback is invoked by flex when it encounters + * the end of a file. We return 1 to tell flex to return EOF. + */ +int yywrap(void) +{ + return 1; +} + +/* Create and initalize a new expression. */ +static struct expression *new_expression(enum expression_t type) +{ + struct expression *expression = calloc(1, sizeof(struct expression)); + expression->type = type; + return expression; +} + +/* Create and initalize a new integer expression with the given + * literal value and format string. + */ +static struct expression *new_integer_expression(s64 num, const char *format) +{ + struct expression *expression = new_expression(EXPR_INTEGER); + expression->value.num = num; + expression->format = format; + return expression; +} + +/* Create and initalize a new one-element expression_list. */ +static struct expression_list *new_expression_list( + struct expression *expression) +{ + struct expression_list *list; + list = calloc(1, sizeof(struct expression_list)); + list->expression = expression; + list->next = NULL; + return list; +} + +/* Add the expression to the end of the list. */ +static void expression_list_append(struct expression_list *list, + struct expression *expression) +{ + while (list->next != NULL) { + list = list->next; + } + list->next = new_expression_list(expression); +} + +/* Create and initialize a new option. */ +static struct option_list *new_option(char *name, char *value) +{ + struct option_list *opt = calloc(1, sizeof(struct option_list)); + opt->name = name; + opt->value = value; + return opt; +} + +/* Create and initialize a new event. 
*/
static struct event *new_event(enum event_t type)
{
	struct event *e = calloc(1, sizeof(*e));

	if (e == NULL)
		die("parse error: out of memory\n");
	e->type = type;
	e->time_usecs_end = NO_TIME_RANGE;
	e->offset_usecs = NO_TIME_RANGE;
	return e;
}

/* Converts the two hex digits at hex[0] and hex[1] into one byte
 * stored in '*byte'.  Returns STATUS_OK on success, or STATUS_ERR
 * (leaving '*byte' untouched) if either character is not a hex digit.
 */
static int parse_hex_byte(const char *hex, u8 *byte)
{
	/* The <ctype.h> functions are only defined for values
	 * representable as unsigned char (or EOF), so convert before
	 * the call; a plain char may be negative.
	 */
	if (!isxdigit((unsigned char)hex[0]) ||
	    !isxdigit((unsigned char)hex[1])) {
		return STATUS_ERR;	/* need two hex digits per byte */
	}
	char buf[] = { hex[0], hex[1], '\0' };
	char* buf_end = NULL;
	u32 byte_value = strtoul(buf, &buf_end, 16);
	assert(byte_value <= 0xff);
	assert(buf_end == buf + 2);
	*byte = byte_value;
	return STATUS_OK;
}

/* Converts a hex string in 'hex' into bytes and stores them in a
 * buffer 'buf' of length 'buf_len' bytes; returns number of bytes in
 * out_len. Works for hex strings of arbitrary size, such as very long
 * TCP Fast Open cookies.  Returns STATUS_ERR, without setting out_len,
 * if the output buffer is too small or a non-hex character is found.
 */
static int parse_hex_string(const char *hex, u8 *buf, int buf_len,
			    int *out_len)
{
	u8 *out = buf;
	u8 *buf_end = buf + buf_len;
	while (hex[0] != '\0') {
		if (out >= buf_end) {
			return STATUS_ERR;	/* ran out of output space */
		}
		if (parse_hex_byte(hex, out))
			return STATUS_ERR;	/* bad character */
		hex += 2;
		out += 1;
	}
	*out_len = out - buf;
	assert(*out_len <= buf_len);
	return STATUS_OK;
}

/* Builds a TCP Fast Open option (the experimental variant when 'exp'
 * is true) from the hex string 'cookie_string'.  Returns the new
 * option, or NULL with an asprintf()-allocated message in '*error'
 * when the cookie has an odd number of digits, is too long, or is not
 * valid hex.
 */
static struct tcp_option *new_tcp_fast_open_option(const char *cookie_string,
						    char **error, bool exp)
{
	int cookie_string_len = strlen(cookie_string);
	if (cookie_string_len & 1) {
		asprintf(error,
			 "TCP fast open cookie has an odd number of digits");
		return NULL;
	}
	int cookie_bytes = cookie_string_len / 2; /* 2 hex chars per byte */
	int max_bytes = exp ? MAX_TCP_FAST_OPEN_EXP_COOKIE_BYTES :
			      MAX_TCP_FAST_OPEN_COOKIE_BYTES;
	if (cookie_bytes > max_bytes) {
		/* Write exactly one message: a second asprintf() into
		 * '*error' would overwrite, and so leak, the first.
		 */
		asprintf(error, "TCP fast open cookie of %d bytes "
			 "exceeds maximum cookie length of %d bytes",
			 cookie_bytes, max_bytes);
		return NULL;
	}
	u8 option_bytes = cookie_bytes + (exp ? TCPOLEN_EXP_FASTOPEN_BASE :
						TCPOLEN_FASTOPEN_BASE);
	struct tcp_option *option;
	option = tcp_option_new(exp ? TCPOPT_EXP : TCPOPT_FASTOPEN,
				option_bytes);
	if (exp)
		option->data.fast_open_exp.magic = htons(TCPOPT_FASTOPEN_MAGIC);

	int parsed_bytes = 0;
	/* Parse cookie. This should be an ASCII hex string
	 * representing an even number of bytes (4-16 bytes). But we
	 * do not enforce this, since we want to allow test cases that
	 * supply invalid cookies.
	 */
	if (parse_hex_string(cookie_string,
			     exp ? option->data.fast_open_exp.cookie :
				   option->data.fast_open.cookie,
			     exp ? sizeof(option->data.fast_open_exp.cookie) :
				   sizeof(option->data.fast_open.cookie),
			     &parsed_bytes)) {
		free(option);
		asprintf(error,
			 "TCP fast open cookie '%s' is not a valid hex string",
			 cookie_string);
		return NULL;
	}
	assert(parsed_bytes == cookie_bytes);
	return option;
}

/* Builds a TCP MD5 signature option from the hex string
 * 'digest_string'.  Returns the new option, or NULL with an
 * asprintf()-allocated message in '*error' when the digest is too long
 * or is not valid hex.
 */
static struct tcp_option *new_md5_option(const char *digest_string,
					 char **error)
{
	struct tcp_option *option;
	int digest_string_len = strlen(digest_string);
	int digest_bytes = digest_string_len / 2;
	int parsed_bytes = 0;

	if (digest_bytes > TCP_MD5_DIGEST_LEN) {
		asprintf(error, "TCP MD5 digest longer than 16 bytes");
		return NULL;
	}

	option = tcp_option_new(TCPOPT_MD5SIG, TCPOLEN_MD5_BASE + digest_bytes);

	/* Parse MD5 digest. This should be an ASCII hex string representing 16
	 * bytes. But we allow smaller buffers, since we want to allow test
	 * cases that supply invalid digests.
	 */
	if (parse_hex_string(digest_string,
			     option->data.md5.digest,
			     sizeof(option->data.md5.digest),
			     &parsed_bytes)) {
		free(option);
		asprintf(error, "TCP MD5 digest is not a valid hex string");
		return NULL;
	}
	assert(parsed_bytes <= digest_bytes);
	return option;
}

/* Appends the GRE header described by 'expr' to 'packet' and frees
 * 'expr'.  A failure reported by gre_header_append() is passed to
 * semantic_error().  Returns 'packet'.
 */
static struct packet *append_gre(struct packet *packet, struct expression *expr)
{
	struct gre *gre = &expr->value.gre;
	char *error = NULL;
	if (gre_header_append(packet, gre, &error))
		semantic_error(error);
	free(expr);
	return packet;
}

%}

%locations
%expect 3  /* we expect shift/reduce conflicts */
/* The %union section specifies the set of possible types for values
 * for all nonterminal and terminal symbols in the grammar.
 */
%union {
	s64 integer;
	double floating;
	char *string;
	char *reserved;
	s64 time_usecs;
	enum direction_t direction;
	enum ip_ecn_t ip_ecn;
	struct tos_spec tos_spec;
	struct ip_info ip_info;
	struct mpls_stack *mpls_stack;
	struct mpls mpls_stack_entry;
	u16 port;
	s32 window;
	u16 urg_ptr;
	u32 sequence_number;
	struct {
		int protocol;		/* IPPROTO_TCP or IPPROTO_UDP */
		u32 start_sequence;
		u16 payload_bytes;
	} tcp_sequence_info;
	struct option_list *option;
	struct event *event;
	struct packet *packet;
	struct syscall_spec *syscall;
	struct command_spec *command;
	struct code_spec *code;
	struct tcp_option *tcp_option;
	struct tcp_options *tcp_options;
	struct expression *expression;
	struct expression_list *expression_list;
	struct errno_spec *errno_info;
	struct {
		u16 src_port;
		u16 dst_port;
	} port_info;
}

/* The specific type of the output for a symbol is given by the %type
 * directive.  By convention terminal symbols returned from the lexer
 * have ALL_CAPS names, and nonterminal symbols have lower_case names.
+ */ +%token ELLIPSIS +%token SA_FAMILY SIN_PORT SIN_ADDR _HTONS_ INET_ADDR INET6_ADDR +%token MSG_NAME MSG_IOV MSG_FLAGS MSG_CONTROL +%token CMSG_LEVEL CMSG_TYPE CMSG_DATA +%token FD EVENTS REVENTS ONOFF LINGER +%token U32 U64 PTR +%token ACK ECR EOL MSS NOP SACK SACKOK TIMESTAMP VAL WIN WSCALE +%token URG MD5 FAST_OPEN FAST_OPEN_EXP +%token TOS FLAGS FLOWLABEL +%token ECT0 ECT1 CE ECT01 NO_ECN +%token IPV4 IPV6 ICMP UDP RAW GRE MTU ID +%token MPLS LABEL TC TTL +%token OPTION +%token SUM OFF KEY SEQ +%token NONE CHECKSUM SEQUENCE PRESENT +%token EE_ERRNO EE_CODE EE_DATA EE_INFO EE_ORIGIN EE_TYPE +%token SCM_SEC SCM_NSEC +%token FLOAT +%token INTEGER HEX_INTEGER +%token WORD STRING BACK_QUOTED CODE IPV4_ADDR IPV6_ADDR +%type direction +%type ip_info opt_ip_info +%type tos_spec +%type ip_ecn +%type